Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc | 103
-rw-r--r--  sys/kern/Makefile | 54
-rw-r--r--  sys/kern/bus_if.m | 246
-rw-r--r--  sys/kern/clock_if.m | 44
-rw-r--r--  sys/kern/device_if.m | 127
-rw-r--r--  sys/kern/genassym.sh | 54
-rw-r--r--  sys/kern/imgact_aout.c | 289
-rw-r--r--  sys/kern/imgact_elf.c | 1075
-rw-r--r--  sys/kern/imgact_gzip.c | 385
-rw-r--r--  sys/kern/imgact_shell.c | 132
-rw-r--r--  sys/kern/inflate.c | 1078
-rw-r--r--  sys/kern/init_main.c | 669
-rw-r--r--  sys/kern/init_sysent.c | 418
-rw-r--r--  sys/kern/kern_acct.c | 345
-rw-r--r--  sys/kern/kern_acl.c | 830
-rw-r--r--  sys/kern/kern_clock.c | 492
-rw-r--r--  sys/kern/kern_condvar.c | 579
-rw-r--r--  sys/kern/kern_conf.c | 491
-rw-r--r--  sys/kern/kern_descrip.c | 2210
-rw-r--r--  sys/kern/kern_environment.c | 461
-rw-r--r--  sys/kern/kern_event.c | 1082
-rw-r--r--  sys/kern/kern_exec.c | 1022
-rw-r--r--  sys/kern/kern_exit.c | 805
-rw-r--r--  sys/kern/kern_fork.c | 866
-rw-r--r--  sys/kern/kern_idle.c | 110
-rw-r--r--  sys/kern/kern_intr.c | 684
-rw-r--r--  sys/kern/kern_jail.c | 256
-rw-r--r--  sys/kern/kern_kthread.c | 184
-rw-r--r--  sys/kern/kern_ktr.c | 241
-rw-r--r--  sys/kern/kern_ktrace.c | 850
-rw-r--r--  sys/kern/kern_linker.c | 1812
-rw-r--r--  sys/kern/kern_lock.c | 594
-rw-r--r--  sys/kern/kern_lockf.c | 846
-rw-r--r--  sys/kern/kern_malloc.c | 618
-rw-r--r--  sys/kern/kern_mib.c | 336
-rw-r--r--  sys/kern/kern_module.c | 394
-rw-r--r--  sys/kern/kern_mtxpool.c | 115
-rw-r--r--  sys/kern/kern_mutex.c | 986
-rw-r--r--  sys/kern/kern_ntptime.c | 935
-rw-r--r--  sys/kern/kern_physio.c | 132
-rw-r--r--  sys/kern/kern_poll.c | 523
-rw-r--r--  sys/kern/kern_proc.c | 1072
-rw-r--r--  sys/kern/kern_prot.c | 1969
-rw-r--r--  sys/kern/kern_resource.c | 1020
-rw-r--r--  sys/kern/kern_sema.c | 177
-rw-r--r--  sys/kern/kern_shutdown.c | 564
-rw-r--r--  sys/kern/kern_sig.c | 2153
-rw-r--r--  sys/kern/kern_subr.c | 582
-rw-r--r--  sys/kern/kern_switch.c | 280
-rw-r--r--  sys/kern/kern_sx.c | 348
-rw-r--r--  sys/kern/kern_synch.c | 970
-rw-r--r--  sys/kern/kern_syscalls.c | 123
-rw-r--r--  sys/kern/kern_sysctl.c | 1422
-rw-r--r--  sys/kern/kern_tc.c | 684
-rw-r--r--  sys/kern/kern_time.c | 678
-rw-r--r--  sys/kern/kern_timeout.c | 414
-rw-r--r--  sys/kern/kern_uuid.c | 222
-rw-r--r--  sys/kern/kern_xxx.c | 314
-rw-r--r--  sys/kern/ksched.c | 280
-rw-r--r--  sys/kern/link_aout.c | 590
-rw-r--r--  sys/kern/link_elf.c | 1239
-rw-r--r--  sys/kern/link_elf_obj.c | 1239
-rw-r--r--  sys/kern/linker_if.m | 107
-rw-r--r--  sys/kern/makesyscalls.sh | 446
-rw-r--r--  sys/kern/md4c.c | 285
-rw-r--r--  sys/kern/md5c.c | 339
-rw-r--r--  sys/kern/p1003_1b.c | 342
-rw-r--r--  sys/kern/posix4_mib.c | 115
-rw-r--r--  sys/kern/subr_acl_posix1e.c | 830
-rw-r--r--  sys/kern/subr_autoconf.c | 130
-rw-r--r--  sys/kern/subr_blist.c | 929
-rw-r--r--  sys/kern/subr_bus.c | 2179
-rw-r--r--  sys/kern/subr_clist.c | 696
-rw-r--r--  sys/kern/subr_clock.c | 316
-rw-r--r--  sys/kern/subr_devstat.c | 307
-rw-r--r--  sys/kern/subr_disk.c | 434
-rw-r--r--  sys/kern/subr_disklabel.c | 426
-rw-r--r--  sys/kern/subr_diskmbr.c | 544
-rw-r--r--  sys/kern/subr_diskslice.c | 997
-rw-r--r--  sys/kern/subr_eventhandler.c | 173
-rw-r--r--  sys/kern/subr_hints.c | 366
-rw-r--r--  sys/kern/subr_kobj.c | 216
-rw-r--r--  sys/kern/subr_log.c | 268
-rw-r--r--  sys/kern/subr_mbuf.c | 1111
-rw-r--r--  sys/kern/subr_mchain.c | 550
-rw-r--r--  sys/kern/subr_module.c | 266
-rw-r--r--  sys/kern/subr_param.c | 169
-rw-r--r--  sys/kern/subr_pcpu.c | 144
-rw-r--r--  sys/kern/subr_power.c | 107
-rw-r--r--  sys/kern/subr_prf.c | 905
-rw-r--r--  sys/kern/subr_prof.c | 531
-rw-r--r--  sys/kern/subr_rman.c | 609
-rw-r--r--  sys/kern/subr_rtc.c | 316
-rw-r--r--  sys/kern/subr_sbuf.c | 560
-rw-r--r--  sys/kern/subr_scanf.c | 628
-rw-r--r--  sys/kern/subr_smp.c | 321
-rw-r--r--  sys/kern/subr_taskqueue.c | 223
-rw-r--r--  sys/kern/subr_trap.c | 209
-rw-r--r--  sys/kern/subr_turnstile.c | 986
-rw-r--r--  sys/kern/subr_witness.c | 1488
-rw-r--r--  sys/kern/subr_xxx.c | 182
-rw-r--r--  sys/kern/sys_generic.c | 1210
-rw-r--r--  sys/kern/sys_pipe.c | 1427
-rw-r--r--  sys/kern/sys_process.c | 728
-rw-r--r--  sys/kern/sys_socket.c | 217
-rw-r--r--  sys/kern/syscalls.c | 403
-rw-r--r--  sys/kern/syscalls.master | 565
-rw-r--r--  sys/kern/sysv_ipc.c | 97
-rw-r--r--  sys/kern/sysv_msg.c | 1240
-rw-r--r--  sys/kern/sysv_sem.c | 1193
-rw-r--r--  sys/kern/sysv_shm.c | 890
-rw-r--r--  sys/kern/tty.c | 2660
-rw-r--r--  sys/kern/tty_compat.c | 490
-rw-r--r--  sys/kern/tty_conf.c | 210
-rw-r--r--  sys/kern/tty_cons.c | 597
-rw-r--r--  sys/kern/tty_pty.c | 874
-rw-r--r--  sys/kern/tty_subr.c | 696
-rw-r--r--  sys/kern/tty_tty.c | 252
-rw-r--r--  sys/kern/uipc_accf.c | 150
-rw-r--r--  sys/kern/uipc_cow.c | 181
-rw-r--r--  sys/kern/uipc_domain.c | 256
-rw-r--r--  sys/kern/uipc_jumbo.c | 252
-rw-r--r--  sys/kern/uipc_mbuf.c | 753
-rw-r--r--  sys/kern/uipc_mbuf2.c | 404
-rw-r--r--  sys/kern/uipc_proto.c | 80
-rw-r--r--  sys/kern/uipc_sockbuf.c | 983
-rw-r--r--  sys/kern/uipc_socket.c | 1792
-rw-r--r--  sys/kern/uipc_socket2.c | 983
-rw-r--r--  sys/kern/uipc_syscalls.c | 1945
-rw-r--r--  sys/kern/uipc_usrreq.c | 1503
-rw-r--r--  sys/kern/vfs_acl.c | 830
-rw-r--r--  sys/kern/vfs_aio.c | 2307
-rw-r--r--  sys/kern/vfs_bio.c | 3395
-rw-r--r--  sys/kern/vfs_cache.c | 898
-rw-r--r--  sys/kern/vfs_cluster.c | 1008
-rw-r--r--  sys/kern/vfs_conf.c | 396
-rw-r--r--  sys/kern/vfs_default.c | 845
-rw-r--r--  sys/kern/vfs_export.c | 400
-rw-r--r--  sys/kern/vfs_extattr.c | 4862
-rw-r--r--  sys/kern/vfs_init.c | 477
-rw-r--r--  sys/kern/vfs_lookup.c | 754
-rw-r--r--  sys/kern/vfs_mount.c | 396
-rw-r--r--  sys/kern/vfs_subr.c | 3275
-rw-r--r--  sys/kern/vfs_syscalls.c | 4862
-rw-r--r--  sys/kern/vfs_vnops.c | 1056
-rw-r--r--  sys/kern/vnode_if.src | 556
146 files changed, 108911 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
new file mode 100644
index 0000000..b958ba7
--- /dev/null
+++ b/sys/kern/Make.tags.inc
@@ -0,0 +1,103 @@
+# $FreeBSD$
+# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93
+
+SYS?= ${.CURDIR}/..
+
+# Common files for "make tags", included by the Makefile for each
+# architecture.
+
+# Put the /sys/sys include files at the end so that subroutine definitions
+# win when there is a struct tag with the same name (e.g., vmmeter). The
+# better solution would be for ctags to generate "struct vmmeter" tags.
+
+COMM= ${SYS}/dev/advansys/*.[ch] \
+ ${SYS}/dev/aha/*.[ch] \
+ ${SYS}/dev/aic7xxx/*.[ch] \
+ ${SYS}/dev/buslogic/*.[ch] \
+ ${SYS}/dev/ccd/*.[ch] \
+ ${SYS}/dev/dec/*.[ch] \
+ ${SYS}/dev/dpt/*.[ch] \
+ ${SYS}/dev/en/*.[ch] \
+ ${SYS}/dev/hea/*.[ch] \
+ ${SYS}/dev/hfa/*.[ch] \
+ ${SYS}/dev/iicbus/*.[ch] \
+ ${SYS}/dev/isp/*.[ch] \
+ ${SYS}/dev/pdq/*.[ch] \
+ ${SYS}/dev/ppbus/*.[ch] \
+ ${SYS}/dev/smbus/*.[ch] \
+ ${SYS}/dev/vx/*.[ch] \
+ ${SYS}/fs/deadfs/*.[ch] \
+ ${SYS}/fs/fdescfs/*.[ch] \
+ ${SYS}/fs/fifofs/*.[ch] \
+ ${SYS}/fs/msdosfs/*.[ch] \
+ ${SYS}/fs/nullfs/*.[ch] \
+ ${SYS}/fs/portalfs/*.[ch] \
+ ${SYS}/fs/procfs/*.[ch] \
+ ${SYS}/fs/specfs/*.[ch] \
+ ${SYS}/fs/umapfs/*.[ch] \
+ ${SYS}/fs/unionfs/*.[ch] \
+ ${SYS}/isofs/cd9660/*.[ch] \
+ ${SYS}/kern/*.[ch] \
+ ${SYS}/net/*.[ch] \
+ ${SYS}/netatalk/*.[ch] \
+ ${SYS}/netatm/*.[ch] \
+ ${SYS}/netinet/*.[ch] \
+ ${SYS}/netipx/*.[ch] \
+ ${SYS}/netkey/*.[ch] \
+ ${SYS}/netnatm/*.[ch] \
+ ${SYS}/netns/*.[ch] \
+ ${SYS}/nfs/*.[ch] \
+ ${SYS}/pci/*.[ch] \
+ ${SYS}/posix4/*.[ch] \
+ ${SYS}/ufs/ffs/*.[ch] \
+ ${SYS}/ufs/ufs/*.[ch] \
+ ${SYS}/vm/*.[ch] \
+ ${SYS}/sys/*.[ch]
+
+COMMDIR1= ${SYS}/conf \
+ ${SYS}/kern \
+ ${SYS}/net \
+ ${SYS}/netatalk \
+ ${SYS}/netatm \
+ ${SYS}/netinet \
+ ${SYS}/netipx \
+ ${SYS}/netkey \
+ ${SYS}/netnatm \
+ ${SYS}/netns \
+ ${SYS}/nfs \
+ ${SYS}/pci \
+ ${SYS}/posix4 \
+ ${SYS}/vm \
+ ${SYS}/sys
+
+COMMDIR2= ${SYS}/dev/advansys \
+ ${SYS}/dev/aha \
+ ${SYS}/dev/aic7xxx \
+ ${SYS}/dev/buslogic \
+ ${SYS}/dev/ccd \
+ ${SYS}/dev/dec \
+ ${SYS}/dev/dpt \
+ ${SYS}/dev/en \
+ ${SYS}/dev/hea \
+ ${SYS}/dev/hfa \
+ ${SYS}/dev/iicbus \
+ ${SYS}/dev/isp \
+ ${SYS}/dev/pdq \
+ ${SYS}/dev/ppbus \
+ ${SYS}/dev/smbus \
+ ${SYS}/dev/vn \
+ ${SYS}/dev/vx \
+ ${SYS}/fs/deadfs \
+ ${SYS}/fs/devfs \
+ ${SYS}/fs/fdescfs \
+ ${SYS}/fs/fifofs \
+ ${SYS}/fs/msdosfs \
+ ${SYS}/fs/nullfs \
+ ${SYS}/fs/portalfs \
+ ${SYS}/fs/procfs \
+ ${SYS}/fs/specfs \
+ ${SYS}/fs/umapfs \
+ ${SYS}/fs/unionfs \
+ ${SYS}/isofs/cd9660 \
+ ${SYS}/ufs/ffs \
+ ${SYS}/ufs/ufs
diff --git a/sys/kern/Makefile b/sys/kern/Makefile
new file mode 100644
index 0000000..cdfcc2a
--- /dev/null
+++ b/sys/kern/Makefile
@@ -0,0 +1,54 @@
+# @(#)Makefile 8.2 (Berkeley) 3/21/94
+# $FreeBSD$
+
+# Makefile for kernel tags files, init_sysent, etc.
+
+ARCH= i386 # luna68k news3400 pmax sparc tahoe vax
+
+all:
+ @echo "make tags, make links or init_sysent.c only"
+
+init_sysent.c syscalls.c ../sys/syscall.h \
+../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master
+ -mv -f init_sysent.c init_sysent.c.bak
+ -mv -f syscalls.c syscalls.c.bak
+ -mv -f ../sys/syscall.h ../sys/syscall.h.bak
+ -mv -f ../sys/syscall.mk ../sys/syscall.mk.bak
+ -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
+ sh makesyscalls.sh syscalls.master
+
+# Kernel tags:
+# Tags files are built in the top-level directory for each architecture,
+# with a makefile listing the architecture-dependent files, etc. The list
+# of common files is in ./Make.tags.inc. Links to the correct tags file
+# are placed in each source directory. We need to have links to tags files
+# from the generic directories that are relative to the machine type, even
+# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at
+# ${SYSDIR}/${MACHINE_ARCH}/tags.
+
+SYSTAGS=/var/db/sys_tags
+SYSDIR=/sys
+
+# Directories in which to place tags links (other than machine-dependent)
+DGEN= conf \
+ dev dev/scsi \
+ fs fs/deadfs fs/fdescfs fs/fifofs \
+ fs/lofs fs/nullfs fs/portalfs fs/procfs \
+ fs/specfs fs/umapfs fs/unionfs \
+ hp hp/dev hp/hpux \
+ kern libkern \
+ net netccitt netinet netiso netns nfs scripts sys \
+ ufs ufs/ffs ufs/lfs ufs/ufs \
+ vm
+
+tags::
+ -for i in ${ARCH}; do \
+ (cd ../$$i && make ${MFLAGS} tags); done
+
+links::
+ rm -f ${SYSTAGS}
+ ln -s ${SYSDIR}/${MACHINE_ARCH}/tags ${SYSTAGS}
+ -for i in ${DGEN}; do \
+ (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done
+ -for i in ${ARCH}; do \
+ (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done
diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m
new file mode 100644
index 0000000..bf8d4ac
--- /dev/null
+++ b/sys/kern/bus_if.m
@@ -0,0 +1,246 @@
+#
+# Copyright (c) 1998 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+INTERFACE bus;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static struct resource *
+ null_alloc_resource(device_t dev, device_t child,
+ int type, int *rid,
+ u_long start, u_long end,
+ u_long count, u_int flags)
+ {
+ return 0;
+ }
+};
+
+#
+# This is called from system code which prints out a description of a
+# device. It should describe the attachment that the child has with
+# the parent. For instance the TurboLaser bus prints which node the
+# device is attached to. See bus_generic_print_child.9 for more
+# information.
+# This method returns the number of characters output.
+#
+METHOD int print_child {
+ device_t dev;
+ device_t child;
+};
+
+#
+# Called for each child device that
+# did not succeed in probing for a
+# driver.
+#
+METHOD void probe_nomatch {
+ device_t dev;
+ device_t child;
+};
+
+#
+# These two methods manage a bus specific set of instance variables of
+# a child device. The intention is that each different type of bus
+# defines a set of appropriate instance variables (such as ports and
+# irqs for ISA bus etc.)
+#
+# This information could be given to the child device as a struct but
+# that makes it hard for a bus to add or remove variables without
+# forcing an edit and recompile for all drivers which may not be
+# possible for vendor supplied binary drivers.
+
+#
+# Read an instance variable. Return 0 on success.
+#
+METHOD int read_ivar {
+ device_t _dev;
+ device_t _child;
+ int _indx;
+ uintptr_t *_result;
+};
+
+#
+# Write an instance variable. Return 0 on success.
+#
+METHOD int write_ivar {
+ device_t _dev;
+ device_t _child;
+ int _indx;
+ uintptr_t _value;
+};
+
+#
+# Called after the child's DEVICE_DETACH method to allow the parent
+# to reclaim any resources allocated on behalf of the child.
+#
+METHOD void child_detached {
+ device_t _dev;
+ device_t _child;
+};
+
+#
+# Called when a new driver is added to the devclass which owns this
+# bus. The generic implementation of this method attempts to probe and
+# attach any un-matched children of the bus.
+#
+METHOD void driver_added {
+ device_t _dev;
+ driver_t *_driver;
+} DEFAULT bus_generic_driver_added;
+
+#
+# For busses which use drivers supporting DEVICE_IDENTIFY to
+# enumerate their devices, these methods are used to create new
+# device instances. If place is non-NULL, the new device will be
+# added after the last existing child with the same order.
+#
+METHOD device_t add_child {
+ device_t _dev;
+ int _order;
+ const char *_name;
+ int _unit;
+};
+
+#
+# Allocate a system resource attached to `dev' on behalf of `child'.
+# The types are defined in <machine/resource.h>; the meaning of the
+# resource-ID field varies from bus to bus (but *rid == 0 is always
+# valid if the resource type is). start and end reflect the allowable
+# range, and should be passed as `0UL' and `~0UL', respectively, if
+# the client has no range restriction. count is the number of consecutive
+# indices in the resource required. flags is a set of sharing flags
+# as defined in <sys/rman.h>.
+#
+# Returns a resource or a null pointer on failure. The caller is
+# responsible for calling rman_activate_resource() when it actually
+# uses the resource.
+#
+METHOD struct resource * alloc_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int *_rid;
+ u_long _start;
+ u_long _end;
+ u_long _count;
+ u_int _flags;
+} DEFAULT null_alloc_resource;
+
+METHOD int activate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+METHOD int deactivate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+#
+# Free a resource allocated by the preceding method. The `rid' value
+# must be the same as the one returned by BUS_ALLOC_RESOURCE (which
+# is not necessarily the same as the one the client passed).
+#
+METHOD int release_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_res;
+};
+
+METHOD int setup_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ int _flags;
+ driver_intr_t *_intr;
+ void *_arg;
+ void **_cookiep;
+};
+
+METHOD int teardown_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ void *_cookie;
+};
+
+#
+# Set the range used for a particular resource. Return EINVAL if
+# the type or rid are out of range.
+#
+METHOD int set_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long _start;
+ u_long _count;
+};
+
+#
+# Get the range for a resource. Return ENOENT if the type or rid are
+# out of range or have not been set.
+#
+METHOD int get_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long *_startp;
+ u_long *_countp;
+};
+
+#
+# Delete a resource.
+#
+METHOD void delete_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+};
+
+#
+# Return a struct resource_list.
+#
+METHOD struct resource_list * get_resource_list {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_resource_list;
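
Drivers normally reach these methods through the bus_alloc_resource()/bus_setup_intr() wrapper functions, which pass the request to the parent bus. The attach routine below is a minimal sketch under that assumption; the `foo' names, the choice of an IRQ resource and the INTR_TYPE_MISC flag are illustrative only, and the `0UL'/`~0UL' range follows the convention documented above for "no range restriction".

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/bus.h>
    #include <sys/rman.h>
    #include <machine/resource.h>

    struct foo_softc {
        struct resource *irq_res;     /* IRQ handed to us by the parent bus */
        void            *irq_cookie;  /* from bus_setup_intr(), for teardown */
    };

    static void
    foo_intr(void *arg)
    {
        /* hypothetical: acknowledge and service the device interrupt */
    }

    static int
    foo_attach(device_t dev)
    {
        struct foo_softc *sc = device_get_softc(dev);
        int rid, error;

        /* Any IRQ the parent can supply: start 0UL, end ~0UL, count 1. */
        rid = 0;
        sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid,
            0UL, ~0UL, 1, RF_ACTIVE | RF_SHAREABLE);
        if (sc->irq_res == NULL)
            return (ENXIO);

        /* Hook the handler; keep the cookie for bus_teardown_intr(). */
        error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC,
            foo_intr, sc, &sc->irq_cookie);
        if (error) {
            bus_release_resource(dev, SYS_RES_IRQ, rid, sc->irq_res);
            return (error);
        }
        return (0);
    }
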
diff --git a/sys/kern/clock_if.m b/sys/kern/clock_if.m
new file mode 100644
index 0000000..3ddb25e
--- /dev/null
+++ b/sys/kern/clock_if.m
@@ -0,0 +1,44 @@
+# Copyright (c) 2001 by Thomas Moestl <tmm@FreeBSD.org>.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+#include <sys/time.h>
+
+INTERFACE clock;
+
+# Interface for clock drivers. This is inspired by the NetBSD device-independent
+# clock code (by Gordon W. Ross).
+
+# An EINVAL error return from this call signifies that the clock has an illegal
+# setting.
+METHOD int gettime {
+ device_t dev;
+ struct timespec *ts;
+};
+
+METHOD int settime {
+ device_t dev;
+ struct timespec *ts;
+};
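
A hardware clock driver implements the two methods above in its kobj method table; the time-of-day code elsewhere in this directory then reaches them through the generated CLOCK_GETTIME()/CLOCK_SETTIME() wrappers. The sketch below is a minimal, hypothetical implementation (the `foo_rtc' names and the softc layout are made up) that only illustrates the calling convention, including the EINVAL return for an illegal clock setting.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/bus.h>
    #include <sys/time.h>

    #include "clock_if.h"             /* generated from clock_if.m */

    struct foo_rtc_softc {
        struct timespec last;         /* stand-in for the real chip registers */
    };

    static int
    foo_rtc_gettime(device_t dev, struct timespec *ts)
    {
        struct foo_rtc_softc *sc = device_get_softc(dev);

        /* A real driver reads the hardware here; EINVAL flags a bogus date. */
        if (sc->last.tv_sec == 0)
            return (EINVAL);
        *ts = sc->last;
        return (0);
    }

    static int
    foo_rtc_settime(device_t dev, struct timespec *ts)
    {
        struct foo_rtc_softc *sc = device_get_softc(dev);

        sc->last = *ts;
        return (0);
    }

    static device_method_t foo_rtc_methods[] = {
        DEVMETHOD(clock_gettime,      foo_rtc_gettime),
        DEVMETHOD(clock_settime,      foo_rtc_settime),
        { 0, 0 }
    };
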
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m
new file mode 100644
index 0000000..005eb38
--- /dev/null
+++ b/sys/kern/device_if.m
@@ -0,0 +1,127 @@
+#
+# Copyright (c) 1998 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+INTERFACE device;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static int null_shutdown(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_suspend(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_resume(device_t dev)
+ {
+ return 0;
+ }
+};
+
+#
+# Probe to see if the device is present. Return 0 if the device exists,
+# ENXIO if it cannot be found. If some other error happens during the
+# probe (such as a memory allocation failure), an appropriate error code
+# should be returned. For cases where more than one driver matches a
+# device, a priority value can be returned. In this case, success codes
+# are values less than or equal to zero with the highest value representing
+# the best match. Failure codes are represented by positive values and
+# the regular unix error codes should be used for the purpose.
+
+# If a driver returns a success code which is less than zero, it must
+# not assume that it will be the same driver which is attached to the
+# device. In particular, it must not assume that any values stored in
+# the softc structure will be available for its attach method and any
+# resources allocated during probe must be released and re-allocated
+# if the attach method is called. If a success code of zero is
+# returned, the driver can assume that it will be the one attached.
+#
+# Devices which implement busses should use this method to probe for
+# the existence of devices attached to the bus and add them as
+# children. If this is combined with the use of bus_generic_attach,
+# the child devices will be automatically probed and attached.
+#
+METHOD int probe {
+ device_t dev;
+};
+
+#
+# Called by a parent bus to add new devices to the bus.
+#
+STATICMETHOD void identify {
+ driver_t *driver;
+ device_t parent;
+};
+
+#
+# Attach a device to the system. The probe method will have been
+# called and will have indicated that the device exists. This routine
+# should initialise the hardware and allocate other system resources
+# (such as devfs entries). Returns 0 on success.
+#
+METHOD int attach {
+ device_t dev;
+};
+
+#
+# Detach a device. This can be called if the user is replacing the
+# driver software or if a device is about to be physically removed
+# from the system (e.g. for pccard devices). Returns 0 on success.
+#
+METHOD int detach {
+ device_t dev;
+};
+
+#
+# This is called during system shutdown to allow the driver to put the
+# hardware into a consistent state for rebooting the computer.
+#
+METHOD int shutdown {
+ device_t dev;
+} DEFAULT null_shutdown;
+
+#
+# This is called by the power-management subsystem when a suspend has been
+# requested by the user or by some automatic mechanism. This gives
+# drivers a chance to veto the suspend or save their configuration before
+# power is removed.
+#
+METHOD int suspend {
+ device_t dev;
+} DEFAULT null_suspend;
+
+METHOD int resume {
+ device_t dev;
+} DEFAULT null_resume;
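
A driver ties implementations of these methods to a named driver with a device_method_t table, a driver_t and DRIVER_MODULE(). The skeleton below is illustrative only (the `bar' driver and its attachment to a `pci' parent are made up); it follows the probe convention described above, with 0 as the best-match success code and ENXIO for "device not present", and relies on the null_shutdown/null_suspend/null_resume defaults for the methods it omits.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/module.h>
    #include <sys/bus.h>

    struct bar_softc {
        device_t dev;
    };

    static int
    bar_probe(device_t dev)
    {
        /*
         * A real driver inspects the hardware (or bus instance variables)
         * here and returns ENXIO when the device is not one it supports.
         */
        device_set_desc(dev, "Bar example device");
        return (0);                   /* 0 == best possible match */
    }

    static int
    bar_attach(device_t dev)
    {
        struct bar_softc *sc = device_get_softc(dev);

        sc->dev = dev;                /* allocate resources, map registers, ... */
        return (0);
    }

    static int
    bar_detach(device_t dev)
    {
        return (0);                   /* release whatever attach acquired */
    }

    static device_method_t bar_methods[] = {
        DEVMETHOD(device_probe,       bar_probe),
        DEVMETHOD(device_attach,      bar_attach),
        DEVMETHOD(device_detach,      bar_detach),
        /* shutdown/suspend/resume fall back to the null_* defaults above */
        { 0, 0 }
    };

    static driver_t bar_driver = {
        "bar",
        bar_methods,
        sizeof(struct bar_softc)
    };

    static devclass_t bar_devclass;

    DRIVER_MODULE(bar, pci, bar_driver, bar_devclass, 0, 0);
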
diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh
new file mode 100644
index 0000000..70ad69e
--- /dev/null
+++ b/sys/kern/genassym.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+# $FreeBSD$
+
+# Grrr, this should use stdin and stdout, but is encrufted for compatibility.
+
+usage()
+{
+ echo "usage: genassym [-o outfile] objfile"
+ exit 1
+}
+
+outfile=/dev/stdout
+while getopts "o:" option
+do
+ case "$option" in
+ o) outfile="$OPTARG";;
+ *) usage;;
+ esac
+done
+shift $(($OPTIND - 1))
+case $# in
+1) ;;
+*) usage;;
+esac
+
+${NM:='nm'} "$1" | ${AWK:='awk'} '
+/ C .*sign$/ {
+ sign = substr($1, length($1) - 3, 4)
+ sub("^0*", "", sign)
+ if (sign != "")
+ sign = "-"
+}
+/ C .*w0$/ {
+ w0 = substr($1, length($1) - 3, 4)
+}
+/ C .*w1$/ {
+ w1 = substr($1, length($1) - 3, 4)
+}
+/ C .*w2$/ {
+ w2 = substr($1, length($1) - 3, 4)
+}
+/ C .*w3$/ {
+ w3 = substr($1, length($1) - 3, 4)
+ w = w3 w2 w1 w0
+ sub("^0*", "", w)
+ if (w == "")
+ w = "0"
+ sub("w3$", "", $3)
+ # This still has minor problems representing INT_MIN, etc. E.g.,
+ # with 32-bit 2''s complement ints, this prints -0x80000000, which
+ # has the wrong type (unsigned int).
+ printf("#define\t%s\t%s0x%s\n", $3, sign, w)
+}
+' 3>"$outfile" >&3 3>&-
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
new file mode 100644
index 0000000..41ae8cf
--- /dev/null
+++ b/sys/kern/imgact_aout.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_kstack_pages.h"
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sysent.h>
+#include <sys/syscall.h>
+#include <sys/vnode.h>
+#include <sys/user.h>
+
+#include <machine/md_var.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+
+static int exec_aout_imgact(struct image_params *imgp);
+
+struct sysentvec aout_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD a.out",
+ aout_coredump,
+ NULL,
+ MINSIGSTKSZ
+};
+
+static int
+exec_aout_imgact(imgp)
+ struct image_params *imgp;
+{
+ const struct exec *a_out = (const struct exec *) imgp->image_header;
+ struct vmspace *vmspace;
+ struct vnode *vp;
+ vm_map_t map;
+ vm_object_t object;
+ vm_offset_t text_end, data_end;
+ unsigned long virtual_offset;
+ unsigned long file_offset;
+ unsigned long bss_size;
+ int error;
+
+ GIANT_REQUIRED;
+
+ /*
+ * Linux and *BSD binaries look very much alike,
+ * only the machine id is different:
+ * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
+ * NetBSD is in network byte order.. ugh.
+ */
+ if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
+ ((a_out->a_magic >> 16) & 0xff) != 0 &&
+ ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
+ return -1;
+
+ /*
+ * Set file/virtual offset based on a.out variant.
+ * We do two cases: host byte order and network byte order
+ * (for NetBSD compatibility)
+ */
+ switch ((int)(a_out->a_magic & 0xffff)) {
+ case ZMAGIC:
+ virtual_offset = 0;
+ if (a_out->a_text) {
+ file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ /* Pass PS_STRINGS for BSD/OS binaries only. */
+ if (N_GETMID(*a_out) == MID_ZERO)
+ imgp->ps_strings = PS_STRINGS;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ return (-1);
+ }
+ }
+
+ bss_size = roundup(a_out->a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if (/* entry point must lie within the text region */
+ a_out->a_entry < virtual_offset ||
+ a_out->a_entry >= virtual_offset + a_out->a_text ||
+
+ /* text and data size must each be page rounded */
+ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
+ return (-1);
+
+ /* text + data can't exceed file size */
+ if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
+ return (EFAULT);
+
+ /*
+ * text/data/bss must not exceed limits
+ */
+ mtx_assert(&Giant, MA_OWNED);
+ if (/* text can't exceed maximum text size */
+ a_out->a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ a_out->a_data + bss_size >
+ imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
+ return (ENOMEM);
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(imgp);
+ if (error)
+ return (error);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(imgp);
+
+ /*
+ * The vm space can be changed by exec_new_vmspace
+ */
+ vmspace = imgp->proc->p_vmspace;
+
+ vp = imgp->vp;
+ map = &vmspace->vm_map;
+ vm_map_lock(map);
+ VOP_GETVOBJECT(vp, &object);
+ vm_object_reference(object);
+
+ text_end = virtual_offset + a_out->a_text;
+ error = vm_map_insert(map, object,
+ file_offset,
+ virtual_offset, text_end,
+ VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ return (error);
+ }
+ data_end = text_end + a_out->a_data;
+ if (a_out->a_data) {
+ vm_object_reference(object);
+ error = vm_map_insert(map, object,
+ file_offset + a_out->a_text,
+ text_end, data_end,
+ VM_PROT_ALL, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ return (error);
+ }
+ }
+
+ if (bss_size) {
+ error = vm_map_insert(map, NULL, 0,
+ data_end, data_end + bss_size,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_map_unlock(map);
+ return (error);
+ }
+ }
+ vm_map_unlock(map);
+
+ /* Fill in process VM information */
+ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (virtual_offset + a_out->a_text);
+
+ /* Fill in image_params */
+ imgp->interpreted = 0;
+ imgp->entry_addr = a_out->a_entry;
+
+ imgp->proc->p_sysent = &aout_sysvec;
+
+ /* Indicate that this file should not be modified */
+ imgp->vp->v_flag |= VTEXT;
+
+ return (0);
+}
+
+/*
+ * Dump core, into a file named as described in the comments for
+ * expand_name(), unless the process was setuid/setgid.
+ */
+int
+aout_coredump(td, vp, limit)
+ register struct thread *td;
+ register struct vnode *vp;
+ off_t limit;
+{
+ struct proc *p = td->td_proc;
+ register struct ucred *cred = td->td_ucred;
+ register struct vmspace *vm = p->p_vmspace;
+ int error;
+
+ if (ctob((UAREA_PAGES + KSTACK_PAGES)
+ + vm->vm_dsize + vm->vm_ssize) >= limit)
+ return (EFAULT);
+ PROC_LOCK(p);
+ fill_kinfo_proc(p, &p->p_uarea->u_kproc);
+ PROC_UNLOCK(p);
+ error = cpu_coredump(td, vp, cred);
+ if (error == 0)
+ error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+ (int)ctob(vm->vm_dsize),
+ (off_t)ctob(UAREA_PAGES + KSTACK_PAGES), UIO_USERSPACE,
+ IO_UNIT | IO_DIRECT, cred, (int *) NULL, td);
+ if (error == 0)
+ error = vn_rdwr_inchunks(UIO_WRITE, vp,
+ (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
+ round_page(ctob(vm->vm_ssize)),
+ (off_t)ctob(UAREA_PAGES + KSTACK_PAGES) +
+ ctob(vm->vm_dsize), UIO_USERSPACE,
+ IO_UNIT | IO_DIRECT, cred, (int *) NULL, td);
+ return (error);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
+EXEC_SET(aout, aout_execsw);
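
The execsw/EXEC_SET() pair at the end is how an image activator advertises itself to kern_execve.c, which tries each registered activator in turn: returning -1 means "not my format, try the next one", while an errno means the format was recognized but cannot be run. A skeleton for a hypothetical `foo' format might look like the sketch below (the magic bytes and the name are made up).

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/imgact.h>
    #include <sys/sysent.h>

    /*
     * Skeleton image activator for a made-up "FOO" format.  Returning -1
     * tells kern_execve.c "not my format, try the next execsw entry";
     * once the format has been positively identified, an errno is
     * returned instead.
     */
    static int
    exec_foo_imgact(struct image_params *imgp)
    {
        const char *hdr = imgp->image_header;

        if (hdr[0] != 'F' || hdr[1] != 'O' || hdr[2] != 'O')
            return (-1);              /* not ours */

        /*
         * A real activator would now set up the address space and fill
         * in imgp->entry_addr, imgp->proc->p_sysent and friends, much
         * as exec_aout_imgact() does above.
         */
        return (ENOEXEC);             /* placeholder: recognized, not runnable */
    }

    static struct execsw foo_execsw = { exec_foo_imgact, "foo" };
    EXEC_SET(foo, foo_execsw);
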
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
new file mode 100644
index 0000000..9044527
--- /dev/null
+++ b/sys/kern/imgact_elf.c
@@ -0,0 +1,1075 @@
+/*-
+ * Copyright (c) 2000 David O'Brien
+ * Copyright (c) 1995-1996 Søren Schmidt
+ * Copyright (c) 1996 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+
+#include <machine/elf.h>
+#include <machine/md_var.h>
+
+#define OLD_EI_BRAND 8
+
+__ElfType(Brandinfo);
+__ElfType(Auxargs);
+
+static int elf_check_header(const Elf_Ehdr *hdr);
+static int elf_freebsd_fixup(register_t **stack_base,
+ struct image_params *imgp);
+static int elf_load_file(struct proc *p, const char *file, u_long *addr,
+ u_long *entry);
+static int elf_load_section(struct proc *p,
+ struct vmspace *vmspace, struct vnode *vp,
+ vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
+ vm_prot_t prot);
+static int exec_elf_imgact(struct image_params *imgp);
+
+static int elf_trace = 0;
+SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, "");
+
+struct sysentvec elf_freebsd_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ elf_freebsd_fixup,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD ELF",
+ elf_coredump,
+ NULL,
+ MINSIGSTKSZ
+};
+
+static Elf_Brandinfo freebsd_brand_info = {
+ ELFOSABI_FREEBSD,
+ "FreeBSD",
+ "",
+ "/usr/libexec/ld-elf.so.1",
+ &elf_freebsd_sysvec
+ };
+static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = {
+ &freebsd_brand_info,
+ NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL
+ };
+
+int
+elf_insert_brand_entry(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == NULL) {
+ elf_brand_list[i] = entry;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+int
+elf_remove_brand_entry(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == entry) {
+ elf_brand_list[i] = NULL;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+int
+elf_brand_inuse(Elf_Brandinfo *entry)
+{
+ struct proc *p;
+ int rval = FALSE;
+
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ if (p->p_sysent == entry->sysvec) {
+ rval = TRUE;
+ break;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+
+ return (rval);
+}
+
+static int
+elf_check_header(const Elf_Ehdr *hdr)
+{
+ if (!IS_ELF(*hdr) ||
+ hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT)
+ return ENOEXEC;
+
+ if (!ELF_MACHINE_OK(hdr->e_machine))
+ return ENOEXEC;
+
+ if (hdr->e_version != ELF_TARG_VER)
+ return ENOEXEC;
+
+ return 0;
+}
+
+static int
+elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
+{
+ size_t map_len;
+ vm_offset_t map_addr;
+ int error, rv;
+ size_t copy_len;
+ vm_object_t object;
+ vm_offset_t file_addr;
+ vm_offset_t data_buf = 0;
+
+ GIANT_REQUIRED;
+
+ VOP_GETVOBJECT(vp, &object);
+ error = 0;
+
+ /*
+ * It's necessary to fail if the filsz + offset taken from the
+ * header is greater than the actual file pager object's size.
+ * If we were to allow this, then the vm_map_find() below would
+ * walk right off the end of the file object and into the ether.
+ *
+ * While I'm here, might as well check for something else that
+ * is invalid: filsz cannot be greater than memsz.
+ */
+ if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
+ filsz > memsz) {
+ uprintf("elf_load_section: truncated ELF file\n");
+ return (ENOEXEC);
+ }
+
+ map_addr = trunc_page((vm_offset_t)vmaddr);
+ file_addr = trunc_page(offset);
+
+ /*
+ * We have two choices. We can either clear the data in the last page
+ * of an oversized mapping, or we can start the anon mapping a page
+ * early and copy the initialized data into that first page. We
+	 * choose the second.
+ */
+ if (memsz > filsz)
+ map_len = trunc_page(offset+filsz) - file_addr;
+ else
+ map_len = round_page(offset+filsz) - file_addr;
+
+ if (map_len != 0) {
+ vm_object_reference(object);
+ vm_map_lock(&vmspace->vm_map);
+ rv = vm_map_insert(&vmspace->vm_map,
+ object,
+ file_addr, /* file offset */
+ map_addr, /* virtual start */
+ map_addr + map_len,/* virtual end */
+ prot,
+ VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ vm_map_unlock(&vmspace->vm_map);
+ if (rv != KERN_SUCCESS) {
+ vm_object_deallocate(object);
+ return EINVAL;
+ }
+
+ /* we can stop now if we've covered it all */
+ if (memsz == filsz) {
+ return 0;
+ }
+ }
+
+
+ /*
+ * We have to get the remaining bit of the file into the first part
+ * of the oversized map segment. This is normally because the .data
+ * segment in the file is extended to provide bss. It's a neat idea
+ * to try and save a page, but it's a pain in the behind to implement.
+ */
+ copy_len = (offset + filsz) - trunc_page(offset + filsz);
+ map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
+ map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
+
+ /* This had damn well better be true! */
+ if (map_len != 0) {
+ vm_map_lock(&vmspace->vm_map);
+ rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
+ map_addr, map_addr + map_len,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ vm_map_unlock(&vmspace->vm_map);
+ if (rv != KERN_SUCCESS) {
+ return EINVAL;
+ }
+ }
+
+ if (copy_len != 0) {
+ vm_object_reference(object);
+ rv = vm_map_find(exec_map,
+ object,
+ trunc_page(offset + filsz),
+ &data_buf,
+ PAGE_SIZE,
+ TRUE,
+ VM_PROT_READ,
+ VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
+ if (rv != KERN_SUCCESS) {
+ vm_object_deallocate(object);
+ return EINVAL;
+ }
+
+ /* send the page fragment to user space */
+ error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len);
+ vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
+ if (error) {
+ return (error);
+ }
+ }
+
+ /*
+ * set it to the specified protection
+ */
+ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot,
+ FALSE);
+
+ return error;
+}
+
+/*
+ * Load the file "file" into memory. It may be either a shared object
+ * or an executable.
+ *
+ * The "addr" reference parameter is in/out. On entry, it specifies
+ * the address where a shared object should be loaded. If the file is
+ * an executable, this value is ignored. On exit, "addr" specifies
+ * where the file was actually loaded.
+ *
+ * The "entry" reference parameter is out only. On exit, it specifies
+ * the entry point for the loaded file.
+ */
+static int
+elf_load_file(struct proc *p, const char *file, u_long *addr, u_long *entry)
+{
+ struct {
+ struct nameidata nd;
+ struct vattr attr;
+ struct image_params image_params;
+ } *tempdata;
+ const Elf_Ehdr *hdr = NULL;
+ const Elf_Phdr *phdr = NULL;
+ struct nameidata *nd;
+ struct vmspace *vmspace = p->p_vmspace;
+ struct vattr *attr;
+ struct image_params *imgp;
+ vm_prot_t prot;
+ u_long rbase;
+ u_long base_addr = 0;
+ int error, i, numsegs;
+
+ if (curthread->td_proc != p)
+ panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */
+
+ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
+ nd = &tempdata->nd;
+ attr = &tempdata->attr;
+ imgp = &tempdata->image_params;
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->uap = NULL;
+ imgp->attr = attr;
+ imgp->firstpage = NULL;
+ imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
+
+ if (imgp->image_header == NULL) {
+ nd->ni_vp = NULL;
+ error = ENOMEM;
+ goto fail;
+ }
+
+ /* XXXKSE */
+ NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);
+
+ if ((error = namei(nd)) != 0) {
+ nd->ni_vp = NULL;
+ goto fail;
+ }
+ NDFREE(nd, NDF_ONLY_PNBUF);
+ imgp->vp = nd->ni_vp;
+
+ /*
+ * Check permissions, modes, uid, etc on the file, and "open" it.
+ */
+ error = exec_check_permissions(imgp);
+ if (error) {
+ VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
+ goto fail;
+ }
+
+ error = exec_map_first_page(imgp);
+ /*
+ * Also make certain that the interpreter stays the same, so set
+ * its VTEXT flag, too.
+ */
+ if (error == 0)
+ nd->ni_vp->v_flag |= VTEXT;
+ VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
+ if (error)
+ goto fail;
+
+ hdr = (const Elf_Ehdr *)imgp->image_header;
+ if ((error = elf_check_header(hdr)) != 0)
+ goto fail;
+ if (hdr->e_type == ET_DYN)
+ rbase = *addr;
+ else if (hdr->e_type == ET_EXEC)
+ rbase = 0;
+ else {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ /* Only support headers that fit within first page for now */
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+
+ for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */
+ prot = 0;
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if ((error = elf_load_section(p, vmspace, nd->ni_vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr +
+ rbase,
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot)) != 0)
+ goto fail;
+ /*
+ * Establish the base address if this is the
+ * first segment.
+ */
+ if (numsegs == 0)
+ base_addr = trunc_page(phdr[i].p_vaddr + rbase);
+ numsegs++;
+ }
+ }
+ *addr = base_addr;
+ *entry=(unsigned long)hdr->e_entry + rbase;
+
+fail:
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+ if (imgp->image_header)
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
+ PAGE_SIZE);
+ if (nd->ni_vp)
+ vrele(nd->ni_vp);
+
+ free(tempdata, M_TEMP);
+
+ return error;
+}
+
+/*
+ * non static, as it can be overridden by start_init()
+ */
+#ifdef __ia64__
+int fallback_elf_brand = ELFOSABI_FREEBSD;
+#else
+int fallback_elf_brand = -1;
+#endif
+SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
+ &fallback_elf_brand, -1,
+ "ELF brand of last resort");
+
+static int
+exec_elf_imgact(struct image_params *imgp)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header;
+ const Elf_Phdr *phdr;
+ Elf_Auxargs *elf_auxargs = NULL;
+ struct vmspace *vmspace;
+ vm_prot_t prot;
+ u_long text_size = 0, data_size = 0;
+ u_long text_addr = 0, data_addr = 0;
+ u_long addr, entry = 0, proghdr = 0;
+ int error, i;
+ const char *interp = NULL;
+ Elf_Brandinfo *brand_info;
+ char *path;
+
+ GIANT_REQUIRED;
+
+ /*
+ * Do we have a valid ELF header ?
+ */
+ if (elf_check_header(hdr) != 0 || hdr->e_type != ET_EXEC)
+ return -1;
+
+ /*
+ * From here on down, we return an errno, not -1, as we've
+ * detected an ELF file.
+ */
+
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
+ /* Only support headers in first page for now */
+ return ENOEXEC;
+ }
+ phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff);
+
+ /*
+ * From this point on, we may have resources that need to be freed.
+ */
+
+ /*
+ * Yeah, I'm paranoid. There is every reason in the world to get
+ * VTEXT now since from here on out, there are places we can have
+ * a context switch. Better safe than sorry; I really don't want
+ * the file to change while it's being loaded.
+ */
+ mtx_lock(&imgp->vp->v_interlock);
+ imgp->vp->v_flag |= VTEXT;
+ mtx_unlock(&imgp->vp->v_interlock);
+
+ if ((error = exec_extract_strings(imgp)) != 0)
+ goto fail;
+
+ exec_new_vmspace(imgp);
+
+ vmspace = imgp->proc->p_vmspace;
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch(phdr[i].p_type) {
+
+ case PT_LOAD: /* Loadable segment */
+ prot = 0;
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if ((error = elf_load_section(imgp->proc,
+ vmspace, imgp->vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr,
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot)) != 0)
+ goto fail;
+
+ /*
+ * Is this .text or .data ??
+ *
+ * We only handle one each of those yet XXX
+ */
+ if (hdr->e_entry >= phdr[i].p_vaddr &&
+ hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) {
+ text_addr = trunc_page(phdr[i].p_vaddr);
+ text_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ text_addr);
+ entry = (u_long)hdr->e_entry;
+ } else {
+ data_addr = trunc_page(phdr[i].p_vaddr);
+ data_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ data_addr);
+ }
+ break;
+ case PT_INTERP: /* Path to interpreter */
+ if (phdr[i].p_filesz > MAXPATHLEN ||
+ phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
+ error = ENOEXEC;
+ goto fail;
+ }
+ interp = imgp->image_header + phdr[i].p_offset;
+ break;
+ case PT_PHDR: /* Program header table info */
+ proghdr = phdr[i].p_vaddr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ vmspace->vm_tsize = text_size >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
+ vmspace->vm_dsize = data_size >> PAGE_SHIFT;
+ vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
+
+ addr = ELF_RTLD_ADDR(vmspace);
+
+ imgp->entry_addr = entry;
+
+ brand_info = NULL;
+
+ /* We support three types of branding -- (1) the ELF EI_OSABI field
+ * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
+ * branding w/in the ELF header, and (3) path of the `interp_path'
+ * field. We should also look for an ".note.ABI-tag" ELF section now
+ * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
+ */
+
+ /* If the executable has a brand, search for it in the brand list. */
+ if (brand_info == NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ Elf_Brandinfo *bi = elf_brand_list[i];
+
+ if (bi != NULL &&
+ (hdr->e_ident[EI_OSABI] == bi->brand
+ || 0 ==
+ strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
+ bi->compat_3_brand, strlen(bi->compat_3_brand)))) {
+ brand_info = bi;
+ break;
+ }
+ }
+ }
+
+ /* Lacking a known brand, search for a recognized interpreter. */
+ if (brand_info == NULL && interp != NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ Elf_Brandinfo *bi = elf_brand_list[i];
+
+ if (bi != NULL &&
+ strcmp(interp, bi->interp_path) == 0) {
+ brand_info = bi;
+ break;
+ }
+ }
+ }
+
+ /* Lacking a recognized interpreter, try the default brand */
+ if (brand_info == NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ Elf_Brandinfo *bi = elf_brand_list[i];
+
+ if (bi != NULL && fallback_elf_brand == bi->brand) {
+ brand_info = bi;
+ break;
+ }
+ }
+ }
+
+ if (brand_info == NULL) {
+ uprintf("ELF binary type \"%u\" not known.\n",
+ hdr->e_ident[EI_OSABI]);
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ imgp->proc->p_sysent = brand_info->sysvec;
+ if (interp != NULL) {
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ snprintf(path, MAXPATHLEN, "%s%s",
+ brand_info->emul_path, interp);
+ if ((error = elf_load_file(imgp->proc, path, &addr,
+ &imgp->entry_addr)) != 0) {
+ if ((error = elf_load_file(imgp->proc, interp, &addr,
+ &imgp->entry_addr)) != 0) {
+ uprintf("ELF interpreter %s not found\n", path);
+ free(path, M_TEMP);
+ goto fail;
+ }
+ }
+ free(path, M_TEMP);
+ }
+
+ /*
+ * Construct auxargs table (used by the fixup routine)
+ */
+ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
+ elf_auxargs->execfd = -1;
+ elf_auxargs->phdr = proghdr;
+ elf_auxargs->phent = hdr->e_phentsize;
+ elf_auxargs->phnum = hdr->e_phnum;
+ elf_auxargs->pagesz = PAGE_SIZE;
+ elf_auxargs->base = addr;
+ elf_auxargs->flags = 0;
+ elf_auxargs->entry = entry;
+ elf_auxargs->trace = elf_trace;
+
+ imgp->auxargs = elf_auxargs;
+ imgp->interpreted = 0;
+
+fail:
+ return error;
+}
+
+static int
+elf_freebsd_fixup(register_t **stack_base, struct image_params *imgp)
+{
+ Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
+ register_t *pos;
+
+ pos = *stack_base + (imgp->argc + imgp->envc + 2);
+
+ if (args->trace) {
+ AUXARGS_ENTRY(pos, AT_DEBUG, 1);
+ }
+ if (args->execfd != -1) {
+ AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
+ }
+ AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY(pos, AT_BASE, args->base);
+ AUXARGS_ENTRY(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ (*stack_base)--;
+ suword(*stack_base, (long) imgp->argc);
+ return 0;
+}
+
+/*
+ * Code for generating ELF core dumps.
+ */
+
+typedef void (*segment_callback)(vm_map_entry_t, void *);
+
+/* Closure for cb_put_phdr(). */
+struct phdr_closure {
+ Elf_Phdr *phdr; /* Program header to fill in */
+ Elf_Off offset; /* Offset of segment in core file */
+};
+
+/* Closure for cb_size_segment(). */
+struct sseg_closure {
+ int count; /* Count of writable segments. */
+ size_t size; /* Total size of all writable segments. */
+};
+
+static void cb_put_phdr(vm_map_entry_t, void *);
+static void cb_size_segment(vm_map_entry_t, void *);
+static void each_writable_segment(struct proc *, segment_callback, void *);
+static int elf_corehdr(struct thread *, struct vnode *, struct ucred *,
+ int, void *, size_t);
+static void elf_puthdr(struct proc *, void *, size_t *,
+ const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int);
+static void elf_putnote(void *, size_t *, const char *, int,
+ const void *, size_t);
+
+extern int osreldate;
+
+int
+elf_coredump(td, vp, limit)
+ struct thread *td;
+ register struct vnode *vp;
+ off_t limit;
+{
+ register struct proc *p = td->td_proc;
+ register struct ucred *cred = td->td_ucred;
+ int error = 0;
+ struct sseg_closure seginfo;
+ void *hdr;
+ size_t hdrsize;
+
+ /* Size the program segments. */
+ seginfo.count = 0;
+ seginfo.size = 0;
+ each_writable_segment(p, cb_size_segment, &seginfo);
+
+ /*
+ * Calculate the size of the core file header area by making
+ * a dry run of generating it. Nothing is written, but the
+ * size is calculated.
+ */
+ hdrsize = 0;
+ elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize,
+ (const prstatus_t *)NULL, (const prfpregset_t *)NULL,
+ (const prpsinfo_t *)NULL, seginfo.count);
+
+ if (hdrsize + seginfo.size >= limit)
+ return (EFAULT);
+
+ /*
+ * Allocate memory for building the header, fill it up,
+ * and write it out.
+ */
+ hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
+ if (hdr == NULL) {
+ return EINVAL;
+ }
+ error = elf_corehdr(td, vp, cred, seginfo.count, hdr, hdrsize);
+
+ /* Write the contents of all of the writable segments. */
+ if (error == 0) {
+ Elf_Phdr *php;
+ off_t offset;
+ int i;
+
+ php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
+ offset = hdrsize;
+ for (i = 0; i < seginfo.count; i++) {
+ error = vn_rdwr_inchunks(UIO_WRITE, vp,
+ (caddr_t)php->p_vaddr,
+ php->p_filesz, offset, UIO_USERSPACE,
+ IO_UNIT | IO_DIRECT, cred, (int *)NULL, curthread); /* XXXKSE */
+ if (error != 0)
+ break;
+ offset += php->p_filesz;
+ php++;
+ }
+ }
+ free(hdr, M_TEMP);
+
+ return error;
+}
+
+/*
+ * A callback for each_writable_segment() to write out the segment's
+ * program header entry.
+ */
+static void
+cb_put_phdr(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct phdr_closure *phc = (struct phdr_closure *)closure;
+ Elf_Phdr *phdr = phc->phdr;
+
+ phc->offset = round_page(phc->offset);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_offset = phc->offset;
+ phdr->p_vaddr = entry->start;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
+ phdr->p_align = PAGE_SIZE;
+ phdr->p_flags = 0;
+ if (entry->protection & VM_PROT_READ)
+ phdr->p_flags |= PF_R;
+ if (entry->protection & VM_PROT_WRITE)
+ phdr->p_flags |= PF_W;
+ if (entry->protection & VM_PROT_EXECUTE)
+ phdr->p_flags |= PF_X;
+
+ phc->offset += phdr->p_filesz;
+ phc->phdr++;
+}
+
+/*
+ * A callback for each_writable_segment() to gather information about
+ * the number of segments and their total size.
+ */
+static void
+cb_size_segment(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct sseg_closure *ssc = (struct sseg_closure *)closure;
+
+ ssc->count++;
+ ssc->size += entry->end - entry->start;
+}
+
+/*
+ * For each writable segment in the process's memory map, call the given
+ * function with a pointer to the map entry and some arbitrary
+ * caller-supplied data.
+ */
+static void
+each_writable_segment(p, func, closure)
+ struct proc *p;
+ segment_callback func;
+ void *closure;
+{
+ vm_map_t map = &p->p_vmspace->vm_map;
+ vm_map_entry_t entry;
+
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj;
+
+ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
+ (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) !=
+ (VM_PROT_READ|VM_PROT_WRITE))
+ continue;
+
+ /*
+		** Don't include a memory segment in the coredump if
+ ** MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
+ ** madvise(2).
+ */
+ if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
+ continue;
+
+ if ((obj = entry->object.vm_object) == NULL)
+ continue;
+
+ /* Find the deepest backing object. */
+ while (obj->backing_object != NULL)
+ obj = obj->backing_object;
+
+ /* Ignore memory-mapped devices and such things. */
+ if (obj->type != OBJT_DEFAULT &&
+ obj->type != OBJT_SWAP &&
+ obj->type != OBJT_VNODE)
+ continue;
+
+ (*func)(entry, closure);
+ }
+}
+
+/*
+ * Write the core file header to the file, including padding up to
+ * the page boundary.
+ */
+static int
+elf_corehdr(td, vp, cred, numsegs, hdr, hdrsize)
+ struct thread *td;
+ struct vnode *vp;
+ struct ucred *cred;
+ int numsegs;
+ size_t hdrsize;
+ void *hdr;
+{
+ struct {
+ prstatus_t status;
+ prfpregset_t fpregset;
+ prpsinfo_t psinfo;
+ } *tempdata;
+ struct proc *p = td->td_proc;
+ size_t off;
+ prstatus_t *status;
+ prfpregset_t *fpregset;
+ prpsinfo_t *psinfo;
+
+ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK);
+ status = &tempdata->status;
+ fpregset = &tempdata->fpregset;
+ psinfo = &tempdata->psinfo;
+
+ /* Gather the information for the header. */
+ status->pr_version = PRSTATUS_VERSION;
+ status->pr_statussz = sizeof(prstatus_t);
+ status->pr_gregsetsz = sizeof(gregset_t);
+ status->pr_fpregsetsz = sizeof(fpregset_t);
+ status->pr_osreldate = osreldate;
+ status->pr_cursig = p->p_sig;
+ status->pr_pid = p->p_pid;
+ fill_regs(td, &status->pr_reg);
+
+ fill_fpregs(td, fpregset);
+
+ psinfo->pr_version = PRPSINFO_VERSION;
+ psinfo->pr_psinfosz = sizeof(prpsinfo_t);
+ strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1);
+
+ /* XXX - We don't fill in the command line arguments properly yet. */
+ strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ);
+
+ /* Fill in the header. */
+ bzero(hdr, hdrsize);
+ off = 0;
+ elf_puthdr(p, hdr, &off, status, fpregset, psinfo, numsegs);
+
+ free(tempdata, M_TEMP);
+
+ /* Write it to the core file. */
+ return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
+ UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NULL, td); /* XXXKSE */
+}
+
+static void
+elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
+ const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
+{
+ size_t ehoff;
+ size_t phoff;
+ size_t noteoff;
+ size_t notesz;
+
+ ehoff = *off;
+ *off += sizeof(Elf_Ehdr);
+
+ phoff = *off;
+ *off += (numsegs + 1) * sizeof(Elf_Phdr);
+
+ noteoff = *off;
+ elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status,
+ sizeof *status);
+ elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
+ sizeof *fpregset);
+ elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
+ sizeof *psinfo);
+ notesz = *off - noteoff;
+
+ /* Align up to a page boundary for the program segments. */
+ *off = round_page(*off);
+
+ if (dst != NULL) {
+ Elf_Ehdr *ehdr;
+ Elf_Phdr *phdr;
+ struct phdr_closure phc;
+
+ /*
+ * Fill in the ELF header.
+ */
+ ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
+ ehdr->e_ident[EI_MAG0] = ELFMAG0;
+ ehdr->e_ident[EI_MAG1] = ELFMAG1;
+ ehdr->e_ident[EI_MAG2] = ELFMAG2;
+ ehdr->e_ident[EI_MAG3] = ELFMAG3;
+ ehdr->e_ident[EI_CLASS] = ELF_CLASS;
+ ehdr->e_ident[EI_DATA] = ELF_DATA;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
+ ehdr->e_ident[EI_ABIVERSION] = 0;
+ ehdr->e_ident[EI_PAD] = 0;
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_entry = 0;
+ ehdr->e_phoff = phoff;
+ ehdr->e_flags = 0;
+ ehdr->e_ehsize = sizeof(Elf_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf_Phdr);
+ ehdr->e_phnum = numsegs + 1;
+ ehdr->e_shentsize = sizeof(Elf_Shdr);
+ ehdr->e_shnum = 0;
+ ehdr->e_shstrndx = SHN_UNDEF;
+
+ /*
+ * Fill in the program header entries.
+ */
+ phdr = (Elf_Phdr *)((char *)dst + phoff);
+
+		/* The note segment. */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = noteoff;
+ phdr->p_vaddr = 0;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = notesz;
+ phdr->p_memsz = 0;
+ phdr->p_flags = 0;
+ phdr->p_align = 0;
+ phdr++;
+
+ /* All the writable segments from the program. */
+ phc.phdr = phdr;
+ phc.offset = *off;
+ each_writable_segment(p, cb_put_phdr, &phc);
+ }
+}
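+
+/*
+ * For reference, the core file layout computed by elf_puthdr() above is
+ * roughly:
+ *
+ *	Elf_Ehdr
+ *	(numsegs + 1) Elf_Phdr entries (the PT_NOTE header plus one
+ *	    header per writable segment, filled in by cb_put_phdr())
+ *	NT_PRSTATUS, NT_FPREGSET and NT_PRPSINFO notes
+ *	padding up to the next page boundary
+ *	the contents of the writable segments themselves
+ */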
+
+static void
+elf_putnote(void *dst, size_t *off, const char *name, int type,
+ const void *desc, size_t descsz)
+{
+ Elf_Note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = descsz;
+ note.n_type = type;
+ if (dst != NULL)
+ bcopy(&note, (char *)dst + *off, sizeof note);
+ *off += sizeof note;
+ if (dst != NULL)
+ bcopy(name, (char *)dst + *off, note.n_namesz);
+ *off += roundup2(note.n_namesz, sizeof(Elf_Size));
+ if (dst != NULL)
+ bcopy(desc, (char *)dst + *off, note.n_descsz);
+ *off += roundup2(note.n_descsz, sizeof(Elf_Size));
+}
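+
+/*
+ * Each note record emitted above is laid out as an Elf_Note header
+ * (n_namesz, n_descsz, n_type) followed by the NUL-terminated name and
+ * then the descriptor, each padded to a multiple of sizeof(Elf_Size).
+ */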
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw elf_execsw = {exec_elf_imgact, "ELF"};
+EXEC_SET(elf, elf_execsw);
diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c
new file mode 100644
index 0000000..57a5c1d
--- /dev/null
+++ b/sys/kern/imgact_gzip.c
@@ -0,0 +1,385 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ * This module handles execution of a.out files which have been run through
+ * "gzip". This saves diskspace, but wastes cpu-cycles and VM.
+ *
+ * TODO:
+ * text-segments should be made R/O after being filled
+ * is the vm-stuff safe ?
+ * should handle the entire header of gzip'ed stuff.
+ * inflate isn't quite reentrant yet...
+ * error-handling is a mess...
+ * so is the rest...
+ *	tidy up unnecessary includes
+ */
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/inflate.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+struct imgact_gzip {
+ struct image_params *ip;
+ struct exec a_out;
+ int error;
+ int gotheader;
+ int where;
+ u_char *inbuf;
+ u_long offset;
+ u_long output;
+ u_long len;
+ int idx;
+ u_long virtual_offset, file_offset, file_end, bss_size;
+};
+
+static int exec_gzip_imgact(struct image_params *imgp);
+static int NextByte(void *vp);
+static int do_aout_hdr(struct imgact_gzip *);
+static int Flush(void *vp, u_char *, u_long siz);
+
+static int
+exec_gzip_imgact(imgp)
+ struct image_params *imgp;
+{
+ int error, error2 = 0;
+ const u_char *p = (const u_char *) imgp->image_header;
+ struct imgact_gzip igz;
+ struct inflate infl;
+ struct vmspace *vmspace;
+
+ /* If these four are not OK, it isn't a gzip file */
+ if (p[0] != 0x1f)
+ return -1; /* 0 Simply magic */
+ if (p[1] != 0x8b)
+ return -1; /* 1 Simply magic */
+ if (p[2] != 0x08)
+ return -1; /* 2 Compression method */
+ if (p[9] != 0x03)
+ return -1; /* 9 OS compressed on */
+
+ /*
+ * If this one contains anything but a comment or a filename marker,
+ * we don't want to chew on it
+ */
+ if (p[3] & ~(0x18))
+ return ENOEXEC; /* 3 Flags */
+
+ /* These are of no use to us */
+ /* 4-7 Timestamp */
+ /* 8 Extra flags */
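+	/*
+	 * For reference, the fixed part of the gzip member header that is
+	 * inspected above is:
+	 *	0-1	magic (0x1f, 0x8b)
+	 *	2	compression method (8 = deflate)
+	 *	3	flags (only the name and comment bits are tolerated)
+	 *	4-7	modification time
+	 *	8	extra flags
+	 *	9	operating system
+	 * Decompression starts at offset 10, once the optional
+	 * NUL-terminated file name and comment have been skipped below.
+	 */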
+
+ bzero(&igz, sizeof igz);
+ bzero(&infl, sizeof infl);
+ infl.gz_private = (void *) &igz;
+ infl.gz_input = NextByte;
+ infl.gz_output = Flush;
+
+ igz.ip = imgp;
+ igz.idx = 10;
+
+ if (p[3] & 0x08) { /* skip a filename */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ if (p[3] & 0x10) { /* skip a comment */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ igz.len = imgp->attr->va_size;
+
+ error = inflate(&infl);
+
+ /*
+	 * The unzipped file may not even have been long enough to contain
+	 * a header, in which case Flush() never got a chance to return an
+	 * error. Check for this.
+ */
+ if ( !igz.gotheader )
+ return ENOEXEC;
+
+ if ( !error ) {
+ vmspace = imgp->proc->p_vmspace;
+ error = vm_map_protect(&vmspace->vm_map,
+ (vm_offset_t) vmspace->vm_taddr,
+ (vm_offset_t) (vmspace->vm_taddr +
+ (vmspace->vm_tsize << PAGE_SHIFT)) ,
+ VM_PROT_READ|VM_PROT_EXECUTE,0);
+ }
+
+ if (igz.inbuf) {
+ error2 =
+ vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
+ (vm_offset_t) igz.inbuf + PAGE_SIZE);
+ }
+ if (igz.error || error || error2) {
+ printf("Output=%lu ", igz.output);
+ printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
+ error, igz.error, error2, igz.where);
+ }
+ if (igz.error)
+ return igz.error;
+ if (error)
+ return ENOEXEC;
+ if (error2)
+ return error2;
+ return 0;
+}
+
+static int
+do_aout_hdr(struct imgact_gzip * gz)
+{
+ int error;
+ struct vmspace *vmspace;
+ vm_offset_t vmaddr;
+
+ /*
+ * Set file/virtual offset based on a.out variant. We do two cases:
+ * host byte order and network byte order (for NetBSD compatibility)
+ */
+ switch ((int) (gz->a_out.a_magic & 0xffff)) {
+ case ZMAGIC:
+ gz->virtual_offset = 0;
+ if (gz->a_out.a_text) {
+ gz->file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ gz->file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ gz->where = __LINE__;
+ return (-1);
+ }
+ }
+
+ gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if ( /* entry point must lie within the text region */
+ gz->a_out.a_entry < gz->virtual_offset ||
+ gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
+
+ /* text and data size must each be page rounded */
+ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
+ gz->where = __LINE__;
+ return (-1);
+ }
+ /*
+ * text/data/bss must not exceed limits
+ */
+ mtx_assert(&Giant, MA_OWNED);
+ if ( /* text can't exceed maximum text size */
+ gz->a_out.a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ gz->a_out.a_data + gz->bss_size >
+ gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) {
+ gz->where = __LINE__;
+ return (ENOMEM);
+ }
+ /* Find out how far we should go */
+ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(gz->ip);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(gz->ip);
+
+ vmspace = gz->ip->proc->p_vmspace;
+
+ vmaddr = gz->virtual_offset;
+
+ error = vm_mmap(&vmspace->vm_map,
+ &vmaddr,
+ gz->a_out.a_text + gz->a_out.a_data,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
+ 0,
+ 0);
+
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ if (gz->bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data.
+ * "bss" = 'block started by symbol' - named after the
+ * IBM 7090 instruction of the same name.
+ */
+ vmaddr = gz->virtual_offset + gz->a_out.a_text +
+ gz->a_out.a_data;
+ error = vm_map_find(&vmspace->vm_map,
+ NULL,
+ 0,
+ &vmaddr,
+ gz->bss_size,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ }
+ /* Fill in process VM information */
+ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (gz->virtual_offset + gz->a_out.a_text);
+
+ /* Fill in image_params */
+ gz->ip->interpreted = 0;
+ gz->ip->entry_addr = gz->a_out.a_entry;
+
+ gz->ip->proc->p_sysent = &aout_sysvec;
+
+ return 0;
+}
+
+static int
+NextByte(void *vp)
+{
+ int error;
+ struct imgact_gzip *igz = (struct imgact_gzip *) vp;
+
+ if (igz->idx >= igz->len) {
+ igz->where = __LINE__;
+ return GZ_EOF;
+ }
+ if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
+ return igz->inbuf[(igz->idx++) - igz->offset];
+ }
+ if (igz->inbuf) {
+ error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
+ (vm_offset_t) igz->inbuf + PAGE_SIZE);
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ }
+ igz->offset = igz->idx & ~PAGE_MASK;
+
+ error = vm_mmap(kernel_map, /* map */
+ (vm_offset_t *) & igz->inbuf, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ (caddr_t) igz->ip->vp, /* vnode */
+ igz->offset); /* offset */
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ return igz->inbuf[(igz->idx++) - igz->offset];
+}
+
+static int
+Flush(void *vp, u_char * ptr, u_long siz)
+{
+ struct imgact_gzip *gz = (struct imgact_gzip *) vp;
+ u_char *p = ptr, *q;
+ int i;
+
+	/* First, find an a.out header */
+ if (gz->output < sizeof gz->a_out) {
+ q = (u_char *) & gz->a_out;
+ i = min(siz, sizeof gz->a_out - gz->output);
+ bcopy(p, q + gz->output, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ if (gz->output == sizeof gz->a_out) {
+ gz->gotheader = 1;
+ i = do_aout_hdr(gz);
+ if (i == -1) {
+ if (!gz->where)
+ gz->where = __LINE__;
+ gz->error = ENOEXEC;
+ return ENOEXEC;
+ } else if (i) {
+ gz->where = __LINE__;
+ gz->error = i;
+ return ENOEXEC;
+ }
+ if (gz->file_offset == 0) {
+ q = (u_char *) (uintptr_t) gz->virtual_offset;
+ copyout(&gz->a_out, q, sizeof gz->a_out);
+ }
+ }
+ }
+ /* Skip over zero-padded first PAGE if needed */
+ if (gz->output < gz->file_offset &&
+ gz->output + siz > gz->file_offset) {
+ i = min(siz, gz->file_offset - gz->output);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
+ i = min(siz, gz->file_end - gz->output);
+ q = (u_char *) (uintptr_t)
+ (gz->virtual_offset + gz->output - gz->file_offset);
+ copyout(p, q, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ gz->output += siz;
+ return 0;
+}
+
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
+EXEC_SET(execgzip, gzip_execsw);
diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c
new file mode 100644
index 0000000..8480fcc
--- /dev/null
+++ b/sys/kern/imgact_shell.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
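+
+/*
+ * '#' is 0x23 and '!' is 0x21, so the first two bytes of a script read
+ * as a 16-bit short are 0x2123 on a little-endian machine and 0x2321 on
+ * a big-endian one.
+ */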
+
+/*
+ * Shell interpreter image activator. An interpreter name beginning
+ * at imgp->stringbase is the minimal successful exit requirement.
+ */
+int
+exec_shell_imgact(imgp)
+ struct image_params *imgp;
+{
+ const char *image_header = imgp->image_header;
+ const char *ihp, *line_endp;
+ char *interp;
+
+ /* a shell script? */
+ if (((const short *) image_header)[0] != SHELLMAGIC)
+ return(-1);
+
+ /*
+ * Don't allow a shell script to be the shell for a shell
+ * script. :-)
+ */
+ if (imgp->interpreted)
+ return(ENOEXEC);
+
+ imgp->interpreted = 1;
+
+ /*
+ * Copy shell name and arguments from image_header into string
+ * buffer.
+ */
+
+ /*
+	 * Find the end of the line; return ENAMETOOLONG if the line is
+	 * longer than MAXSHELLCMDLEN.
+ */
+ for (ihp = &image_header[2]; *ihp != '\n' && *ihp != '#'; ++ihp) {
+ if (ihp >= &image_header[MAXSHELLCMDLEN])
+ return(ENAMETOOLONG);
+ }
+ line_endp = ihp;
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* Skip over leading spaces - until the interpreter name */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ /* copy the interpreter name */
+ interp = imgp->interpreter_name;
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t'))
+ *interp++ = *ihp++;
+ *interp = '\0';
+
+ /* Disallow a null interpreter filename */
+ if (*imgp->interpreter_name == '\0')
+ return(ENOEXEC);
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* copy the interpreter name and arguments */
+ while (ihp < line_endp) {
+ /* Skip over leading spaces */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ if (ihp < line_endp) {
+ /*
+ * Copy to end of token. No need to watch stringspace
+ * because this is at the front of the string buffer
+ * and the maximum shell command length is tiny.
+ */
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) {
+ *imgp->stringp++ = *ihp++;
+ imgp->stringspace--;
+ }
+
+ *imgp->stringp++ = 0;
+ imgp->stringspace--;
+
+ imgp->argc++;
+ }
+ }
+
+ imgp->argv0 = imgp->uap->fname;
+
+ return(0);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw shell_execsw = { exec_shell_imgact, "#!" };
+EXEC_SET(shell, shell_execsw);
diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c
new file mode 100644
index 0000000..2a16ba2
--- /dev/null
+++ b/sys/kern/inflate.c
@@ -0,0 +1,1078 @@
+/*
+ * Most parts of this file are not covered by:
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/inflate.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#endif
+#include <sys/malloc.h>
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees");
+#endif
+
+/* needed to make inflate() work */
+#define uch u_char
+#define ush u_short
+#define ulg u_long
+
+/* Stuff to make inflate() work */
+#ifdef _KERNEL
+#define memzero(dest,len) bzero(dest,len)
+#endif
+#define NOMEMCPY
+#ifdef _KERNEL
+#define FPRINTF printf
+#else
+extern void putstr (char *);
+#define FPRINTF putstr
+#endif
+
+#define FLUSH(x,y) { \
+ int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \
+ if (foo) \
+ return foo; \
+ }
+
+static const int qflag = 0;
+
+#ifndef _KERNEL /* want to use this file in kzip also */
+extern unsigned char *kzipmalloc (int);
+extern void kzipfree (void*);
+#define malloc(x, y, z) kzipmalloc((x))
+#define free(x, y) kzipfree((x))
+#endif
+
+/*
+ * This came from unzip-5.12. I have changed the flow to pass
+ * a structure pointer around, thus hopefully making it re-entrant.
+ * Poul-Henning
+ */
+
+/* inflate.c -- put in the public domain by Mark Adler
+ version c14o, 23 August 1994 */
+
+/* You can do whatever you like with this source file, though I would
+ prefer that if you modify it and redistribute it that you include
+ comments to that effect with your name and the date. Thank you.
+
+ History:
+ vers date who what
+ ---- --------- -------------- ------------------------------------
+ a ~~ Feb 92 M. Adler used full (large, one-step) lookup table
+ b1 21 Mar 92 M. Adler first version with partial lookup tables
+ b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks
+ b3 22 Mar 92 M. Adler sped up match copies, cleaned up some
+ b4 25 Mar 92 M. Adler added prototypes; removed window[] (now
+ is the responsibility of unzip.h--also
+ changed name to slide[]), so needs diffs
+ for unzip.c and unzip.h (this allows
+ compiling in the small model on MSDOS);
+ fixed cast of q in huft_build();
+ b5 26 Mar 92 M. Adler got rid of unintended macro recursion.
+ b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed
+ bug in inflate_fixed().
+ c1 30 Mar 92 M. Adler removed lbits, dbits environment variables.
+ changed BMAX to 16 for explode. Removed
+ OUTB usage, and replaced it with flush()--
+ this was a 20% speed improvement! Added
+ an explode.c (to replace unimplod.c) that
+ uses the huft routines here. Removed
+ register union.
+ c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
+ c3 10 Apr 92 M. Adler reduced memory of code tables made by
+ huft_build significantly (factor of two to
+ three).
+ c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy().
+ worked around a Turbo C optimization bug.
+ c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing
+ the 32K window size for specialized
+ applications.
+ c6 31 May 92 M. Adler added some typecasts to eliminate warnings
+ c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug).
+ c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug.
+ c9 9 Oct 92 M. Adler removed a memory error message (~line 416).
+ c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch,
+ removed old inflate, renamed inflate_entry
+ to inflate, added Mark's fix to a comment.
+ c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees.
+ c11 2 Jan 93 M. Adler fixed bug in detection of incomplete
+ tables, and removed assumption that EOB is
+ the longest code (bad assumption).
+ c12 3 Jan 93 M. Adler make tables for fixed blocks only once.
+ c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c
+ outputs one zero length code for an empty
+ distance tree).
+ c14 12 Mar 93 M. Adler made inflate.c standalone with the
+ introduction of inflate.h.
+ c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470.
+ c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays
+ to static for Amiga.
+ c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing.
+ c14e 8 Oct 93 G. Roelofs changed memset() to memzero().
+ c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace()
+ conditional; added inflate_free().
+ c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug)
+ c14h 7 Dec 93 C. Ghisler huft_build() optimizations.
+ c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing;
+ G. Roelofs check NEXTBYTE macro for GZ_EOF.
+ c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd
+ GZ_EOF check.
+ c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings.
+ c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
+ to avoid bug in Encore compiler.
+ c14m 7 Jul 94 P. Kienitz modified to allow assembler version of
+ inflate_codes() (define ASM_INFLATECODES)
+ c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions
+ c14o 23 Aug 94 C. Spieler added a newline to a debug statement;
+ G. Roelofs added another typecast to avoid MSC warning
+ */
+
+
+/*
+ Inflate deflated (PKZIP's method 8 compressed) data. The compression
+ method searches for as much of the current string of bytes (up to a
+ length of 258) in the previous 32K bytes. If it doesn't find any
+ matches (of at least length 3), it codes the next byte. Otherwise, it
+ codes the length of the matched string and its distance backwards from
+ the current position. There is a single Huffman code that codes both
+ single bytes (called "literals") and match lengths. A second Huffman
+ code codes the distance information, which follows a length code. Each
+ length or distance code actually represents a base value and a number
+ of "extra" (sometimes zero) bits to fetch and add to the base value. At
+ the end of each deflated block is a special end-of-block (EOB) literal/
+ length code. The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+ The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
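+
+/*
+ * A rough sketch of the decoding loop described above (the real thing is
+ * inflate_codes() further down):
+ *
+ *	for (;;) {
+ *		c = decode literal/length code;
+ *		if (c is EOB)
+ *			break;
+ *		if (c is a literal)
+ *			emit the byte;
+ *		else {
+ *			d = decode distance code;
+ *			copy c bytes starting d bytes back in the sliding
+ *			window to the output;
+ *		}
+ *	}
+ */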
+
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+ defined for them. Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type is really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6). Therefore, to output three times the length, you output
+ three codes (1+1+1), whereas to output four times the same length,
+ you only need two codes (1+3). Hmm.
+ 10. In the tree reconstruction algorithm, Code = Code + Increment
+ only if BitLength(i) is not zero. (Pretty obvious.)
+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)
+ 12. Note: length code 284 can represent 227-258, but length code 285
+ really is 258. The last length deserves its own, short code
+ since it gets used a lot in very redundant files. The length
+ 258 is special since 258 - 3 (the min match length) is 255.
+ 13. The literal/length and distance code bit lengths are read as a
+ single stream of lengths. It is possible (and advantageous) for
+ a repeat code (16, 17, or 18) to go across the boundary between
+ the two sets of lengths.
+ */
+
+
+#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */
+
+/*
+ inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE,
+ FLUSH() and memzero macros. If the window size is not 32K, it
+ should also define GZ_WSIZE. If INFMOD is defined, it can include
+ compiled functions to support the NEXTBYTE and/or FLUSH() macros.
+ There are defaults for NEXTBYTE and FLUSH() below for use as
+ examples of what those functions need to do. Normally, you would
+ also want FLUSH() to compute a crc on the data. inflate.h also
+ needs to provide these typedefs:
+
+ typedef unsigned char uch;
+ typedef unsigned short ush;
+ typedef unsigned long ulg;
+
+ This module uses the external functions malloc() and free() (and
+ probably memset() or bzero() in the memzero() macro). Their
+ prototypes are normally found in <string.h> and <stdlib.h>.
+ */
+#define INFMOD /* tell inflate.h to include code to be
+ * compiled */
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+ that have 16-bit pointers (e.g. PC's in the small or medium model).
+ Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16
+ means that v is a literal, 16 < e < 32 means that v is a pointer to
+ the next table, which codes e - 16 bits, and lastly e == 99 indicates
+ an unused code. If a code with e == 99 is looked up, this implies an
+ error in the data. */
+struct huft {
+ uch e; /* number of extra bits or operation */
+ uch b; /* number of bits in this code or subcode */
+ union {
+ ush n; /* literal, length base, or distance
+ * base */
+ struct huft *t; /* pointer to next level of table */
+ } v;
+};
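+
+/*
+ * For example, a one-step literal entry for the byte 'A' with an 8-bit
+ * code would look like { .e = 16, .b = 8, .v.n = 'A' }, while an entry
+ * that chains to a sub-table decoding three further bits would have
+ * e == 16 + 3 and v.t pointing at that sub-table.
+ */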
+
+
+/* Function prototypes */
+static int huft_build(struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *);
+static int huft_free(struct inflate *, struct huft *);
+static int inflate_codes(struct inflate *, struct huft *, struct huft *, int, int);
+static int inflate_stored(struct inflate *);
+static int xinflate(struct inflate *);
+static int inflate_fixed(struct inflate *);
+static int inflate_dynamic(struct inflate *);
+static int inflate_block(struct inflate *, int *);
+
+/* The inflate algorithm uses a sliding 32K byte window on the uncompressed
+ stream to find repeated byte strings. This is implemented here as a
+ circular buffer. The index is updated simply by incrementing and then
+ and'ing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32K area. It is assumed
+ to be usable as if it were declared "uch slide[32768];" or as just
+ "uch *slide;" and then malloc'ed in the latter case. The definition
+ must be in inflate.h, included above. */
+
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+
+/* Order of the bit length code lengths */
+static const unsigned border[] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ /* note: see note #13 above about the 258 in this list. */
+
+static const ush cplext[] = { /* Extra bits for literal codes 257..285 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+
+static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577};
+
+static const ush cpdext[] = { /* Extra bits for distance codes */
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 12, 13, 13};
+
+/* And'ing with mask[n] masks the lower n bits */
+static const ush mask[] = {
+ 0x0000,
+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* Macros for inflate() bit peeking and grabbing.
+ The usage is:
+
+ NEEDBITS(glbl,j)
+ x = b & mask[j];
+ DUMPBITS(j)
+
+ where NEEDBITS makes sure that b has at least j bits in it, and
+ DUMPBITS removes the bits from b. The macros use the variable k
+ for the number of bits in b. Normally, b and k are register
+ variables for speed, and are initialized at the beginning of a
+ routine that uses these macros from a global bit buffer and count.
+
+ In order to not ask for more bits than there are in the compressed
+ stream, the Huffman tables are constructed to only ask for just
+ enough bits to make up the end-of-block code (value 256). Then no
+ bytes need to be "returned" to the buffer at the end of the last
+ block. See the huft_build() routine.
+ */
+
+/*
+ * The following 2 were global variables.
+ * They are now fields of the inflate structure.
+ */
+
+#define NEEDBITS(glbl,n) { \
+ while(k<(n)) { \
+ int c=(*glbl->gz_input)(glbl->gz_private); \
+ if(c==GZ_EOF) \
+ return 1; \
+ b|=((ulg)c)<<k; \
+ k+=8; \
+ } \
+ }
+
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
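+
+/*
+ * Typical use, e.g. reading the 5-bit literal/length code count in
+ * inflate_dynamic() below:
+ *
+ *	NEEDBITS(glbl, 5)
+ *	nl = 257 + ((unsigned) b & 0x1f);
+ *	DUMPBITS(5)
+ */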
+
+/*
+ Huffman code decoding is performed using a multi-level table lookup.
+ The fastest way to decode is to simply build a lookup table whose
+ size is determined by the longest code. However, the time it takes
+ to build this table can also be a factor if the data being decoded
+ is not very long. The most common codes are necessarily the
+ shortest codes, so those codes dominate the decoding time, and hence
+ the speed. The idea is you can have a shorter table that decodes the
+ shorter, more probable codes, and then point to subsidiary tables for
+ the longer codes. The time it costs to decode the longer codes is
+ then traded against the time it takes to make longer tables.
+
+ The results of this trade are in the variables lbits and dbits
+ below. lbits is the number of bits the first level table for literal/
+ length codes can decode in one step, and dbits is the same thing for
+ the distance codes. Subsequent tables are also less than or equal to
+ those sizes. These values may be adjusted either when all of the
+ codes are shorter than that, in which case the longest code length in
+ bits is used, or when the shortest code is *longer* than the requested
+ table size, in which case the length of the shortest code in bits is
+ used.
+
+ There are two different values for the two tables, since they code a
+ different number of possibilities each. The literal/length table
+ codes 286 possible values, or in a flat code, a little over eight
+ bits. The distance table codes 30 possible values, or a little less
+ than five bits, flat. The optimum values for speed end up being
+ about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+ The optimum values may differ though from machine to machine, and
+ possibly even between compilers. Your mileage may vary.
+ */
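+
+/*
+ * In code, this lookup is the inner loop of inflate_codes() below: index
+ * the first-level table with `bl' (or `bd') bits, and while the entry's
+ * e field indicates a link (e > 16), dump the bits just consumed and
+ * index the sub-table it points to with e - 16 more bits.
+ */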
+
+static const int lbits = 9; /* bits in base literal/length lookup table */
+static const int dbits = 6; /* bits in base distance lookup table */
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16 /* maximum bit length of any code (16 for
+ * explode) */
+#define N_MAX 288 /* maximum number of codes in any set */
+
+/* Given a list of code lengths and a maximum table size, make a set of
+ tables to decode that set of codes. Return zero on success, one if
+ the given code set is incomplete (the tables are still built in this
+ case), two if the input is invalid (all zero length codes or an
+ oversubscribed set of lengths), and three if not enough memory.
+ The code with value 256 is special, and the tables are constructed
+ so that no bits beyond that code are fetched when that code is
+ decoded. */
+static int
+huft_build(glbl, b, n, s, d, e, t, m)
+ struct inflate *glbl;
+ unsigned *b; /* code lengths in bits (all assumed <= BMAX) */
+ unsigned n; /* number of codes (assumed <= N_MAX) */
+ unsigned s; /* number of simple-valued codes (0..s-1) */
+ const ush *d; /* list of base values for non-simple codes */
+ const ush *e; /* list of extra bits for non-simple codes */
+ struct huft **t; /* result: starting table */
+ int *m; /* maximum lookup bits, returns actual */
+{
+ unsigned a; /* counter for codes of length k */
+ unsigned c[BMAX + 1]; /* bit length count table */
+ unsigned el; /* length of EOB code (value 256) */
+ unsigned f; /* i repeats in table every f entries */
+ int g; /* maximum code length */
+ int h; /* table level */
+ register unsigned i; /* counter, current code */
+ register unsigned j; /* counter */
+ register int k; /* number of bits in current code */
+ int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */
+ int *l = lx + 1; /* stack of bits per table */
+ register unsigned *p; /* pointer into c[], b[], or v[] */
+ register struct huft *q;/* points to current table */
+ struct huft r; /* table entry for structure assignment */
+ struct huft *u[BMAX];/* table stack */
+ unsigned v[N_MAX]; /* values in order of bit length */
+ register int w; /* bits before this table == (l * h) */
+ unsigned x[BMAX + 1]; /* bit offsets, then code stack */
+ unsigned *xp; /* pointer into x */
+ int y; /* number of dummy codes added */
+ unsigned z; /* number of entries in current table */
+
+ /* Generate counts for each bit length */
+ el = n > 256 ? b[256] : BMAX; /* set length of EOB code, if any */
+#ifdef _KERNEL
+ memzero((char *) c, sizeof(c));
+#else
+ for (i = 0; i < BMAX+1; i++)
+ c [i] = 0;
+#endif
+ p = b;
+ i = n;
+ do {
+ c[*p]++;
+ p++; /* assume all entries <= BMAX */
+ } while (--i);
+ if (c[0] == n) { /* null input--all zero length codes */
+ *t = (struct huft *) NULL;
+ *m = 0;
+ return 0;
+ }
+ /* Find minimum and maximum length, bound *m by those */
+ for (j = 1; j <= BMAX; j++)
+ if (c[j])
+ break;
+ k = j; /* minimum code length */
+ if ((unsigned) *m < j)
+ *m = j;
+ for (i = BMAX; i; i--)
+ if (c[i])
+ break;
+ g = i; /* maximum code length */
+ if ((unsigned) *m > i)
+ *m = i;
+
+ /* Adjust last length count to fill out codes, if needed */
+ for (y = 1 << j; j < i; j++, y <<= 1)
+ if ((y -= c[j]) < 0)
+ return 2; /* bad input: more codes than bits */
+ if ((y -= c[i]) < 0)
+ return 2;
+ c[i] += y;
+
+ /* Generate starting offsets into the value table for each length */
+ x[1] = j = 0;
+ p = c + 1;
+ xp = x + 2;
+ while (--i) { /* note that i == g from above */
+ *xp++ = (j += *p++);
+ }
+
+ /* Make a table of values in order of bit lengths */
+ p = b;
+ i = 0;
+ do {
+ if ((j = *p++) != 0)
+ v[x[j]++] = i;
+ } while (++i < n);
+
+ /* Generate the Huffman codes and for each, make the table entries */
+ x[0] = i = 0; /* first Huffman code is zero */
+ p = v; /* grab values in bit order */
+ h = -1; /* no tables yet--level -1 */
+ w = l[-1] = 0; /* no bits decoded yet */
+ u[0] = (struct huft *) NULL; /* just to keep compilers happy */
+ q = (struct huft *) NULL; /* ditto */
+ z = 0; /* ditto */
+
+ /* go through the bit lengths (k already is bits in shortest code) */
+ for (; k <= g; k++) {
+ a = c[k];
+ while (a--) {
+ /*
+ * here i is the Huffman code of length k bits for
+ * value *p
+ */
+ /* make tables up to required level */
+ while (k > w + l[h]) {
+ w += l[h++]; /* add bits already decoded */
+
+ /*
+ * compute minimum size table less than or
+ * equal to *m bits
+ */
+ z = (z = g - w) > (unsigned) *m ? *m : z; /* upper limit */
+				if ((f = 1 << (j = k - w)) > a + 1) {	/* try a k-w bit table:
+								 * too few codes for k-w
+								 * bit table */
+ f -= a + 1; /* deduct codes from
+ * patterns left */
+ xp = c + k;
+ while (++j < z) { /* try smaller tables up
+ * to z bits */
+ if ((f <<= 1) <= *++xp)
+ break; /* enough codes to use
+ * up j bits */
+ f -= *xp; /* else deduct codes
+ * from patterns */
+ }
+ }
+ if ((unsigned) w + j > el && (unsigned) w < el)
+ j = el - w; /* make EOB code end at
+ * table */
+ z = 1 << j; /* table entries for j-bit
+ * table */
+ l[h] = j; /* set table size in stack */
+
+ /* allocate and link in new table */
+ if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) ==
+ (struct huft *) NULL) {
+ if (h)
+ huft_free(glbl, u[0]);
+ return 3; /* not enough memory */
+ }
+ glbl->gz_hufts += z + 1; /* track memory usage */
+ *t = q + 1; /* link to list for
+ * huft_free() */
+ *(t = &(q->v.t)) = (struct huft *) NULL;
+ u[h] = ++q; /* table starts after link */
+
+ /* connect to last table, if there is one */
+ if (h) {
+ x[h] = i; /* save pattern for
+ * backing up */
+ r.b = (uch) l[h - 1]; /* bits to dump before
+ * this table */
+ r.e = (uch) (16 + j); /* bits in this table */
+ r.v.t = q; /* pointer to this table */
+ j = (i & ((1 << w) - 1)) >> (w - l[h - 1]);
+ u[h - 1][j] = r; /* connect to last table */
+ }
+ }
+
+ /* set up table entry in r */
+ r.b = (uch) (k - w);
+ if (p >= v + n)
+ r.e = 99; /* out of values--invalid
+ * code */
+ else if (*p < s) {
+ r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block
+ * code */
+ r.v.n = *p++; /* simple code is just the
+ * value */
+ } else {
+ r.e = (uch) e[*p - s]; /* non-simple--look up
+ * in lists */
+ r.v.n = d[*p++ - s];
+ }
+
+ /* fill code-like entries with r */
+ f = 1 << (k - w);
+ for (j = i >> w; j < z; j += f)
+ q[j] = r;
+
+ /* backwards increment the k-bit code i */
+ for (j = 1 << (k - 1); i & j; j >>= 1)
+ i ^= j;
+ i ^= j;
+
+ /* backup over finished tables */
+ while ((i & ((1 << w) - 1)) != x[h])
+ w -= l[--h]; /* don't need to update q */
+ }
+ }
+
+ /* return actual size of base table */
+ *m = l[0];
+
+ /* Return true (1) if we were given an incomplete table */
+ return y != 0 && g != 1;
+}
+
+static int
+huft_free(glbl, t)
+ struct inflate *glbl;
+ struct huft *t; /* table to free */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+ list of the tables it made, with the links in a dummy first entry of
+ each table. */
+{
+ register struct huft *p, *q;
+
+ /* Go through linked list, freeing from the malloced (t[-1]) address. */
+ p = t;
+ while (p != (struct huft *) NULL) {
+ q = (--p)->v.t;
+ free(p, M_GZIP);
+ p = q;
+ }
+ return 0;
+}
+
+/* inflate (decompress) the codes in a deflated (compressed) block.
+ Return an error code or zero if it all goes ok. */
+static int
+inflate_codes(glbl, tl, td, bl, bd)
+ struct inflate *glbl;
+ struct huft *tl, *td;/* literal/length and distance decoder tables */
+ int bl, bd; /* number of bits decoded by tl[] and td[] */
+{
+ register unsigned e; /* table entry flag/number of extra bits */
+ unsigned n, d; /* length and index for copy */
+ unsigned w; /* current window position */
+ struct huft *t; /* pointer to table entry */
+ unsigned ml, md; /* masks for bl and bd bits */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* inflate the coded data */
+ ml = mask[bl]; /* precompute masks for speed */
+ md = mask[bd];
+ while (1) { /* do until end of block */
+ NEEDBITS(glbl, (unsigned) bl)
+ if ((e = (t = tl + ((unsigned) b & ml))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ if (e == 16) { /* then it's a literal */
+ glbl->gz_slide[w++] = (uch) t->v.n;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } else { /* it's an EOB or a length */
+ /* exit if end of block */
+ if (e == 15)
+ break;
+
+ /* get length of block to copy */
+ NEEDBITS(glbl, e)
+ n = t->v.n + ((unsigned) b & mask[e]);
+ DUMPBITS(e);
+
+ /* decode distance of block to copy */
+ NEEDBITS(glbl, (unsigned) bd)
+ if ((e = (t = td + ((unsigned) b & md))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ NEEDBITS(glbl, e)
+ d = w - t->v.n - ((unsigned) b & mask[e]);
+ DUMPBITS(e)
+ /* do the copy */
+ do {
+ n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e);
+#ifndef NOMEMCPY
+ if (w - d >= e) { /* (this test assumes
+ * unsigned comparison) */
+ memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e);
+ w += e;
+ d += e;
+ } else /* do it slow to avoid memcpy()
+ * overlap */
+#endif /* !NOMEMCPY */
+ do {
+ glbl->gz_slide[w++] = glbl->gz_slide[d++];
+ } while (--e);
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } while (n);
+ }
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+
+ /* done */
+ return 0;
+}
+
+/* "decompress" an inflated type 0 (stored) block. */
+static int
+inflate_stored(glbl)
+ struct inflate *glbl;
+{
+ unsigned n; /* number of bytes in block */
+ unsigned w; /* current window position */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* go to byte boundary */
+ n = k & 7;
+ DUMPBITS(n);
+
+ /* get the length and its complement */
+ NEEDBITS(glbl, 16)
+ n = ((unsigned) b & 0xffff);
+ DUMPBITS(16)
+ NEEDBITS(glbl, 16)
+ if (n != (unsigned) ((~b) & 0xffff))
+ return 1; /* error in compressed data */
+ DUMPBITS(16)
+ /* read and output the compressed data */
+ while (n--) {
+ NEEDBITS(glbl, 8)
+ glbl->gz_slide[w++] = (uch) b;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ DUMPBITS(8)
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+ return 0;
+}
+
+/* decompress an inflated type 1 (fixed Huffman codes) block. We should
+ either replace this with a custom decoder, or at least precompute the
+ Huffman tables. */
+static int
+inflate_fixed(glbl)
+ struct inflate *glbl;
+{
+ /* if first time, set up tables for fixed blocks */
+ if (glbl->gz_fixed_tl == (struct huft *) NULL) {
+ int i; /* temporary variable */
+ static unsigned l[288]; /* length list for huft_build */
+
+ /* literal table */
+ for (i = 0; i < 144; i++)
+ l[i] = 8;
+ for (; i < 256; i++)
+ l[i] = 9;
+ for (; i < 280; i++)
+ l[i] = 7;
+ for (; i < 288; i++) /* make a complete, but wrong code
+ * set */
+ l[i] = 8;
+ glbl->gz_fixed_bl = 7;
+ if ((i = huft_build(glbl, l, 288, 257, cplens, cplext,
+ &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) {
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ /* distance table */
+ for (i = 0; i < 30; i++) /* make an incomplete code
+ * set */
+ l[i] = 5;
+ glbl->gz_fixed_bd = 5;
+ if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext,
+ &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ }
+ /* decompress until an end-of-block code */
+ return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0;
+}
+
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+static int
+inflate_dynamic(glbl)
+ struct inflate *glbl;
+{
+ int i; /* temporary variables */
+ unsigned j;
+ unsigned l; /* last length */
+ unsigned m; /* mask for bit lengths table */
+ unsigned n; /* number of lengths to get */
+ struct huft *tl; /* literal/length code table */
+ struct huft *td; /* distance code table */
+ int bl; /* lookup bits for tl */
+ int bd; /* lookup bits for td */
+ unsigned nb; /* number of bit length codes */
+ unsigned nl; /* number of literal/length codes */
+ unsigned nd; /* number of distance codes */
+#ifdef PKZIP_BUG_WORKAROUND
+ unsigned ll[288 + 32]; /* literal/length and distance code
+ * lengths */
+#else
+ unsigned ll[286 + 30]; /* literal/length and distance code
+ * lengths */
+#endif
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in table lengths */
+ NEEDBITS(glbl, 5)
+ nl = 257 + ((unsigned) b & 0x1f); /* number of
+ * literal/length codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 5)
+ nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 4)
+ nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */
+ DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+ if (nl > 288 || nd > 32)
+#else
+ if (nl > 286 || nd > 30)
+#endif
+ return 1; /* bad lengths */
+ /* read in bit-length-code lengths */
+ for (j = 0; j < nb; j++) {
+ NEEDBITS(glbl, 3)
+ ll[border[j]] = (unsigned) b & 7;
+ DUMPBITS(3)
+ }
+ for (; j < 19; j++)
+ ll[border[j]] = 0;
+
+ /* build decoding table for trees--single level, 7 bit lookup */
+ bl = 7;
+ if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) {
+ if (i == 1)
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+ }
+ /* read in literal and distance code lengths */
+ n = nl + nd;
+ m = mask[bl];
+ i = l = 0;
+ while ((unsigned) i < n) {
+ NEEDBITS(glbl, (unsigned) bl)
+ j = (td = tl + ((unsigned) b & m))->b;
+ DUMPBITS(j)
+ j = td->v.n;
+ if (j < 16) /* length of code in bits (0..15) */
+ ll[i++] = l = j; /* save last length in l */
+ else if (j == 16) { /* repeat last length 3 to 6 times */
+ NEEDBITS(glbl, 2)
+ j = 3 + ((unsigned) b & 3);
+ DUMPBITS(2)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = l;
+ } else if (j == 17) { /* 3 to 10 zero length codes */
+ NEEDBITS(glbl, 3)
+ j = 3 + ((unsigned) b & 7);
+ DUMPBITS(3)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ } else { /* j == 18: 11 to 138 zero length codes */
+ NEEDBITS(glbl, 7)
+ j = 11 + ((unsigned) b & 0x7f);
+ DUMPBITS(7)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ }
+ }
+
+ /* free decoding table for trees */
+ huft_free(glbl, tl);
+
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* build the decoding tables for literal/length and distance codes */
+ bl = lbits;
+ i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete l-tree) ");
+ huft_free(glbl, tl);
+ }
+ return i; /* incomplete code set */
+ }
+ bd = dbits;
+ i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete d-tree) ");
+#ifdef PKZIP_BUG_WORKAROUND
+ i = 0;
+ }
+#else
+ huft_free(glbl, td);
+ }
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+#endif
+ }
+ /* decompress until an end-of-block code */
+ if (inflate_codes(glbl, tl, td, bl, bd))
+ return 1;
+
+ /* free the decoding tables, return */
+ huft_free(glbl, tl);
+ huft_free(glbl, td);
+ return 0;
+}
+
+/* decompress an inflated block */
+static int
+inflate_block(glbl, e)
+ struct inflate *glbl;
+ int *e; /* last block flag */
+{
+ unsigned t; /* block type */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in last block bit */
+ NEEDBITS(glbl, 1)
+ * e = (int) b & 1;
+ DUMPBITS(1)
+ /* read in block type */
+ NEEDBITS(glbl, 2)
+ t = (unsigned) b & 3;
+ DUMPBITS(2)
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* inflate that block type */
+ if (t == 2)
+ return inflate_dynamic(glbl);
+ if (t == 0)
+ return inflate_stored(glbl);
+ if (t == 1)
+ return inflate_fixed(glbl);
+ /* bad block type */
+ return 2;
+}
+
+
+
+/* decompress an inflated entry */
+static int
+xinflate(glbl)
+ struct inflate *glbl;
+{
+ int e; /* last block flag */
+ int r; /* result code */
+ unsigned h; /* maximum struct huft's malloc'ed */
+
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+
+ /* initialize window, bit buffer */
+ glbl->gz_wp = 0;
+ glbl->gz_bk = 0;
+ glbl->gz_bb = 0;
+
+ /* decompress until the last block */
+ h = 0;
+ do {
+ glbl->gz_hufts = 0;
+ if ((r = inflate_block(glbl, &e)) != 0)
+ return r;
+ if (glbl->gz_hufts > h)
+ h = glbl->gz_hufts;
+ } while (!e);
+
+ /* flush out slide */
+ FLUSH(glbl, glbl->gz_wp);
+
+ /* return success */
+ return 0;
+}
+
+/* Nobody uses this - why not? */
+int
+inflate(glbl)
+ struct inflate *glbl;
+{
+ int i;
+#ifdef _KERNEL
+ u_char *p = NULL;
+
+ if (!glbl->gz_slide)
+ p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK);
+#endif
+ if (!glbl->gz_slide)
+#ifdef _KERNEL
+ return(ENOMEM);
+#else
+ return 3; /* kzip expects 3 */
+#endif
+ i = xinflate(glbl);
+
+ if (glbl->gz_fixed_td != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_td);
+ glbl->gz_fixed_td = (struct huft *) NULL;
+ }
+ if (glbl->gz_fixed_tl != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ }
+#ifdef _KERNEL
+ if (p == glbl->gz_slide) {
+ free(glbl->gz_slide, M_GZIP);
+ glbl->gz_slide = NULL;
+ }
+#endif
+ return i;
+}
+/* ----------------------- END INFLATE.C */
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
new file mode 100644
index 0000000..d5c5656
--- /dev/null
+++ b/sys/kern/init_main.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 1995 Terrence R. Lambert
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_init_path.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+
+#include <machine/cpu.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <sys/copyright.h>
+
+void mi_startup(void); /* Should be elsewhere */
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+struct thread thread0;
+static struct procsig procsig0;
+static struct filedesc0 filedesc0;
+static struct plimit limit0;
+static struct vmspace vmspace0;
+struct proc *initproc;
+
+int cmask = CMASK;
+extern int fallback_elf_brand;
+
+struct vnode *rootvp;
+int boothowto = 0; /* initialized so that it can be patched */
+SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "");
+int bootverbose;
+SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "");
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined. An entry with a subsystem ID of SI_SUB_DUMMY
+ * is never executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL)
+
+/*
+ * The sysinit table itself. Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit.
+ */
+SET_DECLARE(sysinit_set, struct sysinit);
+struct sysinit **sysinit, **sysinit_end;
+struct sysinit **newsysinit, **newsysinit_end;
+
+/*
+ * Merge a new sysinit set into the current set, reallocating it if
+ * necessary. This can only be called after malloc is running.
+ */
+void
+sysinit_add(struct sysinit **set, struct sysinit **set_end)
+{
+ struct sysinit **newset;
+ struct sysinit **sipp;
+ struct sysinit **xipp;
+ int count;
+
+ count = set_end - set;
+ if (newsysinit)
+ count += newsysinit_end - newsysinit;
+ else
+ count += sysinit_end - sysinit;
+ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
+ if (newset == NULL)
+ panic("cannot malloc for sysinit");
+ xipp = newset;
+ if (newsysinit)
+ for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
+ *xipp++ = *sipp;
+ else
+ for (sipp = sysinit; sipp < sysinit_end; sipp++)
+ *xipp++ = *sipp;
+ for (sipp = set; sipp < set_end; sipp++)
+ *xipp++ = *sipp;
+ if (newsysinit)
+ free(newsysinit, M_TEMP);
+ newsysinit = newset;
+ newsysinit_end = newset + count;
+}
+
+/*
+ * System startup; initialize the world, create process 0, mount root
+ * filesystem, and fork to create init and pagedaemon. Most of the
+ * hard work is done in the lower-level initialization routines including
+ * startup(), which does memory initialization and autoconfiguration.
+ *
+ * This allows simple addition of new kernel subsystems that require
+ * boot-time initialization. It also allows substitution of a subsystem
+ * (for instance, a scheduler, kernel profiler, or VM system) by an
+ * object module. Finally, it allows for optional "kernel threads".
+ */
+void
+mi_startup(void)
+{
+
+ register struct sysinit **sipp; /* system initialization*/
+ register struct sysinit **xipp; /* interior loop of sort*/
+ register struct sysinit *save; /* bubble*/
+
+ if (sysinit == NULL) {
+ sysinit = SET_BEGIN(sysinit_set);
+ sysinit_end = SET_LIMIT(sysinit_set);
+ }
+
+restart:
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+ for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
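+
+ /*
+ * For example, the announce and version SYSINITs below both use
+ * SI_SUB_COPYRIGHT; after sorting, their SI_ORDER_FIRST and
+ * SI_ORDER_SECOND keys ensure the copyright notice is printed
+ * before the version string.
+ */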
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ *
+ * The last item on the list is expected to be the scheduler,
+ * which will not return.
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ if ((*sipp)->subsystem == SI_SUB_DONE)
+ continue;
+
+ /* Call function */
+ (*((*sipp)->func))((*sipp)->udata);
+
+ /* Check off the one we have just done */
+ (*sipp)->subsystem = SI_SUB_DONE;
+
+ /* Check if we've installed more sysinit items via KLD */
+ if (newsysinit != NULL) {
+ if (sysinit != SET_BEGIN(sysinit_set))
+ free(sysinit, M_TEMP);
+ sysinit = newsysinit;
+ sysinit_end = newsysinit_end;
+ newsysinit = NULL;
+ newsysinit_end = NULL;
+ goto restart;
+ }
+ }
+
+ panic("Shouldn't get here!");
+ /* NOTREACHED*/
+}
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's belong elsewhere, but have not yet
+ **** been moved.
+ ****
+ ***************************************************************************
+ */
+static void
+print_caddr_t(void *data __unused)
+{
+ printf("%s", (char *)data);
+}
+SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
+SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
+
+static void
+set_boot_verbose(void *data __unused)
+{
+
+ if (boothowto & RB_VERBOSE)
+ bootverbose++;
+}
+SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL)
+
+/*
+ ***************************************************************************
+ ****
+ **** The following two SYSINIT's are proc0-specific glue code. I am not
+ **** convinced that they cannot be safely combined, but their order of
+ **** operation has been kept the same as in the original init_main.c
+ **** for now.
+ ****
+ **** These probably belong in init_proc.c or kern_proc.c, since they
+ **** deal with proc0 (the fork template process).
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void
+proc0_init(void *dummy __unused)
+{
+ register struct proc *p;
+ register struct filedesc0 *fdp;
+ register unsigned i;
+ struct thread *td;
+ struct ksegrp *kg;
+ struct kse *ke;
+
+ GIANT_REQUIRED;
+ p = &proc0;
+ td = &thread0;
+
+ /*
+ * Initialize magic number.
+ */
+ p->p_magic = P_MAGIC;
+
+ /*
+ * Initialize thread, process and pgrp structures.
+ */
+ procinit();
+
+ /*
+ * Initialize sleep queue hash table
+ */
+ sleepinit();
+
+ /*
+ * additional VM structures
+ */
+ vm_init2();
+
+ /*
+ * Create process 0 (the swapper).
+ */
+ LIST_INSERT_HEAD(&allproc, p, p_list);
+ LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
+ mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+ p->p_pgrp = &pgrp0;
+ LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
+ LIST_INIT(&pgrp0.pg_members);
+ LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
+
+ pgrp0.pg_session = &session0;
+ mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
+ session0.s_count = 1;
+ session0.s_leader = p;
+
+#ifdef __ELF__
+ p->p_sysent = &elf_freebsd_sysvec;
+#else
+ p->p_sysent = &aout_sysvec;
+#endif
+
+ ke = &proc0.p_kse; /* XXXKSE */
+ kg = &proc0.p_ksegrp; /* XXXKSE */
+ p->p_flag = P_SYSTEM;
+ p->p_sflag = PS_INMEM;
+ p->p_stat = SRUN;
+ p->p_ksegrp.kg_nice = NZERO;
+ kg->kg_pri_class = PRI_TIMESHARE;
+ kg->kg_user_pri = PUSER;
+ td->td_priority = PVM;
+ td->td_base_pri = PUSER;
+
+ p->p_peers = 0;
+ p->p_leader = p;
+
+ bcopy("swapper", p->p_comm, sizeof ("swapper"));
+
+ callout_init(&p->p_itcallout, 0);
+ callout_init(&td->td_slpcallout, 1);
+
+ /* Create credentials. */
+ p->p_ucred = crget();
+ p->p_ucred->cr_ngroups = 1; /* group 0 */
+ p->p_ucred->cr_uidinfo = uifind(0);
+ p->p_ucred->cr_ruidinfo = uifind(0);
+ p->p_ucred->cr_prison = NULL; /* Don't jail it. */
+ td->td_ucred = crhold(p->p_ucred);
+
+ /* Create procsig. */
+ p->p_procsig = &procsig0;
+ p->p_procsig->ps_refcnt = 1;
+
+ /* Initialize signal state for process 0. */
+ siginit(&proc0);
+
+ /* Create the file descriptor table. */
+ fdp = &filedesc0;
+ p->p_fd = &fdp->fd_fd;
+ mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
+ fdp->fd_fd.fd_refcnt = 1;
+ fdp->fd_fd.fd_cmask = cmask;
+ fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
+ fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
+ fdp->fd_fd.fd_nfiles = NDFILE;
+
+ /* Create the limits structures. */
+ p->p_limit = &limit0;
+ for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++)
+ limit0.pl_rlimit[i].rlim_cur =
+ limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY;
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
+ i = ptoa(cnt.v_free_count);
+ limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i;
+ limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
+ limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
+ limit0.p_cpulimit = RLIM_INFINITY;
+ limit0.p_refcnt = 1;
+
+ /* Allocate a prototype map so we have something to fork. */
+ pmap_pinit0(vmspace_pmap(&vmspace0));
+ p->p_vmspace = &vmspace0;
+ vmspace0.vm_refcnt = 1;
+ vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS),
+ trunc_page(VM_MAXUSER_ADDRESS));
+ vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
+
+ /*
+ * We continue to place resource usage info and signal
+ * actions in the user struct so they're pageable.
+ */
+ p->p_stats = &p->p_uarea->u_stats;
+ p->p_sigacts = &p->p_uarea->u_sigacts;
+
+ /*
+ * Charge root for one process.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
+}
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
+
+/* ARGSUSED*/
+static void
+proc0_post(void *dummy __unused)
+{
+ struct timespec ts;
+ struct proc *p;
+
+ /*
+ * Now we can look at the time, having had a chance to verify the
+ * time from the filesystem. Pretend that proc0 started now.
+ */
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ microtime(&p->p_stats->p_start);
+ p->p_runtime.sec = 0;
+ p->p_runtime.frac = 0;
+ }
+ sx_sunlock(&allproc_lock);
+ binuptime(PCPU_PTR(switchtime));
+ PCPU_SET(switchticks, ticks);
+
+ /*
+ * Give the ``random'' number generator a thump.
+ */
+ nanotime(&ts);
+ srandom(ts.tv_sec ^ ts.tv_nsec);
+}
+SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's and glue code should be moved to the
+ **** respective files on a per subsystem basis.
+ ****
+ ***************************************************************************
+ */
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following code probably belongs in another file, like
+ **** kern/init_init.c.
+ ****
+ ***************************************************************************
+ */
+
+/*
+ * List of paths to try when searching for "init".
+ */
+static char init_path[MAXPATHLEN] =
+#ifdef INIT_PATH
+ __XSTRING(INIT_PATH);
+#else
+ "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall";
+#endif
+SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
+ "Path used to search for the init process");
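+
+/*
+ * The compiled-in list above can be overridden at boot time through the
+ * kernel environment: for instance, setting init_path="/sbin/init.bak"
+ * at the loader (the value shown is only an example) is picked up by the
+ * getenv("init_path") check in start_init() below.
+ */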
+
+/*
+ * Start the initial user process; try exec'ing each pathname in init_path.
+ * The program is invoked with one argument containing the boot flags.
+ */
+static void
+start_init(void *dummy)
+{
+ vm_offset_t addr;
+ struct execve_args args;
+ int options, error;
+ char *var, *path, *next, *s;
+ char *ucp, **uap, *arg0, *arg1;
+ struct thread *td;
+ struct proc *p;
+ int init_does_devfs = 0;
+
+ mtx_lock(&Giant);
+
+ GIANT_REQUIRED;
+
+ td = curthread;
+ p = td->td_proc;
+
+ vfs_mountroot(NULL);
+
+ /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. */
+ if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode))
+ panic("cannot find root vnode");
+ FILEDESC_LOCK(p->p_fd);
+ p->p_fd->fd_cdir = rootvnode;
+ VREF(p->p_fd->fd_cdir);
+ p->p_fd->fd_rdir = rootvnode;
+ VREF(p->p_fd->fd_rdir);
+ FILEDESC_UNLOCK(p->p_fd);
+ VOP_UNLOCK(rootvnode, 0, td);
+
+ if (devfs_present) {
+ /*
+ * For disk-based systems, we probably cannot do this yet
+ * since the fs will be read-only. But an NFS root
+ * might be ok. It is worth a shot.
+ */
+ error = vn_mkdir("/dev", 0700, UIO_SYSSPACE, td);
+ if (error == EEXIST)
+ error = 0;
+ if (error == 0)
+ error = kernel_vmount(0, "fstype", "devfs",
+ "fspath", "/dev", NULL);
+ if (error != 0)
+ init_does_devfs = 1;
+ }
+
+ /*
+ * Need just enough stack to hold the faked-up "execve()" arguments.
+ */
+ addr = trunc_page(USRSTACK - PAGE_SIZE);
+ if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
+ panic("init: couldn't allocate argument space");
+ p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
+ p->p_vmspace->vm_ssize = 1;
+
+ if ((var = getenv("init_path")) != NULL) {
+ strncpy(init_path, var, sizeof init_path);
+ init_path[sizeof init_path - 1] = 0;
+ freeenv(var);
+ }
+ if ((var = getenv("kern.fallback_elf_brand")) != NULL) {
+ fallback_elf_brand = strtol(var, NULL, 0);
+ freeenv(var);
+ }
+
+ for (path = init_path; *path != '\0'; path = next) {
+ while (*path == ':')
+ path++;
+ if (*path == '\0')
+ break;
+ for (next = path; *next != '\0' && *next != ':'; next++)
+ /* nothing */ ;
+ if (bootverbose)
+ printf("start_init: trying %.*s\n", (int)(next - path),
+ path);
+
+ /*
+ * Move out the boot flag argument.
+ */
+ options = 0;
+ ucp = (char *)USRSTACK;
+ (void)subyte(--ucp, 0); /* trailing zero */
+ if (boothowto & RB_SINGLE) {
+ (void)subyte(--ucp, 's');
+ options = 1;
+ }
+#ifdef notyet
+ if (boothowto & RB_FASTBOOT) {
+ (void)subyte(--ucp, 'f');
+ options = 1;
+ }
+#endif
+
+#ifdef BOOTCDROM
+ (void)subyte(--ucp, 'C');
+ options = 1;
+#endif
+ if (init_does_devfs) {
+ (void)subyte(--ucp, 'd');
+ options = 1;
+ }
+
+ if (options == 0)
+ (void)subyte(--ucp, '-');
+ (void)subyte(--ucp, '-'); /* leading hyphen */
+ arg1 = ucp;
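+ /*
+ * For example, when booting single-user (RB_SINGLE), the option
+ * string assembled backwards above reads "-s"; with no flags set
+ * at all it is simply "--".
+ */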
+
+ /*
+ * Move out the file name (also arg 0).
+ */
+ (void)subyte(--ucp, 0);
+ for (s = next - 1; s >= path; s--)
+ (void)subyte(--ucp, *s);
+ arg0 = ucp;
+
+ /*
+ * Move out the arg pointers.
+ */
+ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
+ (void)suword((caddr_t)--uap, (long)0); /* terminator */
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
+
+ /*
+ * Point at the arguments.
+ */
+ args.fname = arg0;
+ args.argv = uap;
+ args.envv = NULL;
+
+ /*
+ * Now try to exec the program. If we can't for any reason
+ * other than that it doesn't exist, complain.
+ *
+ * Otherwise, return via fork_trampoline() all the way
+ * to user mode as init!
+ */
+ if ((error = execve(td, &args)) == 0) {
+ mtx_unlock(&Giant);
+ return;
+ }
+ if (error != ENOENT)
+ printf("exec %.*s: error %d\n", (int)(next - path),
+ path, error);
+ }
+ printf("init: not found in path %s\n", init_path);
+ panic("no init");
+}
+
+/*
+ * Like kthread_create(), but runs in its own address space.
+ * We do this early to reserve pid 1.
+ *
+ * Note special case - do not make it runnable yet. Other work
+ * in progress will change this more.
+ */
+static void
+create_init(const void *udata __unused)
+{
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, &initproc);
+ if (error)
+ panic("cannot fork init: %d\n", error);
+ /* divorce init's credentials from the kernel's */
+ newcred = crget();
+ PROC_LOCK(initproc);
+ initproc->p_flag |= P_SYSTEM;
+ oldcred = initproc->p_ucred;
+ crcopy(newcred, oldcred);
+ initproc->p_ucred = newcred;
+ PROC_UNLOCK(initproc);
+ crfree(oldcred);
+ mtx_lock_spin(&sched_lock);
+ initproc->p_sflag |= PS_INMEM;
+ mtx_unlock_spin(&sched_lock);
+ cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
+}
+SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
+
+/*
+ * Make it runnable now.
+ */
+static void
+kick_init(const void *udata __unused)
+{
+ struct thread *td;
+
+ td = FIRST_THREAD_IN_PROC(initproc);
+ mtx_lock_spin(&sched_lock);
+ initproc->p_stat = SRUN;
+ setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+}
+SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
new file mode 100644
index 0000000..425e3b7
--- /dev/null
+++ b/sys/kern/init_sysent.c
@@ -0,0 +1,418 @@
+/*
+ * System call switch table.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+#define AS(name) (sizeof(struct name) / sizeof(register_t))
+
+#ifdef COMPAT_43
+#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
+#else
+#define compat(n, name) 0, (sy_call_t *)nosys
+#endif
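+
+/*
+ * For illustration: with COMPAT_43 defined, an entry written as
+ *	{ compat(AS(ocreat_args),creat) }
+ * expands to
+ *	{ AS(ocreat_args), (sy_call_t *)ocreat }
+ * whereas without COMPAT_43 the same slot becomes
+ *	{ 0, (sy_call_t *)nosys }
+ * so the old 4.3BSD entry points are simply absent from non-compat kernels.
+ */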
+
+/* The casts are bogus but will do for now. */
+struct sysent sysent[] = {
+ { 0, (sy_call_t *)nosys }, /* 0 = syscall */
+ { SYF_MPSAFE | AS(sys_exit_args), (sy_call_t *)sys_exit }, /* 1 = exit */
+ { SYF_MPSAFE | 0, (sy_call_t *)fork }, /* 2 = fork */
+ { SYF_MPSAFE | AS(read_args), (sy_call_t *)read }, /* 3 = read */
+ { SYF_MPSAFE | AS(write_args), (sy_call_t *)write }, /* 4 = write */
+ { AS(open_args), (sy_call_t *)open }, /* 5 = open */
+ { SYF_MPSAFE | AS(close_args), (sy_call_t *)close }, /* 6 = close */
+ { SYF_MPSAFE | AS(wait_args), (sy_call_t *)wait4 }, /* 7 = wait4 */
+ { compat(AS(ocreat_args),creat) }, /* 8 = old creat */
+ { AS(link_args), (sy_call_t *)link }, /* 9 = link */
+ { AS(unlink_args), (sy_call_t *)unlink }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */
+ { AS(chdir_args), (sy_call_t *)chdir }, /* 12 = chdir */
+ { AS(fchdir_args), (sy_call_t *)fchdir }, /* 13 = fchdir */
+ { AS(mknod_args), (sy_call_t *)mknod }, /* 14 = mknod */
+ { AS(chmod_args), (sy_call_t *)chmod }, /* 15 = chmod */
+ { AS(chown_args), (sy_call_t *)chown }, /* 16 = chown */
+ { SYF_MPSAFE | AS(obreak_args), (sy_call_t *)obreak }, /* 17 = break */
+ { AS(getfsstat_args), (sy_call_t *)getfsstat }, /* 18 = getfsstat */
+ { compat(AS(olseek_args),lseek) }, /* 19 = old lseek */
+ { SYF_MPSAFE | 0, (sy_call_t *)getpid }, /* 20 = getpid */
+ { AS(mount_args), (sy_call_t *)mount }, /* 21 = mount */
+ { AS(unmount_args), (sy_call_t *)unmount }, /* 22 = unmount */
+ { SYF_MPSAFE | AS(setuid_args), (sy_call_t *)setuid }, /* 23 = setuid */
+ { SYF_MPSAFE | 0, (sy_call_t *)getuid }, /* 24 = getuid */
+ { SYF_MPSAFE | 0, (sy_call_t *)geteuid }, /* 25 = geteuid */
+ { AS(ptrace_args), (sy_call_t *)ptrace }, /* 26 = ptrace */
+ { SYF_MPSAFE | AS(recvmsg_args), (sy_call_t *)recvmsg }, /* 27 = recvmsg */
+ { SYF_MPSAFE | AS(sendmsg_args), (sy_call_t *)sendmsg }, /* 28 = sendmsg */
+ { SYF_MPSAFE | AS(recvfrom_args), (sy_call_t *)recvfrom }, /* 29 = recvfrom */
+ { SYF_MPSAFE | AS(accept_args), (sy_call_t *)accept }, /* 30 = accept */
+ { SYF_MPSAFE | AS(getpeername_args), (sy_call_t *)getpeername }, /* 31 = getpeername */
+ { SYF_MPSAFE | AS(getsockname_args), (sy_call_t *)getsockname }, /* 32 = getsockname */
+ { AS(access_args), (sy_call_t *)access }, /* 33 = access */
+ { AS(chflags_args), (sy_call_t *)chflags }, /* 34 = chflags */
+ { AS(fchflags_args), (sy_call_t *)fchflags }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sync }, /* 36 = sync */
+ { SYF_MPSAFE | AS(kill_args), (sy_call_t *)kill }, /* 37 = kill */
+ { compat(AS(ostat_args),stat) }, /* 38 = old stat */
+ { SYF_MPSAFE | 0, (sy_call_t *)getppid }, /* 39 = getppid */
+ { compat(AS(olstat_args),lstat) }, /* 40 = old lstat */
+ { AS(dup_args), (sy_call_t *)dup }, /* 41 = dup */
+ { 0, (sy_call_t *)pipe }, /* 42 = pipe */
+ { SYF_MPSAFE | 0, (sy_call_t *)getegid }, /* 43 = getegid */
+ { SYF_MPSAFE | AS(profil_args), (sy_call_t *)profil }, /* 44 = profil */
+ { AS(ktrace_args), (sy_call_t *)ktrace }, /* 45 = ktrace */
+ { compat(SYF_MPSAFE | AS(osigaction_args),sigaction) }, /* 46 = old sigaction */
+ { SYF_MPSAFE | 0, (sy_call_t *)getgid }, /* 47 = getgid */
+ { compat(SYF_MPSAFE | AS(osigprocmask_args),sigprocmask) }, /* 48 = old sigprocmask */
+ { SYF_MPSAFE | AS(getlogin_args), (sy_call_t *)getlogin }, /* 49 = getlogin */
+ { SYF_MPSAFE | AS(setlogin_args), (sy_call_t *)setlogin }, /* 50 = setlogin */
+ { SYF_MPSAFE | AS(acct_args), (sy_call_t *)acct }, /* 51 = acct */
+ { compat(SYF_MPSAFE | 0,sigpending) }, /* 52 = old sigpending */
+ { SYF_MPSAFE | AS(sigaltstack_args), (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */
+ { SYF_MPSAFE | AS(ioctl_args), (sy_call_t *)ioctl }, /* 54 = ioctl */
+ { SYF_MPSAFE | AS(reboot_args), (sy_call_t *)reboot }, /* 55 = reboot */
+ { AS(revoke_args), (sy_call_t *)revoke }, /* 56 = revoke */
+ { AS(symlink_args), (sy_call_t *)symlink }, /* 57 = symlink */
+ { AS(readlink_args), (sy_call_t *)readlink }, /* 58 = readlink */
+ { SYF_MPSAFE | AS(execve_args), (sy_call_t *)execve }, /* 59 = execve */
+ { SYF_MPSAFE | AS(umask_args), (sy_call_t *)umask }, /* 60 = umask */
+ { AS(chroot_args), (sy_call_t *)chroot }, /* 61 = chroot */
+ { compat(SYF_MPSAFE | AS(ofstat_args),fstat) }, /* 62 = old fstat */
+ { compat(SYF_MPSAFE | AS(getkerninfo_args),getkerninfo) }, /* 63 = old getkerninfo */
+ { compat(SYF_MPSAFE | 0,getpagesize) }, /* 64 = old getpagesize */
+ { AS(msync_args), (sy_call_t *)msync }, /* 65 = msync */
+ { SYF_MPSAFE | 0, (sy_call_t *)vfork }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */
+ { SYF_MPSAFE | AS(sbrk_args), (sy_call_t *)sbrk }, /* 69 = sbrk */
+ { SYF_MPSAFE | AS(sstk_args), (sy_call_t *)sstk }, /* 70 = sstk */
+ { compat(SYF_MPSAFE | AS(ommap_args),mmap) }, /* 71 = old mmap */
+ { SYF_MPSAFE | AS(ovadvise_args), (sy_call_t *)ovadvise }, /* 72 = vadvise */
+ { SYF_MPSAFE | AS(munmap_args), (sy_call_t *)munmap }, /* 73 = munmap */
+ { SYF_MPSAFE | AS(mprotect_args), (sy_call_t *)mprotect }, /* 74 = mprotect */
+ { SYF_MPSAFE | AS(madvise_args), (sy_call_t *)madvise }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */
+ { SYF_MPSAFE | AS(mincore_args), (sy_call_t *)mincore }, /* 78 = mincore */
+ { SYF_MPSAFE | AS(getgroups_args), (sy_call_t *)getgroups }, /* 79 = getgroups */
+ { SYF_MPSAFE | AS(setgroups_args), (sy_call_t *)setgroups }, /* 80 = setgroups */
+ { SYF_MPSAFE | 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */
+ { SYF_MPSAFE | AS(setpgid_args), (sy_call_t *)setpgid }, /* 82 = setpgid */
+ { SYF_MPSAFE | AS(setitimer_args), (sy_call_t *)setitimer }, /* 83 = setitimer */
+ { compat(SYF_MPSAFE | 0,wait) }, /* 84 = old wait */
+ { SYF_MPSAFE | AS(swapon_args), (sy_call_t *)swapon }, /* 85 = swapon */
+ { SYF_MPSAFE | AS(getitimer_args), (sy_call_t *)getitimer }, /* 86 = getitimer */
+ { compat(SYF_MPSAFE | AS(gethostname_args),gethostname) }, /* 87 = old gethostname */
+ { compat(SYF_MPSAFE | AS(sethostname_args),sethostname) }, /* 88 = old sethostname */
+ { SYF_MPSAFE | 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */
+ { SYF_MPSAFE | AS(dup2_args), (sy_call_t *)dup2 }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys }, /* 91 = getdopt */
+ { SYF_MPSAFE | AS(fcntl_args), (sy_call_t *)fcntl }, /* 92 = fcntl */
+ { SYF_MPSAFE | AS(select_args), (sy_call_t *)select }, /* 93 = select */
+ { 0, (sy_call_t *)nosys }, /* 94 = setdopt */
+ { AS(fsync_args), (sy_call_t *)fsync }, /* 95 = fsync */
+ { SYF_MPSAFE | AS(setpriority_args), (sy_call_t *)setpriority }, /* 96 = setpriority */
+ { SYF_MPSAFE | AS(socket_args), (sy_call_t *)socket }, /* 97 = socket */
+ { SYF_MPSAFE | AS(connect_args), (sy_call_t *)connect }, /* 98 = connect */
+ { compat(SYF_MPSAFE | AS(accept_args),accept) }, /* 99 = old accept */
+ { SYF_MPSAFE | AS(getpriority_args), (sy_call_t *)getpriority }, /* 100 = getpriority */
+ { compat(SYF_MPSAFE | AS(osend_args),send) }, /* 101 = old send */
+ { compat(SYF_MPSAFE | AS(orecv_args),recv) }, /* 102 = old recv */
+ { SYF_MPSAFE | AS(osigreturn_args), (sy_call_t *)osigreturn }, /* 103 = osigreturn */
+ { SYF_MPSAFE | AS(bind_args), (sy_call_t *)bind }, /* 104 = bind */
+ { SYF_MPSAFE | AS(setsockopt_args), (sy_call_t *)setsockopt }, /* 105 = setsockopt */
+ { SYF_MPSAFE | AS(listen_args), (sy_call_t *)listen }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */
+ { compat(SYF_MPSAFE | AS(osigvec_args),sigvec) }, /* 108 = old sigvec */
+ { compat(SYF_MPSAFE | AS(osigblock_args),sigblock) }, /* 109 = old sigblock */
+ { compat(SYF_MPSAFE | AS(osigsetmask_args),sigsetmask) }, /* 110 = old sigsetmask */
+ { compat(SYF_MPSAFE | AS(osigsuspend_args),sigsuspend) }, /* 111 = old sigsuspend */
+ { compat(SYF_MPSAFE | AS(osigstack_args),sigstack) }, /* 112 = old sigstack */
+ { compat(SYF_MPSAFE | AS(orecvmsg_args),recvmsg) }, /* 113 = old recvmsg */
+ { compat(SYF_MPSAFE | AS(osendmsg_args),sendmsg) }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */
+ { SYF_MPSAFE | AS(gettimeofday_args), (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */
+ { SYF_MPSAFE | AS(getrusage_args), (sy_call_t *)getrusage }, /* 117 = getrusage */
+ { SYF_MPSAFE | AS(getsockopt_args), (sy_call_t *)getsockopt }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys }, /* 119 = resuba */
+ { SYF_MPSAFE | AS(readv_args), (sy_call_t *)readv }, /* 120 = readv */
+ { SYF_MPSAFE | AS(writev_args), (sy_call_t *)writev }, /* 121 = writev */
+ { SYF_MPSAFE | AS(settimeofday_args), (sy_call_t *)settimeofday }, /* 122 = settimeofday */
+ { AS(fchown_args), (sy_call_t *)fchown }, /* 123 = fchown */
+ { AS(fchmod_args), (sy_call_t *)fchmod }, /* 124 = fchmod */
+ { compat(SYF_MPSAFE | AS(recvfrom_args),recvfrom) }, /* 125 = old recvfrom */
+ { SYF_MPSAFE | AS(setreuid_args), (sy_call_t *)setreuid }, /* 126 = setreuid */
+ { SYF_MPSAFE | AS(setregid_args), (sy_call_t *)setregid }, /* 127 = setregid */
+ { AS(rename_args), (sy_call_t *)rename }, /* 128 = rename */
+ { compat(AS(otruncate_args),truncate) }, /* 129 = old truncate */
+ { compat(AS(oftruncate_args),ftruncate) }, /* 130 = old ftruncate */
+ { SYF_MPSAFE | AS(flock_args), (sy_call_t *)flock }, /* 131 = flock */
+ { AS(mkfifo_args), (sy_call_t *)mkfifo }, /* 132 = mkfifo */
+ { SYF_MPSAFE | AS(sendto_args), (sy_call_t *)sendto }, /* 133 = sendto */
+ { SYF_MPSAFE | AS(shutdown_args), (sy_call_t *)shutdown }, /* 134 = shutdown */
+ { SYF_MPSAFE | AS(socketpair_args), (sy_call_t *)socketpair }, /* 135 = socketpair */
+ { AS(mkdir_args), (sy_call_t *)mkdir }, /* 136 = mkdir */
+ { AS(rmdir_args), (sy_call_t *)rmdir }, /* 137 = rmdir */
+ { AS(utimes_args), (sy_call_t *)utimes }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */
+ { SYF_MPSAFE | AS(adjtime_args), (sy_call_t *)adjtime }, /* 140 = adjtime */
+ { compat(SYF_MPSAFE | AS(ogetpeername_args),getpeername) }, /* 141 = old getpeername */
+ { compat(SYF_MPSAFE | 0,gethostid) }, /* 142 = old gethostid */
+ { compat(SYF_MPSAFE | AS(osethostid_args),sethostid) }, /* 143 = old sethostid */
+ { compat(SYF_MPSAFE | AS(ogetrlimit_args),getrlimit) }, /* 144 = old getrlimit */
+ { compat(SYF_MPSAFE | AS(osetrlimit_args),setrlimit) }, /* 145 = old setrlimit */
+ { compat(SYF_MPSAFE | AS(okillpg_args),killpg) }, /* 146 = old killpg */
+ { SYF_MPSAFE | 0, (sy_call_t *)setsid }, /* 147 = setsid */
+ { AS(quotactl_args), (sy_call_t *)quotactl }, /* 148 = quotactl */
+ { compat(SYF_MPSAFE | 0,quota) }, /* 149 = old quota */
+ { compat(SYF_MPSAFE | AS(getsockname_args),getsockname) }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */
+ { 0, (sy_call_t *)nosys }, /* 154 = nosys */
+ { SYF_MPSAFE | AS(nfssvc_args), (sy_call_t *)nosys }, /* 155 = nfssvc */
+ { compat(AS(ogetdirentries_args),getdirentries) }, /* 156 = old getdirentries */
+ { AS(statfs_args), (sy_call_t *)statfs }, /* 157 = statfs */
+ { AS(fstatfs_args), (sy_call_t *)fstatfs }, /* 158 = fstatfs */
+ { 0, (sy_call_t *)nosys }, /* 159 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 160 = nosys */
+ { AS(getfh_args), (sy_call_t *)getfh }, /* 161 = getfh */
+ { SYF_MPSAFE | AS(getdomainname_args), (sy_call_t *)getdomainname }, /* 162 = getdomainname */
+ { SYF_MPSAFE | AS(setdomainname_args), (sy_call_t *)setdomainname }, /* 163 = setdomainname */
+ { SYF_MPSAFE | AS(uname_args), (sy_call_t *)uname }, /* 164 = uname */
+ { AS(sysarch_args), (sy_call_t *)sysarch }, /* 165 = sysarch */
+ { SYF_MPSAFE | AS(rtprio_args), (sy_call_t *)rtprio }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 168 = nosys */
+ { SYF_MPSAFE | AS(semsys_args), (sy_call_t *)lkmressys }, /* 169 = semsys */
+ { SYF_MPSAFE | AS(msgsys_args), (sy_call_t *)lkmressys }, /* 170 = msgsys */
+ { SYF_MPSAFE | AS(shmsys_args), (sy_call_t *)lkmressys }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys }, /* 172 = nosys */
+ { SYF_MPSAFE | AS(pread_args), (sy_call_t *)pread }, /* 173 = pread */
+ { SYF_MPSAFE | AS(pwrite_args), (sy_call_t *)pwrite }, /* 174 = pwrite */
+ { 0, (sy_call_t *)nosys }, /* 175 = nosys */
+ { SYF_MPSAFE | AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 180 = nosys */
+ { SYF_MPSAFE | AS(setgid_args), (sy_call_t *)setgid }, /* 181 = setgid */
+ { SYF_MPSAFE | AS(setegid_args), (sy_call_t *)setegid }, /* 182 = setegid */
+ { SYF_MPSAFE | AS(seteuid_args), (sy_call_t *)seteuid }, /* 183 = seteuid */
+ { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */
+ { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */
+ { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */
+ { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */
+ { AS(stat_args), (sy_call_t *)stat }, /* 188 = stat */
+ { SYF_MPSAFE | AS(fstat_args), (sy_call_t *)fstat }, /* 189 = fstat */
+ { AS(lstat_args), (sy_call_t *)lstat }, /* 190 = lstat */
+ { AS(pathconf_args), (sy_call_t *)pathconf }, /* 191 = pathconf */
+ { SYF_MPSAFE | AS(fpathconf_args), (sy_call_t *)fpathconf }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys }, /* 193 = nosys */
+ { SYF_MPSAFE | AS(__getrlimit_args), (sy_call_t *)getrlimit }, /* 194 = getrlimit */
+ { SYF_MPSAFE | AS(__setrlimit_args), (sy_call_t *)setrlimit }, /* 195 = setrlimit */
+ { AS(getdirentries_args), (sy_call_t *)getdirentries }, /* 196 = getdirentries */
+ { SYF_MPSAFE | AS(mmap_args), (sy_call_t *)mmap }, /* 197 = mmap */
+ { 0, (sy_call_t *)nosys }, /* 198 = __syscall */
+ { AS(lseek_args), (sy_call_t *)lseek }, /* 199 = lseek */
+ { AS(truncate_args), (sy_call_t *)truncate }, /* 200 = truncate */
+ { AS(ftruncate_args), (sy_call_t *)ftruncate }, /* 201 = ftruncate */
+ { SYF_MPSAFE | AS(sysctl_args), (sy_call_t *)__sysctl }, /* 202 = __sysctl */
+ { SYF_MPSAFE | AS(mlock_args), (sy_call_t *)mlock }, /* 203 = mlock */
+ { SYF_MPSAFE | AS(munlock_args), (sy_call_t *)munlock }, /* 204 = munlock */
+ { AS(undelete_args), (sy_call_t *)undelete }, /* 205 = undelete */
+ { AS(futimes_args), (sy_call_t *)futimes }, /* 206 = futimes */
+ { SYF_MPSAFE | AS(getpgid_args), (sy_call_t *)getpgid }, /* 207 = getpgid */
+ { 0, (sy_call_t *)nosys }, /* 208 = newreboot */
+ { SYF_MPSAFE | AS(poll_args), (sy_call_t *)poll }, /* 209 = poll */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */
+ { SYF_MPSAFE | AS(__semctl_args), (sy_call_t *)lkmressys }, /* 220 = __semctl */
+ { SYF_MPSAFE | AS(semget_args), (sy_call_t *)lkmressys }, /* 221 = semget */
+ { SYF_MPSAFE | AS(semop_args), (sy_call_t *)lkmressys }, /* 222 = semop */
+ { 0, (sy_call_t *)nosys }, /* 223 = semconfig */
+ { SYF_MPSAFE | AS(msgctl_args), (sy_call_t *)lkmressys }, /* 224 = msgctl */
+ { SYF_MPSAFE | AS(msgget_args), (sy_call_t *)lkmressys }, /* 225 = msgget */
+ { SYF_MPSAFE | AS(msgsnd_args), (sy_call_t *)lkmressys }, /* 226 = msgsnd */
+ { SYF_MPSAFE | AS(msgrcv_args), (sy_call_t *)lkmressys }, /* 227 = msgrcv */
+ { SYF_MPSAFE | AS(shmat_args), (sy_call_t *)lkmressys }, /* 228 = shmat */
+ { SYF_MPSAFE | AS(shmctl_args), (sy_call_t *)lkmressys }, /* 229 = shmctl */
+ { SYF_MPSAFE | AS(shmdt_args), (sy_call_t *)lkmressys }, /* 230 = shmdt */
+ { SYF_MPSAFE | AS(shmget_args), (sy_call_t *)lkmressys }, /* 231 = shmget */
+ { SYF_MPSAFE | AS(clock_gettime_args), (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */
+ { SYF_MPSAFE | AS(clock_settime_args), (sy_call_t *)clock_settime }, /* 233 = clock_settime */
+ { SYF_MPSAFE | AS(clock_getres_args), (sy_call_t *)clock_getres }, /* 234 = clock_getres */
+ { 0, (sy_call_t *)nosys }, /* 235 = timer_create */
+ { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */
+ { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */
+ { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */
+ { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */
+ { SYF_MPSAFE | AS(nanosleep_args), (sy_call_t *)nanosleep }, /* 240 = nanosleep */
+ { 0, (sy_call_t *)nosys }, /* 241 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 242 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 243 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 246 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 247 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 248 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 249 = nosys */
+ { SYF_MPSAFE | AS(minherit_args), (sy_call_t *)minherit }, /* 250 = minherit */
+ { SYF_MPSAFE | AS(rfork_args), (sy_call_t *)rfork }, /* 251 = rfork */
+ { SYF_MPSAFE | AS(openbsd_poll_args), (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */
+ { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */
+ { AS(lchown_args), (sy_call_t *)lchown }, /* 254 = lchown */
+ { 0, (sy_call_t *)nosys }, /* 255 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 256 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 257 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 258 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 259 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 260 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 261 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 262 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 263 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 264 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 265 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 266 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 267 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 268 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 269 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 270 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 271 = nosys */
+ { AS(getdents_args), (sy_call_t *)getdents }, /* 272 = getdents */
+ { 0, (sy_call_t *)nosys }, /* 273 = nosys */
+ { AS(lchmod_args), (sy_call_t *)lchmod }, /* 274 = lchmod */
+ { AS(lchown_args), (sy_call_t *)lchown }, /* 275 = netbsd_lchown */
+ { AS(lutimes_args), (sy_call_t *)lutimes }, /* 276 = lutimes */
+ { SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync }, /* 277 = netbsd_msync */
+ { AS(nstat_args), (sy_call_t *)nstat }, /* 278 = nstat */
+ { SYF_MPSAFE | AS(nfstat_args), (sy_call_t *)nfstat }, /* 279 = nfstat */
+ { AS(nlstat_args), (sy_call_t *)nlstat }, /* 280 = nlstat */
+ { 0, (sy_call_t *)nosys }, /* 281 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 282 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 283 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 284 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 285 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 286 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 287 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 288 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 289 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 290 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 291 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 292 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 293 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 294 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 295 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 296 = nosys */
+ { AS(fhstatfs_args), (sy_call_t *)fhstatfs }, /* 297 = fhstatfs */
+ { AS(fhopen_args), (sy_call_t *)fhopen }, /* 298 = fhopen */
+ { AS(fhstat_args), (sy_call_t *)fhstat }, /* 299 = fhstat */
+ { SYF_MPSAFE | AS(modnext_args), (sy_call_t *)modnext }, /* 300 = modnext */
+ { SYF_MPSAFE | AS(modstat_args), (sy_call_t *)modstat }, /* 301 = modstat */
+ { SYF_MPSAFE | AS(modfnext_args), (sy_call_t *)modfnext }, /* 302 = modfnext */
+ { SYF_MPSAFE | AS(modfind_args), (sy_call_t *)modfind }, /* 303 = modfind */
+ { SYF_MPSAFE | AS(kldload_args), (sy_call_t *)kldload }, /* 304 = kldload */
+ { SYF_MPSAFE | AS(kldunload_args), (sy_call_t *)kldunload }, /* 305 = kldunload */
+ { SYF_MPSAFE | AS(kldfind_args), (sy_call_t *)kldfind }, /* 306 = kldfind */
+ { SYF_MPSAFE | AS(kldnext_args), (sy_call_t *)kldnext }, /* 307 = kldnext */
+ { SYF_MPSAFE | AS(kldstat_args), (sy_call_t *)kldstat }, /* 308 = kldstat */
+ { SYF_MPSAFE | AS(kldfirstmod_args), (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */
+ { SYF_MPSAFE | AS(getsid_args), (sy_call_t *)getsid }, /* 310 = getsid */
+ { SYF_MPSAFE | AS(setresuid_args), (sy_call_t *)setresuid }, /* 311 = setresuid */
+ { SYF_MPSAFE | AS(setresgid_args), (sy_call_t *)setresgid }, /* 312 = setresgid */
+ { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */
+ { AS(aio_return_args), (sy_call_t *)lkmressys }, /* 314 = aio_return */
+ { AS(aio_suspend_args), (sy_call_t *)lkmressys }, /* 315 = aio_suspend */
+ { AS(aio_cancel_args), (sy_call_t *)lkmressys }, /* 316 = aio_cancel */
+ { AS(aio_error_args), (sy_call_t *)lkmressys }, /* 317 = aio_error */
+ { AS(aio_read_args), (sy_call_t *)lkmressys }, /* 318 = aio_read */
+ { AS(aio_write_args), (sy_call_t *)lkmressys }, /* 319 = aio_write */
+ { AS(lio_listio_args), (sy_call_t *)lkmressys }, /* 320 = lio_listio */
+ { SYF_MPSAFE | 0, (sy_call_t *)yield }, /* 321 = yield */
+ { 0, (sy_call_t *)nosys }, /* 322 = obsolete thr_sleep */
+ { 0, (sy_call_t *)nosys }, /* 323 = obsolete thr_wakeup */
+ { SYF_MPSAFE | AS(mlockall_args), (sy_call_t *)mlockall }, /* 324 = mlockall */
+ { SYF_MPSAFE | 0, (sy_call_t *)munlockall }, /* 325 = munlockall */
+ { AS(__getcwd_args), (sy_call_t *)__getcwd }, /* 326 = __getcwd */
+ { SYF_MPSAFE | AS(sched_setparam_args), (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */
+ { SYF_MPSAFE | AS(sched_getparam_args), (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */
+ { SYF_MPSAFE | AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */
+ { SYF_MPSAFE | AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */
+ { SYF_MPSAFE | 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */
+ { SYF_MPSAFE | AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */
+ { SYF_MPSAFE | AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */
+ { SYF_MPSAFE | AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */
+ { AS(utrace_args), (sy_call_t *)utrace }, /* 335 = utrace */
+ { SYF_MPSAFE | AS(sendfile_args), (sy_call_t *)sendfile }, /* 336 = sendfile */
+ { AS(kldsym_args), (sy_call_t *)kldsym }, /* 337 = kldsym */
+ { SYF_MPSAFE | AS(jail_args), (sy_call_t *)jail }, /* 338 = jail */
+ { 0, (sy_call_t *)nosys }, /* 339 = pioctl */
+ { SYF_MPSAFE | AS(sigprocmask_args), (sy_call_t *)sigprocmask }, /* 340 = sigprocmask */
+ { SYF_MPSAFE | AS(sigsuspend_args), (sy_call_t *)sigsuspend }, /* 341 = sigsuspend */
+ { SYF_MPSAFE | AS(sigaction_args), (sy_call_t *)sigaction }, /* 342 = sigaction */
+ { SYF_MPSAFE | AS(sigpending_args), (sy_call_t *)sigpending }, /* 343 = sigpending */
+ { SYF_MPSAFE | AS(sigreturn_args), (sy_call_t *)sigreturn }, /* 344 = sigreturn */
+ { 0, (sy_call_t *)nosys }, /* 345 = sigtimedwait */
+ { 0, (sy_call_t *)nosys }, /* 346 = sigwaitinfo */
+ { SYF_MPSAFE | AS(__acl_get_file_args), (sy_call_t *)__acl_get_file }, /* 347 = __acl_get_file */
+ { SYF_MPSAFE | AS(__acl_set_file_args), (sy_call_t *)__acl_set_file }, /* 348 = __acl_set_file */
+ { SYF_MPSAFE | AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd }, /* 349 = __acl_get_fd */
+ { SYF_MPSAFE | AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd }, /* 350 = __acl_set_fd */
+ { SYF_MPSAFE | AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file }, /* 351 = __acl_delete_file */
+ { SYF_MPSAFE | AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd }, /* 352 = __acl_delete_fd */
+ { SYF_MPSAFE | AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file }, /* 353 = __acl_aclcheck_file */
+ { SYF_MPSAFE | AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd }, /* 354 = __acl_aclcheck_fd */
+ { AS(extattrctl_args), (sy_call_t *)extattrctl }, /* 355 = extattrctl */
+ { AS(extattr_set_file_args), (sy_call_t *)extattr_set_file }, /* 356 = extattr_set_file */
+ { AS(extattr_get_file_args), (sy_call_t *)extattr_get_file }, /* 357 = extattr_get_file */
+ { AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file }, /* 358 = extattr_delete_file */
+ { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys }, /* 359 = aio_waitcomplete */
+ { SYF_MPSAFE | AS(getresuid_args), (sy_call_t *)getresuid }, /* 360 = getresuid */
+ { SYF_MPSAFE | AS(getresgid_args), (sy_call_t *)getresgid }, /* 361 = getresgid */
+ { SYF_MPSAFE | 0, (sy_call_t *)kqueue }, /* 362 = kqueue */
+ { SYF_MPSAFE | AS(kevent_args), (sy_call_t *)kevent }, /* 363 = kevent */
+ { 0, (sy_call_t *)nosys }, /* 364 = __cap_get_proc */
+ { 0, (sy_call_t *)nosys }, /* 365 = __cap_set_proc */
+ { 0, (sy_call_t *)nosys }, /* 366 = __cap_get_fd */
+ { 0, (sy_call_t *)nosys }, /* 367 = __cap_get_file */
+ { 0, (sy_call_t *)nosys }, /* 368 = __cap_set_fd */
+ { 0, (sy_call_t *)nosys }, /* 369 = __cap_set_file */
+ { AS(nosys_args), (sy_call_t *)lkmressys }, /* 370 = lkmressys */
+ { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd }, /* 371 = extattr_set_fd */
+ { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd }, /* 372 = extattr_get_fd */
+ { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd }, /* 373 = extattr_delete_fd */
+ { SYF_MPSAFE | AS(__setugid_args), (sy_call_t *)__setugid }, /* 374 = __setugid */
+ { AS(nfsclnt_args), (sy_call_t *)nosys }, /* 375 = nfsclnt */
+ { AS(eaccess_args), (sy_call_t *)eaccess }, /* 376 = eaccess */
+ { 0, (sy_call_t *)nosys }, /* 377 = afs_syscall */
+ { AS(nmount_args), (sy_call_t *)nmount }, /* 378 = nmount */
+ { 0, (sy_call_t *)kse_exit }, /* 379 = kse_exit */
+ { 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */
+ { AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */
+ { AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */
+ { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */
+ { 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */
+ { 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */
+ { 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */
+ { 0, (sy_call_t *)nosys }, /* 387 = __mac_get_file */
+ { 0, (sy_call_t *)nosys }, /* 388 = __mac_set_fd */
+ { 0, (sy_call_t *)nosys }, /* 389 = __mac_set_file */
+ { AS(kenv_args), (sy_call_t *)kenv }, /* 390 = kenv */
+ { AS(lchflags_args), (sy_call_t *)lchflags }, /* 391 = lchflags */
+ { AS(uuidgen_args), (sy_call_t *)uuidgen }, /* 392 = uuidgen */
+};
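+
+/*
+ * A simplified sketch of how this table is consumed (the real dispatch
+ * lives in the machine-dependent syscall/trap code): the system call
+ * number indexes sysent[], the first word of each entry (built from AS()
+ * and the SYF_MPSAFE flag above) tells the handler how many register_t
+ * arguments to copy in, and the sy_call member is then invoked, roughly:
+ *
+ *	struct sysent *callp = &sysent[code];
+ *	error = (*callp->sy_call)(td, args);
+ */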
diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c
new file mode 100644
index 0000000..6626197
--- /dev/null
+++ b/sys/kern/kern_acct.c
@@ -0,0 +1,345 @@
+/*-
+ * Copyright (c) 1994 Christopher G. Demetriou
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/syslog.h>
+#include <sys/kernel.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/namei.h>
+#include <sys/acct.h>
+#include <sys/resourcevar.h>
+#include <sys/tty.h>
+
+/*
+ * The routines implemented in this file are described in:
+ * Leffler, et al.: The Design and Implementation of the 4.3BSD
+ * UNIX Operating System (Addison-Wesley, 1989)
+ * on pages 62-63.
+ *
+ * Arguably, to simplify accounting operations, this mechanism should
+ * be replaced by one in which an accounting log file (similar to /dev/klog)
+ * is read by a user process, etc. However, that has its own problems.
+ */
+
+/*
+ * Internal accounting functions.
+ * encode_comp_t()'s operation is described in Leffler, et al., and
+ * acctwatch() was provided by UCB with the 4.4BSD-Lite release.
+ */
+static comp_t encode_comp_t(u_long, u_long);
+static void acctwatch(void *);
+
+/*
+ * Accounting callout used for periodic scheduling of acctwatch.
+ */
+static struct callout acctwatch_callout;
+
+/*
+ * Accounting vnode pointer, and saved vnode pointer.
+ */
+static struct vnode *acctp;
+static struct vnode *savacctp;
+
+/*
+ * Values associated with enabling and disabling accounting
+ */
+static int acctsuspend = 2; /* stop accounting when < 2% free space left */
+SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
+ &acctsuspend, 0, "percentage of free disk space below which accounting stops");
+
+static int acctresume = 4; /* resume when free space risen to > 4% */
+SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
+ &acctresume, 0, "percentage of free disk space above which accounting resumes");
+
+static int acctchkfreq = 15; /* frequency (in seconds) to check space */
+SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW,
+ &acctchkfreq, 0, "frequency for checking the free space");
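+
+/*
+ * These knobs appear under sysctl as kern.acct_suspend, kern.acct_resume
+ * and kern.acct_chkfreq. A userland sketch of adjusting one of them, using
+ * sysctlbyname(3) from <sys/sysctl.h> (the value 30 is only an example):
+ *
+ *	int freq = 30;
+ *	sysctlbyname("kern.acct_chkfreq", NULL, NULL, &freq, sizeof(freq));
+ */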
+
+/*
+ * Accounting system call. Written based on the specification and
+ * previous implementation done by Mark Tinguely.
+ *
+ * MPSAFE
+ */
+int
+acct(td, uap)
+ struct thread *td;
+ struct acct_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error, flags;
+
+ /* Make sure that the caller is root. */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ mtx_lock(&Giant);
+ /*
+ * If accounting is to be started to a file, open that file for
+ * writing and make sure it's a 'normal' file.
+ */
+ if (SCARG(uap, path) != NULL) {
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path),
+ td);
+ flags = FWRITE;
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ goto done2;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ if (nd.ni_vp->v_type != VREG) {
+ vn_close(nd.ni_vp, FWRITE, td->td_ucred, td);
+ error = EACCES;
+ goto done2;
+ }
+ }
+
+ /*
+ * If accounting was previously enabled, kill the old space-watcher,
+ * close the file, and (if no new file was specified) leave.
+ */
+ if (acctp != NULLVP || savacctp != NULLVP) {
+ callout_stop(&acctwatch_callout);
+ error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE,
+ td->td_ucred, td);
+ acctp = savacctp = NULLVP;
+ }
+ if (SCARG(uap, path) == NULL)
+ goto done2;
+
+ /*
+ * Save the new accounting file vnode, and schedule the new
+ * free space watcher.
+ */
+ acctp = nd.ni_vp;
+ callout_init(&acctwatch_callout, 0);
+ acctwatch(NULL);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Write out process accounting information, on process exit.
+ * Data to be written out is specified in Leffler, et al.
+ * and are enumerated below. (They're also noted in the system
+ * "acct.h" header file.)
+ */
+
+int
+acct_process(td)
+ struct thread *td;
+{
+ struct proc *p = td->td_proc;
+ struct acct acct;
+ struct rusage *r;
+ struct timeval ut, st, tmp;
+ int t;
+ struct vnode *vp;
+
+ /* If accounting isn't enabled, don't bother */
+ vp = acctp;
+ if (vp == NULLVP)
+ return (0);
+
+ /*
+ * Get process accounting information.
+ */
+
+ /* (1) The name of the command that ran */
+ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
+
+ /* (2) The amount of user and system time that was used */
+ mtx_lock_spin(&sched_lock);
+ calcru(p, &ut, &st, NULL);
+ mtx_unlock_spin(&sched_lock);
+ acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
+ acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
+
+ /* (3) The elapsed time the command ran (and its starting time) */
+ acct.ac_btime = p->p_stats->p_start.tv_sec;
+ microtime(&tmp);
+ timevalsub(&tmp, &p->p_stats->p_start);
+ acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
+
+ /* (4) The average amount of memory used */
+ r = &p->p_stats->p_ru;
+ tmp = ut;
+ timevaladd(&tmp, &st);
+ t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+ if (t)
+ acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
+ else
+ acct.ac_mem = 0;
+
+ /* (5) The number of disk I/O operations done */
+ acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
+
+ /* (6) The UID and GID of the process */
+ acct.ac_uid = p->p_ucred->cr_ruid;
+ acct.ac_gid = p->p_ucred->cr_rgid;
+
+ /* (7) The terminal from which the process was started */
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
+ acct.ac_tty = dev2udev(p->p_pgrp->pg_session->s_ttyp->t_dev);
+ else
+ acct.ac_tty = NOUDEV;
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+
+ /* (8) The boolean flags that tell how the process terminated, etc. */
+ acct.ac_flag = p->p_acflag;
+
+ /*
+ * Eliminate any file size rlimit.
+ */
+ if (p->p_limit->p_refcnt > 1 &&
+ (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
+ p->p_limit->p_refcnt--;
+ p->p_limit = limcopy(p->p_limit);
+ }
+ p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+
+ /*
+ * Write the accounting information to the file.
+ */
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct),
+ (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, td->td_ucred,
+ (int *)0, td));
+}
+
+/*
+ * encode_comp_t() converts a time expressed in seconds and microseconds
+ * into ticks of 1/AHZ seconds. The encoding is described in
+ * Leffler, et al., on page 63.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t
+encode_comp_t(s, us)
+ u_long s, us;
+{
+ int exp, rnd;
+
+ exp = 0;
+ rnd = 0;
+ s *= AHZ;
+ s += us / (1000000 / AHZ); /* Maximize precision. */
+
+ while (s > MAXFRACT) {
+ rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
+ s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /* If we need to round up, do it (and handle overflow correctly). */
+ if (rnd && (++s > MAXFRACT)) {
+ s >>= EXPSIZE;
+ exp++;
+ }
+
+ /* Clean it up and polish it off. */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += s; /* and add on the mantissa. */
+ return (exp);
+}
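+
+/*
+ * A worked example, assuming AHZ is 64 (its traditional value in
+ * <sys/acct.h>): encoding 1000 seconds gives s = 64000 ticks, which is
+ * shifted right by EXPSIZE once to a mantissa of 8000 with exponent 1,
+ * producing the comp_t (1 << MANTSIZE) + 8000 = 16192, i.e. 8000 * 8 =
+ * 64000 ticks, so no precision is lost in this particular case.
+ */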
+
+/*
+ * Periodically check the filesystem to see if accounting
+ * should be turned on or off. Beware the case where the vnode
+ * has been vgone()'d out from underneath us, e.g. when the file
+ * system containing the accounting file has been forcibly unmounted.
+ */
+/* ARGSUSED */
+static void
+acctwatch(a)
+ void *a;
+{
+ struct statfs sb;
+
+ if (savacctp != NULLVP) {
+ if (savacctp->v_type == VBAD) {
+ (void) vn_close(savacctp, FWRITE, NOCRED, NULL);
+ savacctp = NULLVP;
+ return;
+ }
+ (void)VFS_STATFS(savacctp->v_mount, &sb, (struct thread *)0);
+ if (sb.f_bavail > acctresume * sb.f_blocks / 100) {
+ acctp = savacctp;
+ savacctp = NULLVP;
+ log(LOG_NOTICE, "Accounting resumed\n");
+ }
+ } else {
+ if (acctp == NULLVP)
+ return;
+ if (acctp->v_type == VBAD) {
+ (void) vn_close(acctp, FWRITE, NOCRED, NULL);
+ acctp = NULLVP;
+ return;
+ }
+ (void)VFS_STATFS(acctp->v_mount, &sb, (struct thread *)0);
+ if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) {
+ savacctp = acctp;
+ acctp = NULLVP;
+ log(LOG_NOTICE, "Accounting suspended\n");
+ }
+ }
+ callout_reset(&acctwatch_callout, acctchkfreq * hz, acctwatch, NULL);
+}
diff --git a/sys/kern/kern_acl.c b/sys/kern/kern_acl.c
new file mode 100644
index 0000000..70be0ec
--- /dev/null
+++ b/sys/kern/kern_acl.c
@@ -0,0 +1,830 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ * Support for POSIX.1e access control lists.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+MALLOC_DEFINE(M_ACL, "acl", "access control list");
+
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics.
+ * Return 0 on success, else an errno value. Should be merged into
+ * vaccess() eventually.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ mode_t dac_granted;
+ mode_t cap_granted;
+ mode_t acl_mask_granted;
+ int group_matched, i;
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt
+ * to use privileges granted via cap_granted. In some cases,
+ * which privileges to use may be ambiguous due to "best match",
+ * in which case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply them until we've found
+ * a DAC entry that matches but has failed to allow access.
+ */
+#ifndef CAPABILITIES
+ if (suser_cred(cred, PRISON_ROOT) == 0)
+ cap_granted = (VEXEC | VREAD | VWRITE | VADMIN);
+ else
+ cap_granted = 0;
+#else
+ cap_granted = 0;
+
+ if (type == VDIR) {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_READ_SEARCH, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ } else {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_EXECUTE, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ }
+
+ if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH,
+ PRISON_ROOT))
+ cap_granted |= VREAD;
+
+ if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE,
+ PRISON_ROOT))
+ cap_granted |= VWRITE;
+
+ if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER,
+ PRISON_ROOT))
+ cap_granted |= VADMIN;
+#endif /* CAPABILITIES */
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK
+ * and ACL_OTHER entries, preventing some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) ==
+ acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access
+ * ACL. If it doesn't, then generate a serious failure. For now,
+ * this means a debugging message and EPERM, but in the future
+ * should probably be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields
+ * are masked by an ACL_MASK entry, if any. As such, first identify
+ * the ACL_MASK field, then iterate through identifying potential
+ * user matches, then group matches. If there is no ACL_MASK,
+ * assume that the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= VWRITE;
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE;
+
+ /*
+ * Iterate through user ACL entries. Do checks twice, first
+ * without privilege, and then if a match is found but failed,
+ * a second time with privilege.
+ */
+
+ /*
+ * Check ACL_USER ACL entries.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a
+ * "best" match. Iterate across, testing each potential group
+ * match. Make sure we keep track of whether we found a match
+ * or not, so that we know if we should try again with any
+ * available privilege, or if we should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via
+ * pure DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
+
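A minimal sketch of the intended caller, under assumed names: a filesystem's access routine would fetch the vnode's owner, group, and access ACL, then defer the decision to vaccess_acl_posix1e(). The function example_vop_access() below is hypothetical; the attribute and ACL fetches use the standard VOP_GETATTR/VOP_GETACL interfaces.

    static int
    example_vop_access(struct vnode *vp, mode_t acc_mode, struct ucred *cred,
        struct thread *td)
    {
    	struct vattr va;
    	struct acl acl;
    	int error;

    	/* Owner, group and type come from the vnode's attributes. */
    	error = VOP_GETATTR(vp, &va, cred, td);
    	if (error)
    		return (error);
    	/* Fetch the access ACL stored for this vnode. */
    	error = VOP_GETACL(vp, ACL_TYPE_ACCESS, &acl, cred, td);
    	if (error)
    		return (error);
    	return (vaccess_acl_posix1e(vp->v_type, va.va_uid, va.va_gid,
    	    &acl, acc_mode, cred, NULL));
    }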
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an
+ * inode with a mode_t field, this routine converts a mode_t entry
+ * to an acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
+
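An illustrative round trip through the conversion helpers above, with arbitrary example uid/gid values: the three _OBJ entries are synthesized from a 0644 mode, and the same mode is recovered again.

    static mode_t
    example_mode_round_trip(void)
    {
    	struct acl_entry user_obj, group_obj, other;
    	mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;	/* 0644 */

    	user_obj  = acl_posix1e_mode_to_entry(ACL_USER_OBJ, 1001, 1001, mode);
    	group_obj = acl_posix1e_mode_to_entry(ACL_GROUP_OBJ, 1001, 1001, mode);
    	other     = acl_posix1e_mode_to_entry(ACL_OTHER, 1001, 1001, mode);

    	/* Recovers 0644 again. */
    	return (acl_posix1e_perms_to_mode(&user_obj, &group_obj, &other));
    }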
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an
+ * implementing filesystem to determine if it should accept this and
+ * rely on the POSIX.1e ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ * Verify all ae_tag entries are understood by this implementation.
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
+
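For illustration, the smallest ACL that passes these checks: exactly one ACL_USER_OBJ, one ACL_GROUP_OBJ and one ACL_OTHER entry, no ACL_MASK (required only once ACL_USER or ACL_GROUP entries appear), and permission bits confined to ACL_PERM_BITS.

    static int
    example_minimal_acl_ok(void)
    {
    	struct acl acl;

    	acl.acl_cnt = 3;
    	acl.acl_entry[0].ae_tag  = ACL_USER_OBJ;
    	acl.acl_entry[0].ae_id   = ACL_UNDEFINED_ID;
    	acl.acl_entry[0].ae_perm = ACL_READ | ACL_WRITE;
    	acl.acl_entry[1].ae_tag  = ACL_GROUP_OBJ;
    	acl.acl_entry[1].ae_id   = ACL_UNDEFINED_ID;
    	acl.acl_entry[1].ae_perm = ACL_READ;
    	acl.acl_entry[2].ae_tag  = ACL_OTHER;
    	acl.acl_entry[2].ae_id   = ACL_UNDEFINED_ID;
    	acl.acl_entry[2].ae_perm = 0;

    	return (acl_posix1e_check(&acl));	/* 0: syntactically valid */
    }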
+/*
+ * These calls wrap the real vnode operations, and are called by the
+ * syscall code once the syscall has converted the path or file
+ * descriptor to a vnode (unlocked). The aclp pointer is assumed
+ * still to point to userland, so this should not be consumed within
+ * the kernel except by syscall code. Other code should directly
+ * invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernacl;
+ struct mount *mp;
+ int error;
+
+ error = copyin(aclp, &inkernacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return(error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error == 0)
+ error = copyout(&inkernelacl, aclp, sizeof(struct acl));
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, NULL, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ error = copyin(aclp, &inkernelacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
+ return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.
+ * Don't need to lock, as the vacl_ code will get/release any locks
+ * required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
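Userland reaches these entry points through the raw system calls declared in <sys/acl.h>, normally via the acl_get_file(3) libc wrappers rather than directly. A hedged sketch of a direct call; the path name is only an example.

    #include <sys/types.h>
    #include <sys/acl.h>
    #include <stdio.h>

    int
    main(void)
    {
    	struct acl acl;

    	if (__acl_get_file("/tmp/example", ACL_TYPE_ACCESS, &acl) != 0) {
    		perror("__acl_get_file");
    		return (1);
    	}
    	printf("%d ACL entries\n", acl.acl_cnt);
    	return (0);
    }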
+/*
+ * Given a file path, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, SCARG(uap, type));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
new file mode 100644
index 0000000..2e7ca8b
--- /dev/null
+++ b/sys/kern/kern_clock.c
@@ -0,0 +1,492 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dkstat.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/ktr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <machine/cpu.h>
+#include <machine/limits.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+#ifdef DEVICE_POLLING
+extern void init_device_poll(void);
+extern void hardclock_device_poll(void);
+#endif /* DEVICE_POLLING */
+
+static void initclocks(void *dummy);
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+long cp_time[CPUSTATES];
+
+SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
+ "LU", "CPU time statistics");
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other.
+ *
+ * The main timer, running hz times per second, is used to trigger interval
+ * timers, timeouts and rescheduling as needed.
+ *
+ * The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ *
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
+ */
+
+int stathz;
+int profhz;
+static int profprocs;
+int ticks;
+static int psdiv, pscnt; /* prof => stat divider */
+int psratio; /* ratio: prof / stat */
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ psdiv = pscnt = 1;
+ cpu_initclocks();
+
+#ifdef DEVICE_POLLING
+ init_device_poll();
+#endif
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+}
+
+/*
+ * Each time the real-time timer fires, this function is called on all CPUs
+ * with each CPU passing in its curthread as the first argument. If possible
+ * a nice optimization in the future would be to allow the CPU receiving the
+ * actual real-time timer interrupt to call this function on behalf of the
+ * other CPUs rather than sending an IPI to all other CPUs so that they
+ * can call this function. Note that hardclock() calls hardclock_process()
+ * for the CPU receiving the timer interrupt, so only the other CPUs in the
+ * system need to call this function (or have it called on their behalf).
+ */
+void
+hardclock_process(td, user)
+ struct thread *td;
+ int user;
+{
+ struct pstats *pstats;
+ struct proc *p = td->td_proc;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (p->p_flag & P_KSES) {
+ /* XXXKSE What to do? */
+ } else {
+ pstats = p->p_stats;
+ if (user &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
+ p->p_sflag |= PS_ALRMPEND;
+ td->td_kse->ke_flags |= KEF_ASTPENDING;
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
+ p->p_sflag |= PS_PROFPEND;
+ td->td_kse->ke_flags |= KEF_ASTPENDING;
+ }
+ }
+}
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(frame)
+ register struct clockframe *frame;
+{
+ int need_softclock = 0;
+
+ CTR0(KTR_CLK, "hardclock fired");
+ mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
+ hardclock_process(curthread, CLKF_USERMODE(frame));
+ mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+
+ /*
+ * If no separate statistics clock is available, run it from here.
+ *
+ * XXX: this only works for UP
+ */
+ if (stathz == 0)
+ statclock(frame);
+
+#ifdef DEVICE_POLLING
+ hardclock_device_poll(); /* this is very short and quick */
+#endif /* DEVICE_POLLING */
+
+ /*
+ * Process callouts at a very low cpu priority, so we don't keep the
+ * relatively high clock interrupt priority any longer than necessary.
+ */
+ mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
+ ticks++;
+ if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
+ need_softclock = 1;
+ } else if (softticks + 1 == ticks)
+ ++softticks;
+ mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);
+
+ /*
+ * swi_sched acquires sched_lock, so we don't want to call it with
+ * callout_lock held; incorrect locking order.
+ */
+ if (need_softclock)
+ swi_sched(softclock_ih, 0);
+}
+
+/*
+ * Compute number of ticks in the specified amount of time.
+ */
+int
+tvtohz(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ sec = tv->tv_sec;
+ usec = tv->tv_usec;
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ if (usec > 0) {
+ sec++;
+ usec -= 1000000;
+ }
+		printf("tvtohz: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return ((int)ticks);
+}
+
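A worked example of the first branch above, assuming hz = 100 so that tick = 10000 microseconds.

    static int
    example_timeout_ticks(void)
    {
    	struct timeval tv = { 1, 500000 };	/* 1.5 seconds */

    	/*
    	 * With hz = 100 the first branch gives
    	 * (1 * 1000000 + 500000 + 9999) / 10000 + 1 = 150 + 1 = 151;
    	 * the result is rounded up and padded by one tick so that the
    	 * partially-elapsed current tick cannot cut the timeout short.
    	 */
    	return (tvtohz(&tv));
    }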
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ /*
+ * XXX; Right now sched_lock protects statclock(), but perhaps
+ * it should be protected later on by a time_lock, which would
+ * cover psdiv, etc. as well.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((p->p_sflag & PS_PROFIL) == 0) {
+ p->p_sflag |= PS_PROFIL;
+ if (++profprocs == 1 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = psratio;
+ setstatclockrate(profhz);
+ splx(s);
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ mtx_lock_spin(&sched_lock);
+ if (p->p_sflag & PS_PROFIL) {
+ p->p_sflag &= ~PS_PROFIL;
+ if (--profprocs == 0 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = 1;
+ setstatclockrate(stathz);
+ splx(s);
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Do process and kernel statistics. Most of the statistics are only
+ * used by user-level statistics programs. The main exceptions are
+ * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. This function
+ * should be called by all CPUs in the system for each statistics clock
+ * interrupt. See the description of hardclock_process for more detail on
+ * this function's relationship to statclock.
+ */
+void
+statclock_process(ke, pc, user)
+ struct kse *ke;
+ register_t pc;
+ int user;
+{
+#ifdef GPROF
+ struct gmonparam *g;
+ int i;
+#endif
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+ struct proc *p = ke->ke_proc;
+ struct thread *td = ke->ke_thread; /* current thread */
+
+	KASSERT(ke == curthread->td_kse, ("statclock_process: ke != curthread->td_kse"));
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (user) {
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled, record the tick.
+ */
+ if (p->p_sflag & PS_PROFIL)
+ addupc_intr(ke, pc, 1);
+ if (pscnt < psdiv)
+ return;
+ /*
+ * Charge the time as appropriate.
+ */
+ ke->ke_uticks++;
+ if (ke->ke_ksegrp->kg_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ } else {
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = pc - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+ if (pscnt < psdiv)
+ return;
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) {
+ ke->ke_iticks++;
+ cp_time[CP_INTR]++;
+ } else {
+ ke->ke_sticks++;
+ if (p != PCPU_GET(idlethread)->td_proc)
+ cp_time[CP_SYS]++;
+ else
+ cp_time[CP_IDLE]++;
+ }
+ }
+
+ schedclock(ke->ke_thread);
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += pgtok(vm->vm_tsize);
+ ru->ru_idrss += pgtok(vm->vm_dsize);
+ ru->ru_isrss += pgtok(vm->vm_ssize);
+ rss = pgtok(vmspace_resident_count(vm));
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+}
+
+/*
+ * Statistics clock. Grab profile sample, and if divider reaches 0,
+ * do process and kernel statistics. Most of the statistics are only
+ * used by user-level statistics programs. The main exceptions are
+ * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
+ */
+void
+statclock(frame)
+ register struct clockframe *frame;
+{
+
+ CTR0(KTR_CLK, "statclock fired");
+ mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
+ if (--pscnt == 0)
+ pscnt = psdiv;
+ statclock_process(curthread->td_kse, CLKF_PC(frame), CLKF_USERMODE(frame));
+ mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+}
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ bzero(&clkinfo, sizeof(clkinfo));
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo",
+ "Rate and period of various kernel clocks");
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
new file mode 100644
index 0000000..9d30d25
--- /dev/null
+++ b/sys/kern/kern_condvar.c
@@ -0,0 +1,579 @@
+/*-
+ * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+/*
+ * Common sanity checks for cv_wait* functions.
+ */
+#define CV_ASSERT(cvp, mp, td) do { \
+ KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \
+ KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \
+ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
+ KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \
+ mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \
+} while (0)
+
+#ifdef INVARIANTS
+#define CV_WAIT_VALIDATE(cvp, mp) do { \
+ if (TAILQ_EMPTY(&(cvp)->cv_waitq)) { \
+ /* Only waiter. */ \
+ (cvp)->cv_mtx = (mp); \
+ } else { \
+ /* \
+ * Other waiter; assert that we're using the \
+ * same mutex. \
+ */ \
+ KASSERT((cvp)->cv_mtx == (mp), \
+ ("%s: Multiple mutexes", __func__)); \
+ } \
+} while (0)
+#define CV_SIGNAL_VALIDATE(cvp) do { \
+ if (!TAILQ_EMPTY(&(cvp)->cv_waitq)) { \
+ KASSERT(mtx_owned((cvp)->cv_mtx), \
+ ("%s: Mutex not owned", __func__)); \
+ } \
+} while (0)
+#else
+#define CV_WAIT_VALIDATE(cvp, mp)
+#define CV_SIGNAL_VALIDATE(cvp)
+#endif
+
+static void cv_timedwait_end(void *arg);
+
+/*
+ * Initialize a condition variable. Must be called before use.
+ */
+void
+cv_init(struct cv *cvp, const char *desc)
+{
+
+ TAILQ_INIT(&cvp->cv_waitq);
+ cvp->cv_mtx = NULL;
+ cvp->cv_description = desc;
+}
+
+/*
+ * Destroy a condition variable. The condition variable must be re-initialized
+ * in order to be re-used.
+ */
+void
+cv_destroy(struct cv *cvp)
+{
+
+ KASSERT(cv_waitq_empty(cvp), ("%s: cv_waitq non-empty", __func__));
+}
+
+/*
+ * Common code for cv_wait* functions. All require sched_lock.
+ */
+
+/*
+ * Switch context.
+ */
+static __inline void
+cv_switch(struct thread *td)
+{
+
+ td->td_proc->p_stat = SSLEEP;
+ td->td_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td,
+ td->td_proc->p_pid, td->td_proc->p_comm);
+}
+
+/*
+ * Switch context, catching signals.
+ */
+static __inline int
+cv_switch_catch(struct thread *td)
+{
+ struct proc *p;
+ int sig;
+
+ /*
+ * We put ourselves on the sleep queue and start our timeout before
+ * calling cursig, as we could stop there, and a wakeup or a SIGCONT (or
+ * both) could occur while we were stopped. A SIGCONT would cause us to
+ * be marked as SSLEEP without resuming us, thus we must be ready for
+ * sleep when cursig is called. If the wakeup happens while we're
+ * stopped, td->td_wchan will be 0 upon return from cursig.
+ */
+ td->td_flags |= TDF_SINTR;
+ mtx_unlock_spin(&sched_lock);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ sig = cursig(p); /* XXXKSE */
+ mtx_lock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ if (sig != 0) {
+ if (td->td_wchan != NULL)
+ cv_waitq_remove(td);
+ td->td_proc->p_stat = SRUN;
+ } else if (td->td_wchan != NULL) {
+ cv_switch(td);
+ }
+ td->td_flags &= ~TDF_SINTR;
+
+ return sig;
+}
+
+/*
+ * Add a thread to the wait queue of a condition variable.
+ */
+static __inline void
+cv_waitq_add(struct cv *cvp, struct thread *td)
+{
+
+ /*
+	 * Process may be sitting on a slpque if asleep() was called; remove it
+ * before re-adding.
+ */
+ if (td->td_wchan != NULL)
+ unsleep(td);
+
+ td->td_flags |= TDF_CVWAITQ;
+ td->td_wchan = cvp;
+ td->td_wmesg = cvp->cv_description;
+ td->td_kse->ke_slptime = 0; /* XXXKSE */
+ td->td_ksegrp->kg_slptime = 0; /* XXXKSE */
+ td->td_base_pri = td->td_priority;
+ CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td,
+ td->td_proc->p_pid, td->td_proc->p_comm);
+ TAILQ_INSERT_TAIL(&cvp->cv_waitq, td, td_slpq);
+}
+
+/*
+ * Wait on a condition variable. The current thread is placed on the condition
+ * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same
+ * condition variable will resume the thread. The mutex is released before
+ * sleeping and will be held on return. It is recommended that the mutex be
+ * held when cv_signal or cv_broadcast are called.
+ */
+void
+cv_wait(struct cv *cvp, struct mtx *mp)
+{
+ struct thread *td;
+ WITNESS_SAVE_DECL(mp);
+
+ td = curthread;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ CV_ASSERT(cvp, mp, td);
+ WITNESS_SLEEP(0, &mp->mtx_object);
+ WITNESS_SAVE(&mp->mtx_object, mp);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return;
+ }
+
+ mtx_lock_spin(&sched_lock);
+
+ CV_WAIT_VALIDATE(cvp, mp);
+
+ DROP_GIANT();
+ mtx_unlock(mp);
+
+ cv_waitq_add(cvp, td);
+ cv_switch(td);
+
+ mtx_unlock_spin(&sched_lock);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ mtx_lock(mp);
+ WITNESS_RESTORE(&mp->mtx_object, mp);
+}
+
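The canonical consumer pattern, sketched with hypothetical names (struct work, work_mtx, work_cv, work_queue, w_link): the predicate is always re-tested in a loop, since cv_broadcast may wake several waiters and another thread can consume the work first.

    static struct work *
    example_dequeue(void)
    {
    	struct work *wp;

    	mtx_lock(&work_mtx);
    	while (STAILQ_EMPTY(&work_queue))
    		cv_wait(&work_cv, &work_mtx);
    	wp = STAILQ_FIRST(&work_queue);
    	STAILQ_REMOVE_HEAD(&work_queue, w_link);
    	mtx_unlock(&work_mtx);
    	return (wp);
    }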
+/*
+ * Wait on a condition variable, allowing interruption by signals. Return 0 if
+ * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if
+ * a signal was caught. If ERESTART is returned the system call should be
+ * restarted if possible.
+ */
+int
+cv_wait_sig(struct cv *cvp, struct mtx *mp)
+{
+ struct thread *td;
+ struct proc *p;
+ int rval;
+ int sig;
+ WITNESS_SAVE_DECL(mp);
+
+ td = curthread;
+ p = td->td_proc;
+ rval = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ CV_ASSERT(cvp, mp, td);
+ WITNESS_SLEEP(0, &mp->mtx_object);
+ WITNESS_SAVE(&mp->mtx_object, mp);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * procs or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ mtx_lock_spin(&sched_lock);
+
+ CV_WAIT_VALIDATE(cvp, mp);
+
+ DROP_GIANT();
+ mtx_unlock(mp);
+
+ cv_waitq_add(cvp, td);
+ sig = cv_switch_catch(td);
+
+ mtx_unlock_spin(&sched_lock);
+
+ PROC_LOCK(p);
+ if (sig == 0)
+ sig = cursig(p); /* XXXKSE */
+ if (sig != 0) {
+ if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
+ rval = EINTR;
+ else
+ rval = ERESTART;
+ }
+ PROC_UNLOCK(p);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ mtx_lock(mp);
+ WITNESS_RESTORE(&mp->mtx_object, mp);
+
+ return (rval);
+}
+
+/*
+ * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the
+ * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout
+ * expires.
+ */
+int
+cv_timedwait(struct cv *cvp, struct mtx *mp, int timo)
+{
+ struct thread *td;
+ int rval;
+ WITNESS_SAVE_DECL(mp);
+
+ td = curthread;
+ rval = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ CV_ASSERT(cvp, mp, td);
+ WITNESS_SLEEP(0, &mp->mtx_object);
+ WITNESS_SAVE(&mp->mtx_object, mp);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ mtx_lock_spin(&sched_lock);
+
+ CV_WAIT_VALIDATE(cvp, mp);
+
+ DROP_GIANT();
+ mtx_unlock(mp);
+
+ cv_waitq_add(cvp, td);
+ callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td);
+ cv_switch(td);
+
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ rval = EWOULDBLOCK;
+ } else if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+ else if (callout_stop(&td->td_slpcallout) == 0) {
+ /*
+ * Work around race with cv_timedwait_end similar to that
+ * between msleep and endtsleep.
+ */
+ td->td_flags |= TDF_TIMEOUT;
+ td->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ }
+
+ mtx_unlock_spin(&sched_lock);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ mtx_lock(mp);
+ WITNESS_RESTORE(&mp->mtx_object, mp);
+
+ return (rval);
+}
+
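The same sketch with a bounded wait, again using the hypothetical names from the earlier example; EWOULDBLOCK indicates the roughly five-second timeout expired before the condition variable was signalled.

    static struct work *
    example_dequeue_timed(void)
    {
    	struct work *wp;

    	mtx_lock(&work_mtx);
    	while (STAILQ_EMPTY(&work_queue)) {
    		/* Give up after roughly five seconds. */
    		if (cv_timedwait(&work_cv, &work_mtx, 5 * hz) == EWOULDBLOCK)
    			break;
    	}
    	wp = STAILQ_FIRST(&work_queue);	/* NULL if we timed out */
    	if (wp != NULL)
    		STAILQ_REMOVE_HEAD(&work_queue, w_link);
    	mtx_unlock(&work_mtx);
    	return (wp);
    }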
+/*
+ * Wait on a condition variable for at most timo/hz seconds, allowing
+ * interruption by signals. Returns 0 if the thread was resumed by cv_signal
+ * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if
+ * a signal was caught.
+ */
+int
+cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
+{
+ struct thread *td;
+ struct proc *p;
+ int rval;
+ int sig;
+ WITNESS_SAVE_DECL(mp);
+
+ td = curthread;
+ p = td->td_proc;
+ rval = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ CV_ASSERT(cvp, mp, td);
+ WITNESS_SLEEP(0, &mp->mtx_object);
+ WITNESS_SAVE(&mp->mtx_object, mp);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ mtx_lock_spin(&sched_lock);
+
+ CV_WAIT_VALIDATE(cvp, mp);
+
+ DROP_GIANT();
+ mtx_unlock(mp);
+
+ cv_waitq_add(cvp, td);
+ callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td);
+ sig = cv_switch_catch(td);
+
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ rval = EWOULDBLOCK;
+ } else if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+ else if (callout_stop(&td->td_slpcallout) == 0) {
+ /*
+ * Work around race with cv_timedwait_end similar to that
+ * between msleep and endtsleep.
+ */
+ td->td_flags |= TDF_TIMEOUT;
+ td->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ }
+
+ mtx_unlock_spin(&sched_lock);
+
+ PROC_LOCK(p);
+ if (sig == 0)
+ sig = cursig(p);
+ if (sig != 0) {
+ if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
+ rval = EINTR;
+ else
+ rval = ERESTART;
+ }
+ PROC_UNLOCK(p);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ mtx_lock(mp);
+ WITNESS_RESTORE(&mp->mtx_object, mp);
+
+ return (rval);
+}
+
+/*
+ * Common code for signal and broadcast. Assumes waitq is not empty. Must be
+ * called with sched_lock held.
+ */
+static __inline void
+cv_wakeup(struct cv *cvp)
+{
+ struct thread *td;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ td = TAILQ_FIRST(&cvp->cv_waitq);
+ KASSERT(td->td_wchan == cvp, ("%s: bogus wchan", __func__));
+ KASSERT(td->td_flags & TDF_CVWAITQ, ("%s: not on waitq", __func__));
+ TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq);
+ td->td_flags &= ~TDF_CVWAITQ;
+ td->td_wchan = 0;
+ if (td->td_proc->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(td); */
+ CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)",
+ td, td->td_proc->p_pid, td->td_proc->p_comm);
+ if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */
+ updatepri(td);
+ td->td_kse->ke_slptime = 0;
+ td->td_ksegrp->kg_slptime = 0;
+ td->td_proc->p_stat = SRUN;
+ if (td->td_proc->p_sflag & PS_INMEM) {
+ setrunqueue(td);
+ maybe_resched(td);
+ } else {
+ td->td_proc->p_sflag |= PS_SWAPINREQ;
+ wakeup(&proc0); /* XXXKSE */
+ }
+ /* END INLINE EXPANSION */
+ }
+}
+
+/*
+ * Signal a condition variable, waking up one waiting thread. Will also wake up
+ * the swapper if the process is not in memory, so that it can bring the
+ * sleeping process in. Note that this may also result in additional threads
+ * being made runnable. Should be called with the same mutex as was passed to
+ * cv_wait held.
+ */
+void
+cv_signal(struct cv *cvp)
+{
+
+ KASSERT(cvp != NULL, ("%s: cvp NULL", __func__));
+ mtx_lock_spin(&sched_lock);
+ if (!TAILQ_EMPTY(&cvp->cv_waitq)) {
+ CV_SIGNAL_VALIDATE(cvp);
+ cv_wakeup(cvp);
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Broadcast a signal to a condition variable. Wakes up all waiting threads.
+ * Should be called with the same mutex as was passed to cv_wait held.
+ */
+void
+cv_broadcast(struct cv *cvp)
+{
+
+ KASSERT(cvp != NULL, ("%s: cvp NULL", __func__));
+ mtx_lock_spin(&sched_lock);
+ CV_SIGNAL_VALIDATE(cvp);
+ while (!TAILQ_EMPTY(&cvp->cv_waitq))
+ cv_wakeup(cvp);
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Remove a thread from the wait queue of its condition variable. This may be
+ * called externally.
+ */
+void
+cv_waitq_remove(struct thread *td)
+{
+ struct cv *cvp;
+
+ mtx_lock_spin(&sched_lock);
+ if ((cvp = td->td_wchan) != NULL && td->td_flags & TDF_CVWAITQ) {
+ TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq);
+ td->td_flags &= ~TDF_CVWAITQ;
+ td->td_wchan = NULL;
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Timeout function for cv_timedwait. Put the thread on the runqueue and set
+ * its timeout flag.
+ */
+static void
+cv_timedwait_end(void *arg)
+{
+ struct thread *td;
+
+ td = arg;
+ CTR3(KTR_PROC, "cv_timedwait_end: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ mtx_lock_spin(&sched_lock);
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ setrunqueue(td);
+ } else if (td->td_wchan != NULL) {
+ if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */
+ setrunnable(td);
+ else
+ cv_waitq_remove(td);
+ td->td_flags |= TDF_TIMEOUT;
+ } else
+ td->td_flags |= TDF_TIMOFAIL;
+ mtx_unlock_spin(&sched_lock);
+}
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
new file mode 100644
index 0000000..d1ce2fc
--- /dev/null
+++ b/sys/kern/kern_conf.c
@@ -0,0 +1,491 @@
+/*-
+ * Parts Copyright (c) 1995 Terrence R. Lambert
+ * Copyright (c) 1995 Julian R. Elischer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Terrence R. Lambert.
+ * 4. The name Terrence R. Lambert may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+#include <sys/queue.h>
+#include <sys/ctype.h>
+#include <machine/stdarg.h>
+
+#define cdevsw_ALLOCSTART (NUMCDEVSW/2)
+
+static struct cdevsw *cdevsw[NUMCDEVSW];
+
+static MALLOC_DEFINE(M_DEVT, "dev_t", "dev_t storage");
+
+/*
+ * This is the number of hash-buckets. Experiments with 'real-life'
+ * udev_t's show that a prime halfway between two powers of two works
+ * best.
+ */
+#define DEVT_HASH 83
+
+/* The number of dev_t's we can create before malloc(9) kicks in. */
+#define DEVT_STASH 50
+
+static struct specinfo devt_stash[DEVT_STASH];
+
+static LIST_HEAD(, specinfo) dev_hash[DEVT_HASH];
+
+static LIST_HEAD(, specinfo) dev_free;
+
+devfs_create_t *devfs_create_hook;
+devfs_destroy_t *devfs_destroy_hook;
+int devfs_present;
+
+static int ready_for_devs;
+
+static int free_devt;
+SYSCTL_INT(_debug, OID_AUTO, free_devt, CTLFLAG_RW, &free_devt, 0, "");
+
+/* XXX: This is a hack */
+void disk_dev_synth(dev_t dev);
+
+struct cdevsw *
+devsw(dev_t dev)
+{
+ if (dev->si_devsw)
+ return (dev->si_devsw);
+ /* XXX: Hack around our backwards disk code */
+ disk_dev_synth(dev);
+ if (dev->si_devsw)
+ return (dev->si_devsw);
+ if (devfs_present)
+ return (NULL);
+ return(cdevsw[major(dev)]);
+}
+
+/*
+ * Add a cdevsw entry
+ */
+
+int
+cdevsw_add(struct cdevsw *newentry)
+{
+
+ if (newentry->d_maj < 0 || newentry->d_maj >= NUMCDEVSW) {
+ printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n",
+ newentry->d_name, newentry->d_maj);
+ return (EINVAL);
+ }
+
+ if (cdevsw[newentry->d_maj]) {
+ printf("WARNING: \"%s\" is usurping \"%s\"'s cdevsw[]\n",
+ newentry->d_name, cdevsw[newentry->d_maj]->d_name);
+ }
+
+ cdevsw[newentry->d_maj] = newentry;
+
+ return (0);
+}
+
+/*
+ * Remove a cdevsw entry
+ */
+
+int
+cdevsw_remove(struct cdevsw *oldentry)
+{
+ if (oldentry->d_maj < 0 || oldentry->d_maj >= NUMCDEVSW) {
+ printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n",
+ oldentry->d_name, oldentry->d_maj);
+ return EINVAL;
+ }
+
+ cdevsw[oldentry->d_maj] = NULL;
+
+ return 0;
+}
+
+/*
+ * dev_t and u_dev_t primitives
+ */
+
+int
+major(dev_t x)
+{
+ if (x == NODEV)
+ return NOUDEV;
+ return((x->si_udev >> 8) & 0xff);
+}
+
+int
+minor(dev_t x)
+{
+ if (x == NODEV)
+ return NOUDEV;
+ return(x->si_udev & 0xffff00ff);
+}
+
+int
+dev2unit(dev_t x)
+{
+ int i;
+
+ if (x == NODEV)
+ return NOUDEV;
+ i = minor(x);
+ return ((i & 0xff) | (i >> 8));
+}
+
+int
+unit2minor(int unit)
+{
+
+ KASSERT(unit <= 0xffffff, ("Invalid unit (%d) in unit2minor", unit));
+ return ((unit & 0xff) | ((unit << 8) & ~0xffff));
+}
+
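A worked example of the split encoding implemented by unit2minor() and dev2unit(): the low 8 bits of the unit number occupy the traditional minor byte, and the remaining bits are shifted above the 16-bit major/minor field.

    	int m;

    	/* For unit 0x1234: */
    	m = unit2minor(0x1234);
    	/* m == 0x34 | ((0x1234 << 8) & ~0xffff) == 0x120034,        */
    	/* and (m & 0xff) | (m >> 8) == 0x34 | 0x1200 == 0x1234 again. */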
+static dev_t
+allocdev(void)
+{
+ static int stashed;
+ struct specinfo *si;
+
+ if (stashed >= DEVT_STASH) {
+ MALLOC(si, struct specinfo *, sizeof(*si), M_DEVT,
+ M_USE_RESERVE | M_ZERO);
+ } else if (LIST_FIRST(&dev_free)) {
+ si = LIST_FIRST(&dev_free);
+ LIST_REMOVE(si, si_hash);
+ } else {
+ si = devt_stash + stashed++;
+ bzero(si, sizeof *si);
+ si->si_flags |= SI_STASHED;
+ }
+ LIST_INIT(&si->si_children);
+ TAILQ_INIT(&si->si_snapshots);
+ return (si);
+}
+
+dev_t
+makedev(int x, int y)
+{
+ struct specinfo *si;
+ udev_t udev;
+ int hash;
+
+ if (x == umajor(NOUDEV) && y == uminor(NOUDEV))
+ panic("makedev of NOUDEV");
+ udev = (x << 8) | y;
+ hash = udev % DEVT_HASH;
+ LIST_FOREACH(si, &dev_hash[hash], si_hash) {
+ if (si->si_udev == udev)
+ return (si);
+ }
+ si = allocdev();
+ si->si_udev = udev;
+ LIST_INSERT_HEAD(&dev_hash[hash], si, si_hash);
+ return (si);
+}
+
+void
+freedev(dev_t dev)
+{
+
+ if (!free_devt)
+ return;
+ if (SLIST_FIRST(&dev->si_hlist))
+ return;
+ if (dev->si_devsw || dev->si_drv1 || dev->si_drv2)
+ return;
+ LIST_REMOVE(dev, si_hash);
+ if (dev->si_flags & SI_STASHED) {
+ bzero(dev, sizeof(*dev));
+ dev->si_flags |= SI_STASHED;
+ LIST_INSERT_HEAD(&dev_free, dev, si_hash);
+ } else {
+ FREE(dev, M_DEVT);
+ }
+}
+
+udev_t
+dev2udev(dev_t x)
+{
+ if (x == NODEV)
+ return NOUDEV;
+ return (x->si_udev);
+}
+
+dev_t
+udev2dev(udev_t x, int b)
+{
+
+ if (x == NOUDEV)
+ return (NODEV);
+ switch (b) {
+ case 0:
+ return makedev(umajor(x), uminor(x));
+ case 1:
+ return (NODEV);
+ default:
+ Debugger("udev2dev(...,X)");
+ return NODEV;
+ }
+}
+
+int
+uminor(udev_t dev)
+{
+ return(dev & 0xffff00ff);
+}
+
+int
+umajor(udev_t dev)
+{
+ return((dev & 0xff00) >> 8);
+}
+
+udev_t
+makeudev(int x, int y)
+{
+ return ((x << 8) | y);
+}
+
+dev_t
+make_dev(struct cdevsw *devsw, int minor, uid_t uid, gid_t gid, int perms, const char *fmt, ...)
+{
+ dev_t dev;
+ va_list ap;
+ int i;
+
+ KASSERT(umajor(makeudev(devsw->d_maj, minor)) == devsw->d_maj,
+ ("Invalid minor (%d) in make_dev", minor));
+
+ if (!ready_for_devs) {
+ printf("WARNING: Driver mistake: make_dev(%s) called before SI_SUB_DRIVERS\n",
+ fmt);
+ /* XXX panic here once drivers are cleaned up */
+ }
+
+ dev = makedev(devsw->d_maj, minor);
+ if (dev->si_flags & SI_NAMED) {
+ printf( "WARNING: Driver mistake: repeat make_dev(\"%s\")\n",
+ dev->si_name);
+ panic("don't do that");
+ return (dev);
+ }
+ va_start(ap, fmt);
+ i = kvprintf(fmt, NULL, dev->si_name, 32, ap);
+ dev->si_name[i] = '\0';
+ va_end(ap);
+ dev->si_devsw = devsw;
+ dev->si_uid = uid;
+ dev->si_gid = gid;
+ dev->si_mode = perms;
+ dev->si_flags |= SI_NAMED;
+
+ if (devfs_create_hook)
+ devfs_create_hook(dev);
+ return (dev);
+}
+
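A sketch of the typical caller, with hypothetical driver state (example_cdevsw, unit and sc are assumed to exist elsewhere in the driver): the device node is created at attach time and the softc stashed in si_drv1.

    static void
    example_attach(int unit, void *sc)
    {
    	dev_t dev;

    	dev = make_dev(&example_cdevsw, unit2minor(unit),
    	    UID_ROOT, GID_WHEEL, 0600, "example%d", unit);
    	dev->si_drv1 = sc;	/* recovered later in d_open/d_ioctl */
    }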
+int
+dev_named(dev_t pdev, const char *name)
+{
+ dev_t cdev;
+
+ if (strcmp(devtoname(pdev), name) == 0)
+ return (1);
+ LIST_FOREACH(cdev, &pdev->si_children, si_siblings)
+ if (strcmp(devtoname(cdev), name) == 0)
+ return (1);
+ return (0);
+}
+
+void
+dev_depends(dev_t pdev, dev_t cdev)
+{
+
+ cdev->si_parent = pdev;
+ cdev->si_flags |= SI_CHILD;
+ LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings);
+}
+
+dev_t
+make_dev_alias(dev_t pdev, const char *fmt, ...)
+{
+ dev_t dev;
+ va_list ap;
+ int i;
+
+ dev = allocdev();
+ dev->si_flags |= SI_ALIAS;
+ dev->si_flags |= SI_NAMED;
+ dev_depends(pdev, dev);
+ va_start(ap, fmt);
+ i = kvprintf(fmt, NULL, dev->si_name, 32, ap);
+ dev->si_name[i] = '\0';
+ va_end(ap);
+
+ if (devfs_create_hook)
+ devfs_create_hook(dev);
+ return (dev);
+}
+
+void
+revoke_and_destroy_dev(dev_t dev)
+{
+ struct vnode *vp;
+
+ GIANT_REQUIRED;
+
+ vp = SLIST_FIRST(&dev->si_hlist);
+ if (vp != NULL)
+ VOP_REVOKE(vp, REVOKEALL);
+ destroy_dev(dev);
+}
+
+void
+destroy_dev(dev_t dev)
+{
+
+ if (!(dev->si_flags & SI_NAMED)) {
+ printf( "WARNING: Driver mistake: destroy_dev on %d/%d\n",
+ major(dev), minor(dev));
+ panic("don't do that");
+ return;
+ }
+
+ if (devfs_destroy_hook)
+ devfs_destroy_hook(dev);
+ if (dev->si_flags & SI_CHILD) {
+ LIST_REMOVE(dev, si_siblings);
+ dev->si_flags &= ~SI_CHILD;
+ }
+ while (!LIST_EMPTY(&dev->si_children))
+ destroy_dev(LIST_FIRST(&dev->si_children));
+ dev->si_drv1 = 0;
+ dev->si_drv2 = 0;
+ dev->si_devsw = 0;
+ bzero(&dev->__si_u, sizeof(dev->__si_u));
+ dev->si_flags &= ~SI_NAMED;
+ dev->si_flags &= ~SI_ALIAS;
+ freedev(dev);
+}
+
+const char *
+devtoname(dev_t dev)
+{
+ char *p;
+ int mynor;
+
+ if (dev->si_name[0] == '#' || dev->si_name[0] == '\0') {
+ p = dev->si_name;
+ if (devsw(dev))
+ sprintf(p, "#%s/", devsw(dev)->d_name);
+ else
+ sprintf(p, "#%d/", major(dev));
+ p += strlen(p);
+ mynor = minor(dev);
+ if (mynor < 0 || mynor > 255)
+ sprintf(p, "%#x", (u_int)mynor);
+ else
+ sprintf(p, "%d", mynor);
+ }
+ return (dev->si_name);
+}
+
+int
+dev_stdclone(char *name, char **namep, const char *stem, int *unit)
+{
+ int u, i;
+
+ i = strlen(stem);
+ if (bcmp(stem, name, i) != 0)
+ return (0);
+ if (!isdigit(name[i]))
+ return (0);
+ u = 0;
+ if (name[i] == '0' && isdigit(name[i+1]))
+ return (0);
+ while (isdigit(name[i])) {
+ u *= 10;
+ u += name[i++] - '0';
+ }
+ *unit = u;
+ if (namep)
+ *namep = &name[i];
+ if (name[i])
+ return (2);
+ return (1);
+}
+
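Worked examples of the return convention, using the familiar "tun" stem purely for illustration.

    static void
    example_clone_parse(void)
    {
    	int unit;

    	/* "tun0" against stem "tun": returns 1 and sets unit to 0. */
    	dev_stdclone("tun0", NULL, "tun", &unit);

    	/*
    	 * "tun4.5" would return 2 (trailing text remains) with unit == 4;
    	 * "tun05" would return 0, since a leading zero is rejected.
    	 */
    }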
+/*
+ * Helper sysctl for devname(3). We're given a {u}dev_t and return
+ * the name, if any, registered by the device driver.
+ */
+static int
+sysctl_devname(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ udev_t ud;
+ dev_t dev;
+
+ error = SYSCTL_IN(req, &ud, sizeof (ud));
+ if (error)
+ return (error);
+ if (ud == NOUDEV)
+ return(EINVAL);
+ dev = makedev(umajor(ud), uminor(ud));
+ if (dev->si_name[0] == '\0')
+ error = ENOENT;
+ else
+ error = SYSCTL_OUT(req, dev->si_name, strlen(dev->si_name) + 1);
+ freedev(dev);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY,
+ NULL, 0, sysctl_devname, "", "devname(3) handler");
+
+/*
+ * Set ready_for_devs; prior to this point, device creation is not allowed.
+ */
+static void
+dev_set_ready(void *junk)
+{
+ ready_for_devs = 1;
+}
+
+SYSINIT(dev_ready, SI_SUB_DEVFS, SI_ORDER_FIRST, dev_set_ready, NULL);
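Taken together, the interface above is used by drivers roughly as follows; a minimal sketch, with the cdevsw name, ownership and permissions assumed for illustration:

    static dev_t mydev_dev;             /* hypothetical driver state */

    static int
    mydev_attach(void)
    {
        /* Create /dev node after SI_SUB_DRIVERS, as checked above. */
        mydev_dev = make_dev(&mydev_cdevsw, 0, UID_ROOT, GID_WHEEL,
            0600, "mydev%d", 0);
        return (0);
    }

    static int
    mydev_detach(void)
    {
        destroy_dev(mydev_dev);         /* also reaps SI_CHILD aliases */
        return (0);
    }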
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
new file mode 100644
index 0000000..15837d3
--- /dev/null
+++ b/sys/kern/kern_descrip.c
@@ -0,0 +1,2210 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/conf.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/unistd.h>
+#include <sys/resourcevar.h>
+#include <sys/event.h>
+#include <sys/sx.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+
+#include <machine/limits.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
+static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+
+uma_zone_t file_zone;
+
+static d_open_t fdopen;
+#define NUMFDESC 64
+
+#define CDEV_MAJOR 22
+static struct cdevsw fildesc_cdevsw = {
+ /* open */ fdopen,
+ /* close */ noclose,
+ /* read */ noread,
+ /* write */ nowrite,
+ /* ioctl */ noioctl,
+ /* poll */ nopoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "FD",
+ /* maj */ CDEV_MAJOR,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ 0,
+};
+
+static int do_dup(struct filedesc *fdp, int old, int new, register_t *retval, struct thread *td);
+static int badfo_readwrite(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int badfo_ioctl(struct file *fp, u_long com, caddr_t data,
+ struct thread *td);
+static int badfo_poll(struct file *fp, int events,
+ struct ucred *cred, struct thread *td);
+static int badfo_kqfilter(struct file *fp, struct knote *kn);
+static int badfo_stat(struct file *fp, struct stat *sb, struct thread *td);
+static int badfo_close(struct file *fp, struct thread *td);
+
+/*
+ * Descriptor management.
+ */
+struct filelist filehead; /* head of list of open files */
+int nfiles; /* actual number of open files */
+extern int cmask;
+struct sx filelist_lock; /* sx to protect filelist */
+struct mtx sigio_lock; /* mtx to protect pointers to sigio */
+
+/*
+ * System calls on descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getdtablesize(td, uap)
+ struct thread *td;
+ struct getdtablesize_args *uap;
+{
+ struct proc *p = td->td_proc;
+
+ mtx_lock(&Giant);
+ td->td_retval[0] =
+ min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * note: keep in mind that a potential race condition exists when closing
+ * descriptors from a shared descriptor table (via rfork).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+dup2(td, uap)
+ struct thread *td;
+ struct dup2_args *uap;
+{
+ struct proc *p = td->td_proc;
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ register u_int old = uap->from, new = uap->to;
+ int i, error;
+
+ FILEDESC_LOCK(fdp);
+retry:
+ if (old >= fdp->fd_nfiles ||
+ fdp->fd_ofiles[old] == NULL ||
+ new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
+ new >= maxfilesperproc) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+ if (old == new) {
+ td->td_retval[0] = new;
+ FILEDESC_UNLOCK(fdp);
+ return (0);
+ }
+ if (new >= fdp->fd_nfiles) {
+ if ((error = fdalloc(td, new, &i))) {
+ FILEDESC_UNLOCK(fdp);
+ return (error);
+ }
+ /*
+ * fdalloc() may block, retest everything.
+ */
+ goto retry;
+ }
+ error = do_dup(fdp, (int)old, (int)new, td->td_retval, td);
+ return(error);
+}
+
+/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+dup(td, uap)
+ struct thread *td;
+ struct dup_args *uap;
+{
+ register struct filedesc *fdp;
+ u_int old;
+ int new, error;
+
+ old = uap->fd;
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK(fdp);
+ if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+ if ((error = fdalloc(td, 0, &new))) {
+ FILEDESC_UNLOCK(fdp);
+ return (error);
+ }
+ error = do_dup(fdp, (int)old, new, td->td_retval, td);
+ return (error);
+}
+
+/*
+ * The file control system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ long arg;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+fcntl(td, uap)
+ struct thread *td;
+ register struct fcntl_args *uap;
+{
+ register struct proc *p = td->td_proc;
+ register struct filedesc *fdp;
+ register struct file *fp;
+ register char *pop;
+ struct vnode *vp;
+ int i, tmp, error = 0, flg = F_POSIX;
+ struct flock fl;
+ u_int newmin;
+ struct proc *leaderp;
+
+ mtx_lock(&Giant);
+
+ fdp = p->p_fd;
+ FILEDESC_LOCK(fdp);
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ error = EBADF;
+ goto done2;
+ }
+ pop = &fdp->fd_ofileflags[uap->fd];
+
+ switch (uap->cmd) {
+ case F_DUPFD:
+ newmin = uap->arg;
+ if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
+ newmin >= maxfilesperproc) {
+ FILEDESC_UNLOCK(fdp);
+ error = EINVAL;
+ break;
+ }
+ if ((error = fdalloc(td, newmin, &i))) {
+ FILEDESC_UNLOCK(fdp);
+ break;
+ }
+ error = do_dup(fdp, uap->fd, i, td->td_retval, td);
+ break;
+
+ case F_GETFD:
+ td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ FILEDESC_UNLOCK(fdp);
+ break;
+
+ case F_SETFD:
+ *pop = (*pop &~ UF_EXCLOSE) |
+ (uap->arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+ FILEDESC_UNLOCK(fdp);
+ break;
+
+ case F_GETFL:
+ FILE_LOCK(fp);
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = OFLAGS(fp->f_flag);
+ FILE_UNLOCK(fp);
+ break;
+
+ case F_SETFL:
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ fp->f_flag &= ~FCNTLFLAGS;
+ fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS;
+ tmp = fp->f_flag & FNONBLOCK;
+ error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
+ if (error) {
+ fdrop(fp, td);
+ break;
+ }
+ tmp = fp->f_flag & FASYNC;
+ error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
+ if (!error) {
+ fdrop(fp, td);
+ break;
+ }
+ fp->f_flag &= ~FNONBLOCK;
+ tmp = 0;
+ (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
+ fdrop(fp, td);
+ break;
+
+ case F_GETOWN:
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ error = fo_ioctl(fp, FIOGETOWN, (caddr_t)td->td_retval, td);
+ fdrop(fp, td);
+ break;
+
+ case F_SETOWN:
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, td);
+ fdrop(fp, td);
+ break;
+
+ case F_SETLKW:
+ flg |= F_WAIT;
+ /* Fall into F_SETLK */
+
+ case F_SETLK:
+ if (fp->f_type != DTYPE_VNODE) {
+ FILEDESC_UNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ vp = (struct vnode *)fp->f_data;
+ /*
+ * copyin/lockop may block
+ */
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ vp = (struct vnode *)fp->f_data;
+
+ /* Copy in the lock structure */
+ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
+ sizeof(fl));
+ if (error) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fl.l_whence == SEEK_CUR) {
+ if (fp->f_offset < 0 ||
+ (fl.l_start > 0 &&
+ fp->f_offset > OFF_MAX - fl.l_start)) {
+ fdrop(fp, td);
+ error = EOVERFLOW;
+ break;
+ }
+ fl.l_start += fp->f_offset;
+ }
+
+ switch (fl.l_type) {
+ case F_RDLCK:
+ if ((fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p);
+ p->p_flag |= P_ADVLOCK;
+ leaderp = p->p_leader;
+ PROC_UNLOCK(p);
+ error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK,
+ &fl, flg);
+ break;
+ case F_WRLCK:
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p);
+ p->p_flag |= P_ADVLOCK;
+ leaderp = p->p_leader;
+ PROC_UNLOCK(p);
+ error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK,
+ &fl, flg);
+ break;
+ case F_UNLCK:
+ PROC_LOCK(p);
+ leaderp = p->p_leader;
+ PROC_UNLOCK(p);
+ error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_UNLCK,
+ &fl, F_POSIX);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ fdrop(fp, td);
+ break;
+
+ case F_GETLK:
+ if (fp->f_type != DTYPE_VNODE) {
+ FILEDESC_UNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ vp = (struct vnode *)fp->f_data;
+ /*
+ * copyin/lockop may block
+ */
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ vp = (struct vnode *)fp->f_data;
+
+ /* Copy in the lock structure */
+ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
+ sizeof(fl));
+ if (error) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK &&
+ fl.l_type != F_UNLCK) {
+ fdrop(fp, td);
+ error = EINVAL;
+ break;
+ }
+ if (fl.l_whence == SEEK_CUR) {
+ if ((fl.l_start > 0 &&
+ fp->f_offset > OFF_MAX - fl.l_start) ||
+ (fl.l_start < 0 &&
+ fp->f_offset < OFF_MIN - fl.l_start)) {
+ fdrop(fp, td);
+ error = EOVERFLOW;
+ break;
+ }
+ fl.l_start += fp->f_offset;
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
+ &fl, F_POSIX);
+ fdrop(fp, td);
+ if (error == 0) {
+ error = copyout((caddr_t)&fl,
+ (caddr_t)(intptr_t)uap->arg, sizeof(fl));
+ }
+ break;
+ default:
+ FILEDESC_UNLOCK(fdp);
+ error = EINVAL;
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
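The userland counterpart of the F_SETLK/F_SETLKW path above is an fcntl(2) call with a struct flock; as a hedged illustration, a whole-file POSIX write lock looks like this:

    #include <fcntl.h>
    #include <string.h>

    int
    lock_whole_file(int fd)
    {
        struct flock fl;

        memset(&fl, 0, sizeof(fl));
        fl.l_whence = SEEK_SET;         /* offsets relative to start of file */
        fl.l_start = 0;
        fl.l_len = 0;                   /* zero length: lock to end of file */
        fl.l_type = F_WRLCK;
        return (fcntl(fd, F_SETLKW, &fl));  /* F_SETLKW blocks until granted */
    }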
+
+/*
+ * Common code for dup, dup2, and fcntl(F_DUPFD).
+ * filedesc must be locked, but will be unlocked as a side effect.
+ */
+static int
+do_dup(fdp, old, new, retval, td)
+ register struct filedesc *fdp;
+ register int old, new;
+ register_t *retval;
+ struct thread *td;
+{
+ struct file *fp;
+ struct file *delfp;
+
+ FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ /*
+ * Save info on the descriptor being overwritten. We have
+ * to do the unmap now, but we cannot close it without
+ * introducing an ownership race for the slot.
+ */
+ delfp = fdp->fd_ofiles[new];
+#if 0
+ if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
+ (void) munmapfd(td, new);
+#endif
+
+ /*
+ * Duplicate the source descriptor, update lastfile
+ */
+ fp = fdp->fd_ofiles[old];
+ fdp->fd_ofiles[new] = fp;
+ fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+ fhold(fp);
+ if (new > fdp->fd_lastfile)
+ fdp->fd_lastfile = new;
+ *retval = new;
+
+ FILEDESC_UNLOCK(fdp);
+
+ /*
+ * If we dup'd over a valid file, we now own the reference to it
+ * and must dispose of it using closef() semantics (as if a
+ * close() were performed on it).
+ */
+ if (delfp) {
+ mtx_lock(&Giant);
+ (void) closef(delfp, td);
+ mtx_unlock(&Giant);
+ }
+ return (0);
+}
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ */
+void
+funsetown(sigiop)
+ struct sigio **sigiop;
+{
+ struct sigio *sigio;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ *(sigio->sio_myref) = NULL;
+ if ((sigio)->sio_pgid < 0) {
+ struct pgrp *pg = (sigio)->sio_pgrp;
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else {
+ struct proc *p = (sigio)->sio_proc;
+ PROC_LOCK(p);
+ SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ FREE(sigio, M_SIGIO);
+}
+
+/*
+ * Free a list of sigio structures.
+ * We only need to lock the SIGIO_LOCK because we have made ourselves
+ * inaccessible to callers of fsetown and therefore do not need to lock
+ * the proc or pgrp struct for the list manipulation.
+ */
+void
+funsetownlst(sigiolst)
+ struct sigiolst *sigiolst;
+{
+ struct sigio *sigio;
+ struct proc *p;
+ struct pgrp *pg;
+
+ sigio = SLIST_FIRST(sigiolst);
+ if (sigio == NULL)
+ return;
+
+ p = NULL;
+ pg = NULL;
+
+ /*
+ * Every entry of the list should belong
+ * to a single proc or pgrp.
+ */
+ if (sigio->sio_pgid < 0) {
+ pg = sigio->sio_pgrp;
+ PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
+ } else /* if (sigio->sio_pgid > 0) */ {
+ p = sigio->sio_proc;
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ }
+
+ SIGIO_LOCK();
+ while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
+ *(sigio->sio_myref) = NULL;
+ if (pg != NULL) {
+ KASSERT(sigio->sio_pgid < 0,
+ ("Proc sigio in pgrp sigio list"));
+ KASSERT(sigio->sio_pgrp == pg,
+ ("Bogus pgrp in sigio list"));
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else /* if (p != NULL) */ {
+ KASSERT(sigio->sio_pgid > 0,
+ ("Pgrp sigio in proc sigio list"));
+ KASSERT(sigio->sio_proc == p,
+ ("Bogus proc in sigio list"));
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ FREE(sigio, M_SIGIO);
+ SIGIO_LOCK();
+ }
+ SIGIO_UNLOCK();
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ */
+int
+fsetown(pgid, sigiop)
+ pid_t pgid;
+ struct sigio **sigiop;
+{
+ struct proc *proc;
+ struct pgrp *pgrp;
+ struct sigio *sigio;
+ int ret;
+
+ if (pgid == 0) {
+ funsetown(sigiop);
+ return (0);
+ }
+
+ ret = 0;
+
+ /* Allocate and fill in the new sigio out of locks. */
+ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
+ sigio->sio_pgid = pgid;
+ sigio->sio_ucred = crhold(curthread->td_ucred);
+ sigio->sio_myref = sigiop;
+
+ sx_slock(&proctree_lock);
+ if (pgid > 0) {
+ proc = pfind(pgid);
+ if (proc == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ PROC_UNLOCK(proc);
+ if (proc->p_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ pgrp = NULL;
+ } else /* if (pgid < 0) */ {
+ pgrp = pgfind(-pgid);
+ if (pgrp == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+ PGRP_UNLOCK(pgrp);
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ if (pgrp->pg_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ proc = NULL;
+ }
+ funsetown(sigiop);
+ if (pgid > 0) {
+ PROC_LOCK(proc);
+ /*
+ * since funsetownlst() is called without the proctree
+ * locked we need to check for P_WEXIT.
+ * XXX: is ESRCH correct?
+ */
+ if ((proc->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(proc);
+ ret = ESRCH;
+ goto fail;
+ }
+ SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_proc = proc;
+ PROC_UNLOCK(proc);
+ } else {
+ PGRP_LOCK(pgrp);
+ SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_pgrp = pgrp;
+ PGRP_UNLOCK(pgrp);
+ }
+ sx_sunlock(&proctree_lock);
+ SIGIO_LOCK();
+ *sigiop = sigio;
+ SIGIO_UNLOCK();
+ return (0);
+
+fail:
+ sx_sunlock(&proctree_lock);
+ crfree(sigio->sio_ucred);
+ FREE(sigio, M_SIGIO);
+ return (ret);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(sigio)
+ struct sigio *sigio;
+{
+ return (sigio != NULL ? sigio->sio_pgid : 0);
+}
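A driver wires these helpers into its ioctl routine for FIOSETOWN/FIOGETOWN and calls funsetown() on last close; a minimal sketch, with the softc layout and names assumed:

    /* Hypothetical softc carrying the async-I/O owner. */
    struct example_softc {
        struct sigio *sc_sigio;
    };

    static int
    example_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag,
        struct thread *td)
    {
        struct example_softc *sc = dev->si_drv1;

        switch (cmd) {
        case FIOSETOWN:
            return (fsetown(*(int *)data, &sc->sc_sigio));
        case FIOGETOWN:
            *(int *)data = fgetown(sc->sc_sigio);
            return (0);
        default:
            return (ENOTTY);
        }
    }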
+
+/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+close(td, uap)
+ struct thread *td;
+ struct close_args *uap;
+{
+ register struct filedesc *fdp;
+ register struct file *fp;
+ register int fd = uap->fd;
+ int error = 0;
+
+ mtx_lock(&Giant);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK(fdp);
+ if ((unsigned)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ error = EBADF;
+ goto done2;
+ }
+#if 0
+ if (fdp->fd_ofileflags[fd] & UF_MAPPED)
+ (void) munmapfd(td, fd);
+#endif
+ fdp->fd_ofiles[fd] = NULL;
+ fdp->fd_ofileflags[fd] = 0;
+
+ /*
+ * we now hold the fp reference that used to be owned by the descriptor
+ * array.
+ */
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+ if (fd < fdp->fd_freefile)
+ fdp->fd_freefile = fd;
+ if (fd < fdp->fd_knlistsize) {
+ FILEDESC_UNLOCK(fdp);
+ knote_fdclose(td, fd);
+ } else
+ FILEDESC_UNLOCK(fdp);
+
+ error = closef(fp, td);
+done2:
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ofstat(td, uap)
+ struct thread *td;
+ register struct ofstat_args *uap;
+{
+ struct file *fp;
+ struct stat ub;
+ struct ostat oub;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ goto done2;
+ error = fo_stat(fp, &ub, td);
+ if (error == 0) {
+ cvtstat(&ub, &oub);
+ error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub));
+ }
+ fdrop(fp, td);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+fstat(td, uap)
+ struct thread *td;
+ struct fstat_args *uap;
+{
+ struct file *fp;
+ struct stat ub;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ goto done2;
+ error = fo_stat(fp, &ub, td);
+ if (error == 0)
+ error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub));
+ fdrop(fp, td);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nfstat_args {
+ int fd;
+ struct nstat *sb;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+nfstat(td, uap)
+ struct thread *td;
+ register struct nfstat_args *uap;
+{
+ struct file *fp;
+ struct stat ub;
+ struct nstat nub;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ goto done2;
+ error = fo_stat(fp, &ub, td);
+ if (error == 0) {
+ cvtnstat(&ub, &nub);
+ error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub));
+ }
+ fdrop(fp, td);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Return pathconf information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+fpathconf(td, uap)
+ struct thread *td;
+ register struct fpathconf_args *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+
+ switch (fp->f_type) {
+ case DTYPE_PIPE:
+ case DTYPE_SOCKET:
+ if (uap->name != _PC_PIPE_BUF) {
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = PIPE_BUF;
+ error = 0;
+ }
+ break;
+ case DTYPE_FIFO:
+ case DTYPE_VNODE:
+ vp = (struct vnode *)fp->f_data;
+ mtx_lock(&Giant);
+ error = VOP_PATHCONF(vp, uap->name, td->td_retval);
+ mtx_unlock(&Giant);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ fdrop(fp, td);
+ return(error);
+}
+
+/*
+ * Allocate a file descriptor for the process.
+ */
+static int fdexpand;
+SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
+
+int
+fdalloc(td, want, result)
+ struct thread *td;
+ int want;
+ int *result;
+{
+ struct proc *p = td->td_proc;
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ register int i;
+ int lim, last, nfiles;
+ struct file **newofile, **oldofile;
+ char *newofileflags;
+
+ FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ /*
+ * Search for a free descriptor starting at the higher
+ * of want or fd_freefile. If that fails, consider
+ * expanding the ofile array.
+ */
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ for (;;) {
+ last = min(fdp->fd_nfiles, lim);
+ if ((i = want) < fdp->fd_freefile)
+ i = fdp->fd_freefile;
+ for (; i < last; i++) {
+ if (fdp->fd_ofiles[i] == NULL) {
+ fdp->fd_ofileflags[i] = 0;
+ if (i > fdp->fd_lastfile)
+ fdp->fd_lastfile = i;
+ if (want <= fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ *result = i;
+ return (0);
+ }
+ }
+
+ /*
+ * No space in current array. Expand?
+ */
+ if (fdp->fd_nfiles >= lim)
+ return (EMFILE);
+ if (fdp->fd_nfiles < NDEXTENT)
+ nfiles = NDEXTENT;
+ else
+ nfiles = 2 * fdp->fd_nfiles;
+ FILEDESC_UNLOCK(fdp);
+ mtx_lock(&Giant);
+ MALLOC(newofile, struct file **, nfiles * OFILESIZE,
+ M_FILEDESC, M_WAITOK);
+ mtx_unlock(&Giant);
+ FILEDESC_LOCK(fdp);
+
+ /*
+	 * deal with the file-table extension race that might have occurred
+ * when malloc was blocked.
+ */
+ if (fdp->fd_nfiles >= nfiles) {
+ FILEDESC_UNLOCK(fdp);
+ mtx_lock(&Giant);
+ FREE(newofile, M_FILEDESC);
+ mtx_unlock(&Giant);
+ FILEDESC_LOCK(fdp);
+ continue;
+ }
+ newofileflags = (char *) &newofile[nfiles];
+ /*
+ * Copy the existing ofile and ofileflags arrays
+ * and zero the new portion of each array.
+ */
+ bcopy(fdp->fd_ofiles, newofile,
+ (i = sizeof(struct file *) * fdp->fd_nfiles));
+ bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
+ bcopy(fdp->fd_ofileflags, newofileflags,
+ (i = sizeof(char) * fdp->fd_nfiles));
+ bzero(newofileflags + i, nfiles * sizeof(char) - i);
+ if (fdp->fd_nfiles > NDFILE)
+ oldofile = fdp->fd_ofiles;
+ else
+ oldofile = NULL;
+ fdp->fd_ofiles = newofile;
+ fdp->fd_ofileflags = newofileflags;
+ fdp->fd_nfiles = nfiles;
+ fdexpand++;
+ if (oldofile != NULL) {
+ FILEDESC_UNLOCK(fdp);
+ mtx_lock(&Giant);
+ FREE(oldofile, M_FILEDESC);
+ mtx_unlock(&Giant);
+ FILEDESC_LOCK(fdp);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Check to see whether n user file descriptors
+ * are available to the process p.
+ */
+int
+fdavail(td, n)
+ struct thread *td;
+ register int n;
+{
+ struct proc *p = td->td_proc;
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ register struct file **fpp;
+ register int i, lim, last;
+
+ FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
+ return (1);
+
+ last = min(fdp->fd_nfiles, lim);
+ fpp = &fdp->fd_ofiles[fdp->fd_freefile];
+ for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
+ if (*fpp == NULL && --n <= 0)
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Create a new open file structure and allocate
+ * a file descriptor for the process that refers to it.
+ */
+int
+falloc(td, resultfp, resultfd)
+ register struct thread *td;
+ struct file **resultfp;
+ int *resultfd;
+{
+ struct proc *p = td->td_proc;
+ register struct file *fp, *fq;
+ int error, i;
+
+ sx_xlock(&filelist_lock);
+ if (nfiles >= maxfiles) {
+ sx_xunlock(&filelist_lock);
+ tablefull("file");
+ return (ENFILE);
+ }
+ nfiles++;
+ sx_xunlock(&filelist_lock);
+ /*
+ * Allocate a new file descriptor.
+ * If the process has file descriptor zero open, add to the list
+ * of open files at that point, otherwise put it at the front of
+ * the list of open files.
+ */
+ fp = uma_zalloc(file_zone, M_WAITOK);
+ bzero(fp, sizeof(*fp));
+
+ /*
+ * wait until after malloc (which may have blocked) returns before
+ * allocating the slot, else a race might have shrunk it if we had
+ * allocated it before the malloc.
+ */
+ FILEDESC_LOCK(p->p_fd);
+ if ((error = fdalloc(td, 0, &i))) {
+ FILEDESC_UNLOCK(p->p_fd);
+ sx_xlock(&filelist_lock);
+ nfiles--;
+ sx_xunlock(&filelist_lock);
+ uma_zfree(file_zone, fp);
+ return (error);
+ }
+ fp->f_mtxp = mtx_pool_alloc();
+ fp->f_gcflag = 0;
+ fp->f_count = 1;
+ fp->f_cred = crhold(td->td_ucred);
+ fp->f_ops = &badfileops;
+ fp->f_seqcount = 1;
+ FILEDESC_UNLOCK(p->p_fd);
+ sx_xlock(&filelist_lock);
+ FILEDESC_LOCK(p->p_fd);
+ if ((fq = p->p_fd->fd_ofiles[0])) {
+ LIST_INSERT_AFTER(fq, fp, f_list);
+ } else {
+ LIST_INSERT_HEAD(&filehead, fp, f_list);
+ }
+ p->p_fd->fd_ofiles[i] = fp;
+ FILEDESC_UNLOCK(p->p_fd);
+ sx_xunlock(&filelist_lock);
+ if (resultfp)
+ *resultfp = fp;
+ if (resultfd)
+ *resultfd = i;
+ return (0);
+}
+
+/*
+ * Free a file descriptor.
+ */
+void
+ffree(fp)
+ register struct file *fp;
+{
+
+ KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!"));
+ sx_xlock(&filelist_lock);
+ LIST_REMOVE(fp, f_list);
+ nfiles--;
+ sx_xunlock(&filelist_lock);
+ crfree(fp->f_cred);
+ uma_zfree(file_zone, fp);
+}
+
+/*
+ * Build a new filedesc structure.
+ */
+struct filedesc *
+fdinit(td)
+ struct thread *td;
+{
+ register struct filedesc0 *newfdp;
+ register struct filedesc *fdp = td->td_proc->p_fd;
+
+ MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
+ M_FILEDESC, M_WAITOK | M_ZERO);
+ mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
+ FILEDESC_LOCK(&newfdp->fd_fd);
+ newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+ if (newfdp->fd_fd.fd_cdir)
+ VREF(newfdp->fd_fd.fd_cdir);
+ newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+ if (newfdp->fd_fd.fd_rdir)
+ VREF(newfdp->fd_fd.fd_rdir);
+ newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
+ if (newfdp->fd_fd.fd_jdir)
+ VREF(newfdp->fd_fd.fd_jdir);
+
+ /* Create the file descriptor table. */
+ newfdp->fd_fd.fd_refcnt = 1;
+ newfdp->fd_fd.fd_cmask = cmask;
+ newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+ newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
+ newfdp->fd_fd.fd_nfiles = NDFILE;
+ newfdp->fd_fd.fd_knlistsize = -1;
+ FILEDESC_UNLOCK(&newfdp->fd_fd);
+
+ return (&newfdp->fd_fd);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(p)
+ struct proc *p;
+{
+ FILEDESC_LOCK(p->p_fd);
+ p->p_fd->fd_refcnt++;
+ FILEDESC_UNLOCK(p->p_fd);
+ return (p->p_fd);
+}
+
+/*
+ * Copy a filedesc structure.
+ */
+struct filedesc *
+fdcopy(td)
+ struct thread *td;
+{
+ register struct filedesc *newfdp, *fdp = td->td_proc->p_fd;
+ register struct file **fpp;
+ register int i, j;
+
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return (NULL);
+
+ FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ FILEDESC_UNLOCK(fdp);
+ MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
+ M_FILEDESC, M_WAITOK);
+ FILEDESC_LOCK(fdp);
+ bcopy(fdp, newfdp, sizeof(struct filedesc));
+ FILEDESC_UNLOCK(fdp);
+ bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
+ mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
+ if (newfdp->fd_cdir)
+ VREF(newfdp->fd_cdir);
+ if (newfdp->fd_rdir)
+ VREF(newfdp->fd_rdir);
+ if (newfdp->fd_jdir)
+ VREF(newfdp->fd_jdir);
+ newfdp->fd_refcnt = 1;
+
+ /*
+ * If the number of open files fits in the internal arrays
+ * of the open file structure, use them, otherwise allocate
+ * additional memory for the number of descriptors currently
+ * in use.
+ */
+ FILEDESC_LOCK(fdp);
+ newfdp->fd_lastfile = fdp->fd_lastfile;
+ newfdp->fd_nfiles = fdp->fd_nfiles;
+ if (newfdp->fd_lastfile < NDFILE) {
+ newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
+ newfdp->fd_ofileflags =
+ ((struct filedesc0 *) newfdp)->fd_dfileflags;
+ i = NDFILE;
+ } else {
+ /*
+ * Compute the smallest multiple of NDEXTENT needed
+ * for the file descriptors currently in use,
+ * allowing the table to shrink.
+ */
+retry:
+ i = newfdp->fd_nfiles;
+ while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
+ i /= 2;
+ FILEDESC_UNLOCK(fdp);
+ MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
+ M_FILEDESC, M_WAITOK);
+ FILEDESC_LOCK(fdp);
+ newfdp->fd_lastfile = fdp->fd_lastfile;
+ newfdp->fd_nfiles = fdp->fd_nfiles;
+ j = newfdp->fd_nfiles;
+ while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
+ j /= 2;
+ if (i != j) {
+ /*
+ * The size of the original table has changed.
+ * Go over once again.
+ */
+ FILEDESC_UNLOCK(fdp);
+ FREE(newfdp->fd_ofiles, M_FILEDESC);
+ FILEDESC_LOCK(fdp);
+ newfdp->fd_lastfile = fdp->fd_lastfile;
+ newfdp->fd_nfiles = fdp->fd_nfiles;
+ goto retry;
+ }
+ newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
+ }
+ newfdp->fd_nfiles = i;
+ bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
+ bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
+
+ /*
+ * kq descriptors cannot be copied.
+ */
+ if (newfdp->fd_knlistsize != -1) {
+ fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
+ for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
+ if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
+ *fpp = NULL;
+ if (i < newfdp->fd_freefile)
+ newfdp->fd_freefile = i;
+ }
+ if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
+ newfdp->fd_lastfile--;
+ }
+ newfdp->fd_knlist = NULL;
+ newfdp->fd_knlistsize = -1;
+ newfdp->fd_knhash = NULL;
+ newfdp->fd_knhashmask = 0;
+ }
+
+ fpp = newfdp->fd_ofiles;
+ for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
+ if (*fpp != NULL) {
+ fhold(*fpp);
+ }
+ }
+ return (newfdp);
+}
+
+/*
+ * Release a filedesc structure.
+ */
+void
+fdfree(td)
+ struct thread *td;
+{
+ register struct filedesc *fdp;
+ struct file **fpp;
+ register int i;
+
+ fdp = td->td_proc->p_fd;
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return;
+
+ FILEDESC_LOCK(fdp);
+ if (--fdp->fd_refcnt > 0) {
+ FILEDESC_UNLOCK(fdp);
+ return;
+ }
+ /*
+	 * We are the last reference to the structure, so we can
+ * safely assume it will not change out from under us.
+ */
+ FILEDESC_UNLOCK(fdp);
+ fpp = fdp->fd_ofiles;
+ for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
+ if (*fpp)
+ (void) closef(*fpp, td);
+ }
+
+ PROC_LOCK(td->td_proc);
+ td->td_proc->p_fd = NULL;
+ PROC_UNLOCK(td->td_proc);
+
+ if (fdp->fd_nfiles > NDFILE)
+ FREE(fdp->fd_ofiles, M_FILEDESC);
+ if (fdp->fd_cdir)
+ vrele(fdp->fd_cdir);
+ if (fdp->fd_rdir)
+ vrele(fdp->fd_rdir);
+ if (fdp->fd_jdir)
+ vrele(fdp->fd_jdir);
+ if (fdp->fd_knlist)
+ FREE(fdp->fd_knlist, M_KQUEUE);
+ if (fdp->fd_knhash)
+ FREE(fdp->fd_knhash, M_KQUEUE);
+ mtx_destroy(&fdp->fd_mtx);
+ FREE(fdp, M_FILEDESC);
+}
+
+/*
+ * For setugid programs, we don't want people to use that setugidness
+ * to generate error messages which write to a file that would otherwise
+ * be off-limits to the process.
+ *
+ * This is a gross hack to plug the hole. A better solution would involve
+ * a special vop or other form of generalized access control mechanism. We
+ * go ahead and just reject all procfs filesystem accesses as dangerous.
+ *
+ * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
+ * sufficient. We also don't check for setugidness since we know we are.
+ */
+static int
+is_unsafe(struct file *fp)
+{
+ if (fp->f_type == DTYPE_VNODE &&
+ ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS)
+ return (1);
+ return (0);
+}
+
+/*
+ * Make this setugid thing safe, if at all possible.
+ */
+void
+setugidsafety(td)
+ struct thread *td;
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ register int i;
+
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return;
+
+ /*
+ * note: fdp->fd_ofiles may be reallocated out from under us while
+ * we are blocked in a close. Be careful!
+ */
+ FILEDESC_LOCK(fdp);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ if (i > 2)
+ break;
+ if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
+ struct file *fp;
+
+#if 0
+ if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
+ (void) munmapfd(td, i);
+#endif
+ if (i < fdp->fd_knlistsize) {
+ FILEDESC_UNLOCK(fdp);
+ knote_fdclose(td, i);
+ FILEDESC_LOCK(fdp);
+ }
+ /*
+ * NULL-out descriptor prior to close to avoid
+ * a race while close blocks.
+ */
+ fp = fdp->fd_ofiles[i];
+ fdp->fd_ofiles[i] = NULL;
+ fdp->fd_ofileflags[i] = 0;
+ if (i < fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ FILEDESC_UNLOCK(fdp);
+ (void) closef(fp, td);
+ FILEDESC_LOCK(fdp);
+ }
+ }
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+ FILEDESC_UNLOCK(fdp);
+}
+
+/*
+ * Close any files on exec?
+ */
+void
+fdcloseexec(td)
+ struct thread *td;
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ register int i;
+
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return;
+
+ FILEDESC_LOCK(fdp);
+
+ /*
+ * We cannot cache fd_ofiles or fd_ofileflags since operations
+ * may block and rip them out from under us.
+ */
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ if (fdp->fd_ofiles[i] != NULL &&
+ (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
+ struct file *fp;
+
+#if 0
+ if (fdp->fd_ofileflags[i] & UF_MAPPED)
+ (void) munmapfd(td, i);
+#endif
+ if (i < fdp->fd_knlistsize) {
+ FILEDESC_UNLOCK(fdp);
+ knote_fdclose(td, i);
+ FILEDESC_LOCK(fdp);
+ }
+ /*
+ * NULL-out descriptor prior to close to avoid
+ * a race while close blocks.
+ */
+ fp = fdp->fd_ofiles[i];
+ fdp->fd_ofiles[i] = NULL;
+ fdp->fd_ofileflags[i] = 0;
+ if (i < fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ FILEDESC_UNLOCK(fdp);
+ (void) closef(fp, td);
+ FILEDESC_LOCK(fdp);
+ }
+ }
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+ FILEDESC_UNLOCK(fdp);
+}
+
+/*
+ * It is unsafe for set[ug]id processes to be started with file
+ * descriptors 0..2 closed, as these descriptors are given implicit
+ * significance in the Standard C library. fdcheckstd() will create a
+ * descriptor referencing /dev/null for each of stdin, stdout, and
+ * stderr that is not already open.
+ */
+int
+fdcheckstd(td)
+ struct thread *td;
+{
+ struct nameidata nd;
+ struct filedesc *fdp;
+ struct file *fp;
+ register_t retval;
+ int fd, i, error, flags, devnull;
+
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return (0);
+ devnull = -1;
+ error = 0;
+ for (i = 0; i < 3; i++) {
+ if (fdp->fd_ofiles[i] != NULL)
+ continue;
+ if (devnull < 0) {
+ error = falloc(td, &fp, &fd);
+ if (error != 0)
+ break;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
+ td);
+ flags = FREAD | FWRITE;
+ error = vn_open(&nd, &flags, 0);
+ if (error != 0) {
+ FILEDESC_LOCK(fdp);
+ fdp->fd_ofiles[i] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ break;
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ fp->f_data = (caddr_t)nd.ni_vp;
+ fp->f_flag = flags;
+ fp->f_ops = &vnops;
+ fp->f_type = DTYPE_VNODE;
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ devnull = fd;
+ } else {
+ FILEDESC_LOCK(fdp);
+ error = fdalloc(td, 0, &fd);
+ if (error != 0) {
+ FILEDESC_UNLOCK(fdp);
+ break;
+ }
+ error = do_dup(fdp, devnull, fd, &retval, td);
+ if (error != 0)
+ break;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Internal form of close.
+ * Decrement reference count on file structure.
+ * Note: td may be NULL when closing a file
+ * that was being passed in a message.
+ */
+int
+closef(fp, td)
+ register struct file *fp;
+ register struct thread *td;
+{
+ struct vnode *vp;
+ struct flock lf;
+
+ if (fp == NULL)
+ return (0);
+ /*
+ * POSIX record locking dictates that any close releases ALL
+ * locks owned by this process. This is handled by setting
+ * a flag in the unlock to free ONLY locks obeying POSIX
+ * semantics, and not to free BSD-style file locks.
+ * If the descriptor was in a message, POSIX-style locks
+ * aren't passed with the descriptor.
+ */
+ if (td && (td->td_proc->p_flag & P_ADVLOCK) &&
+ fp->f_type == DTYPE_VNODE) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = (struct vnode *)fp->f_data;
+ (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
+ F_UNLCK, &lf, F_POSIX);
+ }
+ return (fdrop(fp, td));
+}
+
+/*
+ * Drop reference on struct file passed in, may call closef if the
+ * reference hits zero.
+ */
+int
+fdrop(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+
+ FILE_LOCK(fp);
+ return (fdrop_locked(fp, td));
+}
+
+/*
+ * Extract the file pointer associated with the specified descriptor for
+ * the current user process.
+ *
+ * If the descriptor doesn't exist, EBADF is returned.
+ *
+ * If the descriptor exists but doesn't match 'flags' then
+ * return EBADF for read attempts and EINVAL for write attempts.
+ *
+ * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
+ * It should be dropped with fdrop().
+ * If it is not set, then the refcount will not be bumped however the
+ * thread's filedesc struct will be returned locked (for fgetsock).
+ *
+ * If an error occurred, the non-zero error is returned and *fpp is set to NULL.
+ * Otherwise *fpp is set and zero is returned.
+ */
+static __inline
+int
+_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+
+ *fpp = NULL;
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return(EBADF);
+ FILEDESC_LOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
+ FILEDESC_UNLOCK(fdp);
+ return(EBADF);
+ }
+
+ /*
+	 * Note: FREAD failures return EBADF to maintain backwards
+ * compatibility with what routines returned before.
+ *
+ * Only one flag, or 0, may be specified.
+ */
+ if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
+ FILEDESC_UNLOCK(fdp);
+ return(EBADF);
+ }
+ if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
+ FILEDESC_UNLOCK(fdp);
+ return(EINVAL);
+ }
+ if (hold) {
+ fhold(fp);
+ FILEDESC_UNLOCK(fdp);
+ }
+ *fpp = fp;
+ return(0);
+}
+
+int
+fget(struct thread *td, int fd, struct file **fpp)
+{
+ return(_fget(td, fd, fpp, 0, 1));
+}
+
+int
+fget_read(struct thread *td, int fd, struct file **fpp)
+{
+ return(_fget(td, fd, fpp, FREAD, 1));
+}
+
+int
+fget_write(struct thread *td, int fd, struct file **fpp)
+{
+ return(_fget(td, fd, fpp, FWRITE, 1));
+}
+
+/*
+ * Like fget() but loads the underlying vnode, or returns an error if
+ * the descriptor does not represent a vnode. Note that pipes use vnodes
+ * but never have VM objects (so VOP_GETVOBJECT() calls will return an
+ * error). The returned vnode will be vref()d.
+ */
+
+static __inline
+int
+_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
+{
+ struct file *fp;
+ int error;
+
+ *vpp = NULL;
+ if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
+ error = EINVAL;
+ } else {
+ *vpp = (struct vnode *)fp->f_data;
+ vref(*vpp);
+ }
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ return (error);
+}
+
+int
+fgetvp(struct thread *td, int fd, struct vnode **vpp)
+{
+ return(_fgetvp(td, fd, vpp, 0));
+}
+
+int
+fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
+{
+ return(_fgetvp(td, fd, vpp, FREAD));
+}
+
+int
+fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
+{
+ return(_fgetvp(td, fd, vpp, FWRITE));
+}
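The fget()/fdrop() discipline described above looks roughly like this in a consumer; a hypothetical sketch, not taken from this file:

    static int
    example_fsync_like(struct thread *td, int fd)
    {
        struct file *fp;
        struct vnode *vp;
        int error;

        if ((error = fget(td, fd, &fp)) != 0)
            return (error);             /* EBADF: descriptor not open */
        if (fp->f_type != DTYPE_VNODE) {
            fdrop(fp, td);
            return (EINVAL);
        }
        /* fgetvp() and friends package this vnode case; shown long-hand. */
        vp = (struct vnode *)fp->f_data;
        /* ... operate on vp while the file reference is held ... */
        fdrop(fp, td);
        return (error);
    }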
+
+/*
+ * Like fget() but loads the underlying socket, or returns an error if
+ * the descriptor does not represent a socket.
+ *
+ * We bump the ref count on the returned socket. XXX Also obtain the SX lock in
+ * the future.
+ */
+int
+fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
+{
+ struct file *fp;
+ int error;
+
+ *spp = NULL;
+ if (fflagp)
+ *fflagp = 0;
+ if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ error = ENOTSOCK;
+ } else {
+ *spp = (struct socket *)fp->f_data;
+ if (fflagp)
+ *fflagp = fp->f_flag;
+ soref(*spp);
+ }
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ return(error);
+}
+
+/*
+ * Drop the reference count on the socket and XXX release the SX lock in
+ * the future. The last reference closes the socket.
+ */
+void
+fputsock(struct socket *so)
+{
+ sorele(so);
+}
+
+/*
+ * Drop reference on struct file passed in, may call closef if the
+ * reference hits zero.
+ * Expects struct file locked, and will unlock it.
+ */
+int
+fdrop_locked(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+ struct flock lf;
+ struct vnode *vp;
+ int error;
+
+ FILE_LOCK_ASSERT(fp, MA_OWNED);
+
+ if (--fp->f_count > 0) {
+ FILE_UNLOCK(fp);
+ return (0);
+ }
+ mtx_lock(&Giant);
+ if (fp->f_count < 0)
+ panic("fdrop: count < 0");
+ if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = (struct vnode *)fp->f_data;
+ FILE_UNLOCK(fp);
+ (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ } else
+ FILE_UNLOCK(fp);
+ if (fp->f_ops != &badfileops)
+ error = fo_close(fp, td);
+ else
+ error = 0;
+ ffree(fp);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on
+ * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+flock(td, uap)
+ struct thread *td;
+ register struct flock_args *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EOPNOTSUPP);
+ }
+
+ mtx_lock(&Giant);
+ vp = (struct vnode *)fp->f_data;
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (uap->how & LOCK_UN) {
+ lf.l_type = F_UNLCK;
+ FILE_LOCK(fp);
+ fp->f_flag &= ~FHASLOCK;
+ FILE_UNLOCK(fp);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ goto done2;
+ }
+ if (uap->how & LOCK_EX)
+ lf.l_type = F_WRLCK;
+ else if (uap->how & LOCK_SH)
+ lf.l_type = F_RDLCK;
+ else {
+ error = EBADF;
+ goto done2;
+ }
+ FILE_LOCK(fp);
+ fp->f_flag |= FHASLOCK;
+ FILE_UNLOCK(fp);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
+done2:
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * File Descriptor pseudo-device driver (/dev/fd/).
+ *
+ * Opening minor device N dup()s the file (if any) connected to file
+ * descriptor N belonging to the calling process. Note that this driver
+ * consists of only the ``open()'' routine, because all subsequent
+ * references to this file will be direct to the other driver.
+ */
+/* ARGSUSED */
+static int
+fdopen(dev, mode, type, td)
+ dev_t dev;
+ int mode, type;
+ struct thread *td;
+{
+
+ /*
+	 * XXX Kludge: set curthread->td_dupfd to contain the value of
+ * the file descriptor being sought for duplication. The error
+ * return ensures that the vnode for this device will be released
+ * by vn_open. Open will detect this special error and take the
+ * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
+ * will simply report the error.
+ */
+ td->td_dupfd = dev2unit(dev);
+ return (ENODEV);
+}
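Seen from userland, the net effect is that opening /dev/fd/N behaves much like dup(N); an illustrative fragment:

    #include <fcntl.h>

    int
    reopen_stdin(void)
    {
        /*
         * Roughly equivalent to dup(0): fdopen() returns ENODEV and
         * dupfdopen() finishes the open against descriptor 0.
         */
        return (open("/dev/fd/0", O_RDONLY));
    }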
+
+/*
+ * Duplicate the specified descriptor to a free descriptor.
+ */
+int
+dupfdopen(td, fdp, indx, dfd, mode, error)
+ struct thread *td;
+ struct filedesc *fdp;
+ int indx, dfd;
+ int mode;
+ int error;
+{
+ register struct file *wfp;
+ struct file *fp;
+
+ /*
+ * If the to-be-dup'd fd number is greater than the allowed number
+ * of file descriptors, or the fd to be dup'd has already been
+ * closed, then reject.
+ */
+ FILEDESC_LOCK(fdp);
+ if ((u_int)dfd >= fdp->fd_nfiles ||
+ (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+
+ /*
+ * There are two cases of interest here.
+ *
+ * For ENODEV simply dup (dfd) to file descriptor
+ * (indx) and return.
+ *
+ * For ENXIO steal away the file structure from (dfd) and
+ * store it in (indx). (dfd) is effectively closed by
+ * this operation.
+ *
+ * Any other error code is just returned.
+ */
+ switch (error) {
+ case ENODEV:
+ /*
+ * Check that the mode the file is being opened for is a
+ * subset of the mode of the existing descriptor.
+ */
+ FILE_LOCK(wfp);
+ if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+ FILE_UNLOCK(wfp);
+ FILEDESC_UNLOCK(fdp);
+ return (EACCES);
+ }
+ fp = fdp->fd_ofiles[indx];
+#if 0
+ if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
+ (void) munmapfd(td, indx);
+#endif
+ fdp->fd_ofiles[indx] = wfp;
+ fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+ fhold_locked(wfp);
+ FILE_UNLOCK(wfp);
+ if (indx > fdp->fd_lastfile)
+ fdp->fd_lastfile = indx;
+ if (fp != NULL)
+ FILE_LOCK(fp);
+ FILEDESC_UNLOCK(fdp);
+ /*
+ * we now own the reference to fp that the ofiles[] array
+ * used to own. Release it.
+ */
+ if (fp != NULL)
+ fdrop_locked(fp, td);
+ return (0);
+
+ case ENXIO:
+ /*
+ * Steal away the file pointer from dfd, and stuff it into indx.
+ */
+ fp = fdp->fd_ofiles[indx];
+#if 0
+ if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
+ (void) munmapfd(td, indx);
+#endif
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ fdp->fd_ofiles[dfd] = NULL;
+ fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+ fdp->fd_ofileflags[dfd] = 0;
+
+ /*
+ * Complete the clean up of the filedesc structure by
+ * recomputing the various hints.
+ */
+ if (indx > fdp->fd_lastfile) {
+ fdp->fd_lastfile = indx;
+ } else {
+ while (fdp->fd_lastfile > 0 &&
+ fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
+ fdp->fd_lastfile--;
+ }
+ if (dfd < fdp->fd_freefile)
+ fdp->fd_freefile = dfd;
+ }
+ if (fp != NULL)
+ FILE_LOCK(fp);
+ FILEDESC_UNLOCK(fdp);
+
+ /*
+ * we now own the reference to fp that the ofiles[] array
+ * used to own. Release it.
+ */
+ if (fp != NULL)
+ fdrop_locked(fp, td);
+ return (0);
+
+ default:
+ FILEDESC_UNLOCK(fdp);
+ return (error);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Get file structures.
+ */
+static int
+sysctl_kern_file(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct file *fp;
+
+ sx_slock(&filelist_lock);
+ if (!req->oldptr) {
+ /*
+ * overestimate by 10 files
+ */
+ error = SYSCTL_OUT(req, 0, sizeof(filehead) +
+ (nfiles + 10) * sizeof(struct file));
+ sx_sunlock(&filelist_lock);
+ return (error);
+ }
+
+ error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead));
+ if (error) {
+ sx_sunlock(&filelist_lock);
+ return (error);
+ }
+
+ /*
+ * followed by an array of file structures
+ */
+ LIST_FOREACH(fp, &filehead, f_list) {
+ error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file));
+ if (error) {
+ sx_sunlock(&filelist_lock);
+ return (error);
+ }
+ }
+ sx_sunlock(&filelist_lock);
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_file, "S,file", "Entire file table");
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
+ &maxfilesperproc, 0, "Maximum files allowed open per process");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
+ &maxfiles, 0, "Maximum number of files");
+
+SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
+ &nfiles, 0, "System-wide number of open files");
+
+static void
+fildesc_drvinit(void *unused)
+{
+ dev_t dev;
+
+ dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0");
+ make_dev_alias(dev, "stdin");
+ dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1");
+ make_dev_alias(dev, "stdout");
+ dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2");
+ make_dev_alias(dev, "stderr");
+ if (!devfs_present) {
+ int fd;
+
+ for (fd = 3; fd < NUMFDESC; fd++)
+ make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666,
+ "fd/%d", fd);
+ }
+}
+
+struct fileops badfileops = {
+ badfo_readwrite,
+ badfo_readwrite,
+ badfo_ioctl,
+ badfo_poll,
+ badfo_kqfilter,
+ badfo_stat,
+ badfo_close
+};
+
+static int
+badfo_readwrite(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_ioctl(fp, com, data, td)
+ struct file *fp;
+ u_long com;
+ caddr_t data;
+ struct thread *td;
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_poll(fp, events, cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct thread *td;
+{
+
+ return (0);
+}
+
+static int
+badfo_kqfilter(fp, kn)
+ struct file *fp;
+ struct knote *kn;
+{
+
+ return (0);
+}
+
+static int
+badfo_stat(fp, sb, td)
+ struct file *fp;
+ struct stat *sb;
+ struct thread *td;
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_close(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+
+ return (EBADF);
+}
+
+SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR,
+    fildesc_drvinit, NULL)
+
+static void filelistinit(void *);
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
+
+/* ARGSUSED*/
+static void
+filelistinit(dummy)
+ void *dummy;
+{
+ file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+
+ sx_init(&filelist_lock, "filelist lock");
+ mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
+}
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
new file mode 100644
index 0000000..a33b0c7
--- /dev/null
+++ b/sys/kern/kern_environment.c
@@ -0,0 +1,461 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The unified bootloader passes us a pointer to a preserved copy of
+ * bootstrap/kernel environment variables. We convert them to a
+ * dynamic array of strings later when the VM subsystem is up.
+ *
+ * We make these available through the kenv(2) syscall for userland
+ * and through getenv()/freeenv(), setenv(), unsetenv() and testenv() for
+ * the kernel.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/libkern.h>
+#include <sys/kenv.h>
+
+MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
+
+#define KENV_SIZE 512 /* Maximum number of environment strings */
+
+/* pointer to the static environment */
+char *kern_envp;
+static char *kernenv_next(char *);
+
+/* dynamic environment variables */
+char **kenvp;
+struct sx kenv_lock;
+
+/*
+ * No need to protect this with a mutex
+ * since SYSINITs are single-threaded.
+ */
+int dynamic_kenv = 0;
+
+#define KENV_CHECK if (!dynamic_kenv) \
+ panic("%s: called before SI_SUB_KMEM", __func__)
+
+int
+kenv(td, uap)
+ struct thread *td;
+ struct kenv_args /* {
+ syscallarg(int) what;
+ syscallarg(const char *) name;
+ syscallarg(char *) value;
+ syscallarg(int) len;
+ } */ *uap;
+{
+ char *name, *value;
+ size_t len, done;
+ int error, i;
+
+ KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
+
+ error = 0;
+ if (SCARG(uap, what) == KENV_DUMP) {
+ len = 0;
+ /* Return the size if called with a NULL buffer */
+ if (SCARG(uap, value) == NULL) {
+ sx_slock(&kenv_lock);
+ for (i = 0; kenvp[i] != NULL; i++)
+ len += strlen(kenvp[i]) + 1;
+ sx_sunlock(&kenv_lock);
+ td->td_retval[0] = len;
+ return (0);
+ }
+ done = 0;
+ sx_slock(&kenv_lock);
+ for (i = 0; kenvp[i] != NULL && done < SCARG(uap, len); i++) {
+ len = min(strlen(kenvp[i]) + 1, SCARG(uap, len) - done);
+ error = copyout(kenvp[i], SCARG(uap, value) + done,
+ len);
+ if (error) {
+ sx_sunlock(&kenv_lock);
+ return (error);
+ }
+ done += len;
+ }
+ sx_sunlock(&kenv_lock);
+ return (0);
+ }
+
+ if ((SCARG(uap, what) == KENV_SET) ||
+ (SCARG(uap, what) == KENV_UNSET)) {
+ error = suser(td);
+ if (error)
+ return (error);
+ }
+
+ name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
+
+ error = copyinstr(SCARG(uap, name), name, KENV_MNAMELEN, NULL);
+ if (error)
+ goto done;
+
+ switch (SCARG(uap, what)) {
+ case KENV_GET:
+ value = getenv(name);
+ if (value == NULL) {
+ error = ENOENT;
+ goto done;
+ }
+ len = strlen(value) + 1;
+ if (len > SCARG(uap, len))
+ len = SCARG(uap, len);
+ error = copyout(value, SCARG(uap, value), len);
+ freeenv(value);
+ if (error)
+ goto done;
+ td->td_retval[0] = len;
+ break;
+ case KENV_SET:
+ len = SCARG(uap, len);
+ if (len < 1) {
+ error = EINVAL;
+ goto done;
+ }
+ if (len > KENV_MVALLEN)
+ len = KENV_MVALLEN;
+ value = malloc(len, M_TEMP, M_WAITOK);
+ error = copyinstr(SCARG(uap, value), value, len, NULL);
+ if (error) {
+ free(value, M_TEMP);
+ goto done;
+ }
+ setenv(name, value);
+ free(value, M_TEMP);
+ break;
+ case KENV_UNSET:
+ error = unsetenv(name);
+ if (error)
+ error = ENOENT;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ free(name, M_TEMP);
+ return (error);
+}
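A userland consumer of kenv(2) might look roughly like this, assuming the KENV_* constants exported through <kenv.h>:

    #include <kenv.h>
    #include <stdio.h>

    int
    show_bootfile(void)
    {
        char buf[256];

        /* KENV_GET copies at most sizeof(buf) bytes of the value. */
        if (kenv(KENV_GET, "kern.bootfile", buf, sizeof(buf)) == -1)
            return (-1);
        printf("booted from %s\n", buf);
        return (0);
    }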
+
+/*
+ * Set up the dynamic kernel environment.
+ */
+static void
+init_dynamic_kenv(void *data __unused)
+{
+ char *cp;
+ int len, i;
+
+ kenvp = malloc(KENV_SIZE * sizeof(char *), M_KENV, M_WAITOK | M_ZERO);
+ i = 0;
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ len = strlen(cp) + 1;
+ kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+ strcpy(kenvp[i++], cp);
+ }
+ kenvp[i] = NULL;
+
+ sx_init(&kenv_lock, "kernel environment");
+ dynamic_kenv = 1;
+}
+SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
+
+void
+freeenv(char *env)
+{
+
+ if (dynamic_kenv)
+ free(env, M_KENV);
+}
+
+/*
+ * Internal functions for string lookup.
+ */
+static char *
+_getenv_dynamic(const char *name, int *idx)
+{
+ char *cp;
+ int len, i;
+
+ sx_assert(&kenv_lock, SX_LOCKED);
+ len = strlen(name);
+ for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
+ if ((cp[len] == '=') &&
+ (strncmp(cp, name, len) == 0)) {
+ if (idx != NULL)
+ *idx = i;
+ return (cp + len + 1);
+ }
+ }
+ return (NULL);
+}
+
+static char *
+_getenv_static(const char *name)
+{
+ char *cp, *ep;
+ int len;
+
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
+ ;
+ if (*ep != '=')
+ continue;
+ len = ep - cp;
+ ep++;
+ if (!strncmp(name, cp, len) && name[len] == 0)
+ return (ep);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up an environment variable by name.
+ * Return a pointer to the string if found.
+ * The pointer has to be freed with freeenv()
+ * after use.
+ */
+char *
+getenv(const char *name)
+{
+ char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
+ char *ret, *cp;
+ int len;
+
+ if (dynamic_kenv) {
+ sx_slock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ if (cp != NULL) {
+ strcpy(buf, cp);
+ sx_sunlock(&kenv_lock);
+ len = strlen(buf) + 1;
+ ret = malloc(len, M_KENV, M_WAITOK);
+ strcpy(ret, buf);
+ } else {
+ sx_sunlock(&kenv_lock);
+ ret = NULL;
+ }
+ } else
+ ret = _getenv_static(name);
+ return (ret);
+}
+
+/*
+ * Test if an environment variable is defined.
+ */
+int
+testenv(const char *name)
+{
+ char *cp;
+
+ if (dynamic_kenv) {
+ sx_slock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ sx_sunlock(&kenv_lock);
+ } else
+ cp = _getenv_static(name);
+ if (cp != NULL)
+ return (1);
+ return (0);
+}
+
+/*
+ * Set an environment variable by name.
+ */
+int
+setenv(const char *name, const char *value)
+{
+ char *buf, *cp, *oldenv;
+ int namelen, vallen, i;
+
+ KENV_CHECK;
+
+ namelen = strlen(name) + 1;
+ if (namelen > KENV_MNAMELEN)
+ return (-1);
+ vallen = strlen(value) + 1;
+ if (vallen > KENV_MVALLEN)
+ return (-1);
+ buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
+ sprintf(buf, "%s=%s", name, value);
+
+ sx_xlock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ kenvp[i] = buf;
+ sx_xunlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ } else {
+ /* We add the option if it wasn't found */
+ for (i = 0; (cp = kenvp[i]) != NULL; i++)
+ ;
+ kenvp[i] = buf;
+ kenvp[i + 1] = NULL;
+ sx_xunlock(&kenv_lock);
+ }
+ return (0);
+}
+
+/*
+ * Unset an environment variable string.
+ */
+int
+unsetenv(const char *name)
+{
+ char *cp, *oldenv;
+ int i, j;
+
+ KENV_CHECK;
+
+ sx_xlock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ for (j = i + 1; kenvp[j] != NULL; j++)
+ kenvp[i++] = kenvp[j];
+ kenvp[i] = NULL;
+ sx_xunlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ return (0);
+ }
+ sx_xunlock(&kenv_lock);
+ return (-1);
+}
+
+/*
+ * Return a string value from an environment variable.
+ */
+int
+getenv_string(const char *name, char *data, int size)
+{
+ char *tmp;
+
+ tmp = getenv(name);
+ if (tmp != NULL) {
+ strncpy(data, tmp, size);
+ freeenv(tmp);
+ data[size - 1] = 0;
+ return (1);
+ } else
+ return (0);
+}
+
+/*
+ * Return an integer value from an environment variable.
+ */
+int
+getenv_int(const char *name, int *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (int) tmp;
+ return (rval);
+}
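+
+/*
+ * Usage sketch (illustrative only; "hw.foo.debug_level" is a hypothetical
+ * tunable name): callers typically fall back to a default when the
+ * variable is unset or not numeric, e.g.
+ *
+ *	int level;
+ *
+ *	if (!getenv_int("hw.foo.debug_level", &level))
+ *		level = 0;
+ */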
+
+/*
+ * Return a quad_t value from an environment variable.
+ */
+int
+getenv_quad(const char *name, quad_t *data)
+{
+ char *value;
+ char *vtp;
+ quad_t iv;
+
+ value = getenv(name);
+ if (value == NULL)
+ return (0);
+ iv = strtoq(value, &vtp, 0);
+ if ((vtp == value) || (*vtp != '\0')) {
+ freeenv(value);
+ return (0);
+ }
+ freeenv(value);
+ *data = iv;
+ return (1);
+}
+
+/*
+ * Find the next entry after the one which (cp) falls within; return a
+ * pointer to its start or NULL if there are no more.
+ */
+static char *
+kernenv_next(char *cp)
+{
+
+ if (cp != NULL) {
+ while (*cp != 0)
+ cp++;
+ cp++;
+ if (*cp == 0)
+ cp = NULL;
+ }
+ return (cp);
+}
+
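+/*
+ * SYSINIT callbacks used by the TUNABLE_INT()/TUNABLE_QUAD()/TUNABLE_STR()
+ * macros: each one fetches the named kernel environment variable into the
+ * variable supplied by the declaring code.
+ */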
+void
+tunable_int_init(void *data)
+{
+ struct tunable_int *d = (struct tunable_int *)data;
+
+ TUNABLE_INT_FETCH(d->path, d->var);
+}
+
+void
+tunable_quad_init(void *data)
+{
+ struct tunable_quad *d = (struct tunable_quad *)data;
+
+ TUNABLE_QUAD_FETCH(d->path, d->var);
+}
+
+void
+tunable_str_init(void *data)
+{
+ struct tunable_str *d = (struct tunable_str *)data;
+
+ TUNABLE_STR_FETCH(d->path, d->var, d->size);
+}
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
new file mode 100644
index 0000000..46d57c9
--- /dev/null
+++ b/sys/kern/kern_event.c
@@ -0,0 +1,1082 @@
+/*-
+ * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/selinfo.h>
+#include <sys/queue.h>
+#include <sys/event.h>
+#include <sys/eventvar.h>
+#include <sys/poll.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/uio.h>
+
+#include <vm/uma.h>
+
+MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
+
+static int kqueue_scan(struct file *fp, int maxevents,
+ struct kevent *ulistp, const struct timespec *timeout,
+ struct thread *td);
+static int kqueue_read(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int kqueue_write(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
+ struct thread *td);
+static int kqueue_poll(struct file *fp, int events, struct ucred *cred,
+ struct thread *td);
+static int kqueue_kqfilter(struct file *fp, struct knote *kn);
+static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td);
+static int kqueue_close(struct file *fp, struct thread *td);
+static void kqueue_wakeup(struct kqueue *kq);
+
+static struct fileops kqueueops = {
+ kqueue_read,
+ kqueue_write,
+ kqueue_ioctl,
+ kqueue_poll,
+ kqueue_kqfilter,
+ kqueue_stat,
+ kqueue_close
+};
+
+static void knote_attach(struct knote *kn, struct filedesc *fdp);
+static void knote_drop(struct knote *kn, struct thread *td);
+static void knote_enqueue(struct knote *kn);
+static void knote_dequeue(struct knote *kn);
+static void knote_init(void);
+static struct knote *knote_alloc(void);
+static void knote_free(struct knote *kn);
+
+static void filt_kqdetach(struct knote *kn);
+static int filt_kqueue(struct knote *kn, long hint);
+static int filt_procattach(struct knote *kn);
+static void filt_procdetach(struct knote *kn);
+static int filt_proc(struct knote *kn, long hint);
+static int filt_fileattach(struct knote *kn);
+static void filt_timerexpire(void *knx);
+static int filt_timerattach(struct knote *kn);
+static void filt_timerdetach(struct knote *kn);
+static int filt_timer(struct knote *kn, long hint);
+
+static struct filterops file_filtops =
+ { 1, filt_fileattach, NULL, NULL };
+static struct filterops kqread_filtops =
+ { 1, NULL, filt_kqdetach, filt_kqueue };
+static struct filterops proc_filtops =
+ { 0, filt_procattach, filt_procdetach, filt_proc };
+static struct filterops timer_filtops =
+ { 0, filt_timerattach, filt_timerdetach, filt_timer };
+
+static uma_zone_t knote_zone;
+static int kq_ncallouts = 0;
+static int kq_calloutmax = (4 * 1024);
+SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+ &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+
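+/*
+ * Mark a knote active and queue it for delivery unless it is disabled
+ * or already queued.
+ */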
+#define KNOTE_ACTIVATE(kn) do { \
+ kn->kn_status |= KN_ACTIVE; \
+ if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
+ knote_enqueue(kn); \
+} while(0)
+
+#define KN_HASHSIZE 64 /* XXX should be tunable */
+#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
+
+static int
+filt_nullattach(struct knote *kn)
+{
+
+ return (ENXIO);
+}
+
+struct filterops null_filtops =
+ { 0, filt_nullattach, NULL, NULL };
+
+extern struct filterops sig_filtops;
+
+/*
+ * Table for all system-defined filters.
+ */
+static struct filterops *sysfilt_ops[] = {
+ &file_filtops, /* EVFILT_READ */
+ &file_filtops, /* EVFILT_WRITE */
+ &null_filtops, /* EVFILT_AIO */
+ &file_filtops, /* EVFILT_VNODE */
+ &proc_filtops, /* EVFILT_PROC */
+ &sig_filtops, /* EVFILT_SIGNAL */
+ &timer_filtops, /* EVFILT_TIMER */
+ &file_filtops, /* EVFILT_NETDEV */
+};
+
+static int
+filt_fileattach(struct knote *kn)
+{
+
+ return (fo_kqfilter(kn->kn_fp, kn));
+}
+
+/*ARGSUSED*/
+static int
+kqueue_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (1);
+
+ kn->kn_fop = &kqread_filtops;
+ SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
+ return (0);
+}
+
+static void
+filt_kqdetach(struct knote *kn)
+{
+ struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
+
+ SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+}
+
+/*ARGSUSED*/
+static int
+filt_kqueue(struct knote *kn, long hint)
+{
+ struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
+
+ kn->kn_data = kq->kq_count;
+ return (kn->kn_data > 0);
+}
+
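+/*
+ * Attach an EVFILT_PROC knote: look up the target process, check that the
+ * caller is allowed to see it, and hook the knote onto the process's klist.
+ */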
+static int
+filt_procattach(struct knote *kn)
+{
+ struct proc *p;
+ int error;
+
+ p = pfind(kn->kn_id);
+ if (p == NULL)
+ return (ESRCH);
+ if ((error = p_cansee(curthread, p))) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ /*
+ * internal flag indicating registration done by kernel
+ */
+ if (kn->kn_flags & EV_FLAG1) {
+ kn->kn_data = kn->kn_sdata; /* ppid */
+ kn->kn_fflags = NOTE_CHILD;
+ kn->kn_flags &= ~EV_FLAG1;
+ }
+
+ SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+/*
+ * The knote may be attached to a different process, which may exit,
+ * leaving nothing for the knote to be attached to. So when the process
+ * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
+ * it will be deleted when read out. However, as part of the knote deletion,
+ * this routine is called, so a check is needed to avoid actually performing
+ * a detach, because the original process does not exist any more.
+ */
+static void
+filt_procdetach(struct knote *kn)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+
+ if (kn->kn_status & KN_DETACHED)
+ return;
+
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+ PROC_UNLOCK(p);
+}
+
+static int
+filt_proc(struct knote *kn, long hint)
+{
+ u_int event;
+
+ /*
+ * mask off extra data
+ */
+ event = (u_int)hint & NOTE_PCTRLMASK;
+
+ /*
+ * if the user is interested in this event, record it.
+ */
+ if (kn->kn_sfflags & event)
+ kn->kn_fflags |= event;
+
+ /*
+ * process is gone, so flag the event as finished.
+ */
+ if (event == NOTE_EXIT) {
+ kn->kn_status |= KN_DETACHED;
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ return (1);
+ }
+
+ /*
+ * process forked, and user wants to track the new process,
+ * so attach a new knote to it, and immediately report an
+ * event with the parent's pid.
+ */
+ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
+ struct kevent kev;
+ int error;
+
+ /*
+ * register knote with new process.
+ */
+ kev.ident = hint & NOTE_PDATAMASK; /* pid */
+ kev.filter = kn->kn_filter;
+ kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.fflags = kn->kn_sfflags;
+ kev.data = kn->kn_id; /* parent */
+ kev.udata = kn->kn_kevent.udata; /* preserve udata */
+ error = kqueue_register(kn->kn_kq, &kev, NULL);
+ if (error)
+ kn->kn_fflags |= NOTE_TRACKERR;
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
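+/*
+ * Callout handler for EVFILT_TIMER: record the expiration, activate the
+ * knote, and re-arm the callout unless the timer is one-shot.
+ */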
+static void
+filt_timerexpire(void *knx)
+{
+ struct knote *kn = knx;
+ struct callout *calloutp;
+ struct timeval tv;
+ int tticks;
+
+ kn->kn_data++;
+ KNOTE_ACTIVATE(kn);
+
+ if ((kn->kn_flags & EV_ONESHOT) == 0) {
+ tv.tv_sec = kn->kn_sdata / 1000;
+ tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
+ tticks = tvtohz(&tv);
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_reset(calloutp, tticks, filt_timerexpire, kn);
+ }
+}
+
+/*
+ * data contains amount of time to sleep, in milliseconds
+ */
+static int
+filt_timerattach(struct knote *kn)
+{
+ struct callout *calloutp;
+ struct timeval tv;
+ int tticks;
+
+ if (kq_ncallouts >= kq_calloutmax)
+ return (ENOMEM);
+ kq_ncallouts++;
+
+ tv.tv_sec = kn->kn_sdata / 1000;
+ tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
+ tticks = tvtohz(&tv);
+
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+ MALLOC(calloutp, struct callout *, sizeof(*calloutp),
+ M_KQUEUE, M_WAITOK);
+ callout_init(calloutp, 0);
+ callout_reset(calloutp, tticks, filt_timerexpire, kn);
+ kn->kn_hook = calloutp;
+
+ return (0);
+}
+
+static void
+filt_timerdetach(struct knote *kn)
+{
+ struct callout *calloutp;
+
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_stop(calloutp);
+ FREE(calloutp, M_KQUEUE);
+ kq_ncallouts--;
+}
+
+static int
+filt_timer(struct knote *kn, long hint)
+{
+
+ return (kn->kn_data != 0);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kqueue(struct thread *td, struct kqueue_args *uap)
+{
+ struct filedesc *fdp;
+ struct kqueue *kq;
+ struct file *fp;
+ int fd, error;
+
+ mtx_lock(&Giant);
+ fdp = td->td_proc->p_fd;
+ error = falloc(td, &fp, &fd);
+ if (error)
+ goto done2;
+ kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&kq->kq_head);
+ FILE_LOCK(fp);
+ fp->f_flag = FREAD | FWRITE;
+ fp->f_type = DTYPE_KQUEUE;
+ fp->f_ops = &kqueueops;
+ fp->f_data = kq;
+ FILE_UNLOCK(fp);
+ FILEDESC_LOCK(fdp);
+ td->td_retval[0] = fd;
+ if (fdp->fd_knlistsize < 0)
+ fdp->fd_knlistsize = 0; /* this process has a kq */
+ FILEDESC_UNLOCK(fdp);
+ kq->kq_fdp = fdp;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kevent_args {
+ int fd;
+ const struct kevent *changelist;
+ int nchanges;
+ struct kevent *eventlist;
+ int nevents;
+ const struct timespec *timeout;
+};
+#endif
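+/*
+ * Userland usage sketch (not part of the kernel; fd is a hypothetical
+ * descriptor): register a read filter, then wait for a single event.
+ *
+ *	struct kevent ev;
+ *	int kq = kqueue();
+ *
+ *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &ev, 1, NULL, 0, NULL);	(register the change)
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);	(block for one event)
+ */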
+/*
+ * MPSAFE
+ */
+int
+kevent(struct thread *td, struct kevent_args *uap)
+{
+ struct kevent *kevp;
+ struct kqueue *kq;
+ struct file *fp;
+ struct timespec ts;
+ int i, n, nerrors, error;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_KQUEUE) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ if (uap->timeout != NULL) {
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ goto done_nogiant;
+ uap->timeout = &ts;
+ }
+ mtx_lock(&Giant);
+
+ kq = (struct kqueue *)fp->f_data;
+ nerrors = 0;
+
+ while (uap->nchanges > 0) {
+ n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
+ error = copyin(uap->changelist, kq->kq_kev,
+ n * sizeof(struct kevent));
+ if (error)
+ goto done;
+ for (i = 0; i < n; i++) {
+ kevp = &kq->kq_kev[i];
+ kevp->flags &= ~EV_SYSFLAGS;
+ error = kqueue_register(kq, kevp, td);
+ if (error) {
+ if (uap->nevents != 0) {
+ kevp->flags = EV_ERROR;
+ kevp->data = error;
+ (void) copyout(kevp,
+ uap->eventlist,
+ sizeof(*kevp));
+ uap->eventlist++;
+ uap->nevents--;
+ nerrors++;
+ } else {
+ goto done;
+ }
+ }
+ }
+ uap->nchanges -= n;
+ uap->changelist += n;
+ }
+ if (nerrors) {
+ td->td_retval[0] = nerrors;
+ error = 0;
+ goto done;
+ }
+
+ error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td);
+done:
+ mtx_unlock(&Giant);
+done_nogiant:
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kqueue_add_filteropts(int filt, struct filterops *filtops)
+{
+
+ if (filt > 0)
+ panic("filt(%d) > 0", filt);
+ if (filt + EVFILT_SYSCOUNT < 0)
+ panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
+ filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
+ if (sysfilt_ops[~filt] != &null_filtops)
+ panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
+ sysfilt_ops[~filt] = filtops;
+ return (0);
+}
+
+int
+kqueue_del_filteropts(int filt)
+{
+
+ if (filt > 0)
+ panic("filt(%d) > 0", filt);
+ if (filt + EVFILT_SYSCOUNT < 0)
+ panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
+ filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
+ if (sysfilt_ops[~filt] == &null_filtops)
+ panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
+ sysfilt_ops[~filt] = &null_filtops;
+ return (0);
+}
+
+int
+kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
+{
+ struct filedesc *fdp = kq->kq_fdp;
+ struct filterops *fops;
+ struct file *fp = NULL;
+ struct knote *kn = NULL;
+ int s, error = 0;
+
+ if (kev->filter < 0) {
+ if (kev->filter + EVFILT_SYSCOUNT < 0)
+ return (EINVAL);
+ fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
+ } else {
+ /*
+ * XXX
+ * filter attach routine is responsible for ensuring that
+ * the identifier can be attached to it.
+ */
+ printf("unknown filter: %d\n", kev->filter);
+ return (EINVAL);
+ }
+
+ FILEDESC_LOCK(fdp);
+ if (fops->f_isfd) {
+ /* validate descriptor */
+ if ((u_int)kev->ident >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[kev->ident]) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+ fhold(fp);
+
+ if (kev->ident < fdp->fd_knlistsize) {
+ SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
+ if (kq == kn->kn_kq &&
+ kev->filter == kn->kn_filter)
+ break;
+ }
+ } else {
+ if (fdp->fd_knhashmask != 0) {
+ struct klist *list;
+
+ list = &fdp->fd_knhash[
+ KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
+ SLIST_FOREACH(kn, list, kn_link)
+ if (kev->ident == kn->kn_id &&
+ kq == kn->kn_kq &&
+ kev->filter == kn->kn_filter)
+ break;
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+
+ if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+ error = ENOENT;
+ goto done;
+ }
+
+ /*
+ * kn now contains the matching knote, or NULL if no match
+ */
+ if (kev->flags & EV_ADD) {
+
+ if (kn == NULL) {
+ kn = knote_alloc();
+ if (kn == NULL) {
+ error = ENOMEM;
+ goto done;
+ }
+ kn->kn_fp = fp;
+ kn->kn_kq = kq;
+ kn->kn_fop = fops;
+
+ /*
+ * apply reference count to knote structure, and
+ * do not release it at the end of this routine.
+ */
+ fp = NULL;
+
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ kev->fflags = 0;
+ kev->data = 0;
+ kn->kn_kevent = *kev;
+
+ knote_attach(kn, fdp);
+ if ((error = fops->f_attach(kn)) != 0) {
+ knote_drop(kn, td);
+ goto done;
+ }
+ } else {
+ /*
+ * The user may change some filter values after the
+ * initial EV_ADD, but doing so will not reset any
+ * filters that have already been triggered.
+ */
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ kn->kn_kevent.udata = kev->udata;
+ }
+
+ s = splhigh();
+ if (kn->kn_fop->f_event(kn, 0))
+ KNOTE_ACTIVATE(kn);
+ splx(s);
+
+ } else if (kev->flags & EV_DELETE) {
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ goto done;
+ }
+
+ if ((kev->flags & EV_DISABLE) &&
+ ((kn->kn_status & KN_DISABLED) == 0)) {
+ s = splhigh();
+ kn->kn_status |= KN_DISABLED;
+ splx(s);
+ }
+
+ if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
+ s = splhigh();
+ kn->kn_status &= ~KN_DISABLED;
+ if ((kn->kn_status & KN_ACTIVE) &&
+ ((kn->kn_status & KN_QUEUED) == 0))
+ knote_enqueue(kn);
+ splx(s);
+ }
+
+done:
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
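+/*
+ * Harvest up to maxevents triggered events from the kqueue into the
+ * user-supplied event list, sleeping if necessary subject to the
+ * supplied timeout.
+ */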
+static int
+kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
+ const struct timespec *tsp, struct thread *td)
+{
+ struct kqueue *kq;
+ struct kevent *kevp;
+ struct timeval atv, rtv, ttv;
+ struct knote *kn, marker;
+ int s, count, timeout, nkev = 0, error = 0;
+
+ FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
+
+ kq = (struct kqueue *)fp->f_data;
+ count = maxevents;
+ if (count == 0)
+ goto done;
+
+ if (tsp != NULL) {
+ TIMESPEC_TO_TIMEVAL(&atv, tsp);
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done;
+ }
+ if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
+ timeout = -1;
+ else
+ timeout = atv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&atv);
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else {
+ atv.tv_sec = 0;
+ atv.tv_usec = 0;
+ timeout = 0;
+ }
+ goto start;
+
+retry:
+ if (atv.tv_sec || atv.tv_usec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timeout = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+
+start:
+ kevp = kq->kq_kev;
+ s = splhigh();
+ if (kq->kq_count == 0) {
+ if (timeout < 0) {
+ error = EWOULDBLOCK;
+ } else {
+ kq->kq_state |= KQ_SLEEP;
+ error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
+ }
+ splx(s);
+ if (error == 0)
+ goto retry;
+ /* don't restart after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ else if (error == EWOULDBLOCK)
+ error = 0;
+ goto done;
+ }
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
+ while (count) {
+ kn = TAILQ_FIRST(&kq->kq_head);
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ if (kn == &marker) {
+ splx(s);
+ if (count == maxevents)
+ goto retry;
+ goto done;
+ }
+ if (kn->kn_status & KN_DISABLED) {
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+ continue;
+ }
+ if ((kn->kn_flags & EV_ONESHOT) == 0 &&
+ kn->kn_fop->f_event(kn, 0) == 0) {
+ kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ kq->kq_count--;
+ continue;
+ }
+ *kevp = kn->kn_kevent;
+ kevp++;
+ nkev++;
+ if (kn->kn_flags & EV_ONESHOT) {
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+ splx(s);
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ s = splhigh();
+ } else if (kn->kn_flags & EV_CLEAR) {
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ kq->kq_count--;
+ } else {
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ }
+ count--;
+ if (nkev == KQ_NEVENTS) {
+ splx(s);
+ error = copyout(&kq->kq_kev, ulistp,
+ sizeof(struct kevent) * nkev);
+ ulistp += nkev;
+ nkev = 0;
+ kevp = kq->kq_kev;
+ s = splhigh();
+ if (error)
+ break;
+ }
+ }
+ TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
+ splx(s);
+done:
+ if (nkev != 0)
+ error = copyout(&kq->kq_kev, ulistp,
+ sizeof(struct kevent) * nkev);
+ td->td_retval[0] = maxevents - count;
+ return (error);
+}
+
+/*
+ * XXX
+ * This could be expanded to call kqueue_scan, if desired.
+ */
+/*ARGSUSED*/
+static int
+kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td)
+{
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+ struct kqueue *kq;
+ int revents = 0;
+ int s = splnet();
+
+ kq = (struct kqueue *)fp->f_data;
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (kq->kq_count) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ selrecord(td, &kq->kq_sel);
+ kq->kq_state |= KQ_SEL;
+ }
+ }
+ splx(s);
+ return (revents);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_stat(struct file *fp, struct stat *st, struct thread *td)
+{
+ struct kqueue *kq;
+
+ kq = (struct kqueue *)fp->f_data;
+ bzero((void *)st, sizeof(*st));
+ st->st_size = kq->kq_count;
+ st->st_blksize = sizeof(struct kevent);
+ st->st_mode = S_IFIFO;
+ return (0);
+}
+
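+/*
+ * Close a kqueue descriptor: detach and free every knote that points at
+ * this kqueue, then release the kqueue itself.
+ */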
+/*ARGSUSED*/
+static int
+kqueue_close(struct file *fp, struct thread *td)
+{
+ struct kqueue *kq = (struct kqueue *)fp->f_data;
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct knote **knp, *kn, *kn0;
+ int i;
+
+ FILEDESC_LOCK(fdp);
+ for (i = 0; i < fdp->fd_knlistsize; i++) {
+ knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
+ kn = *knp;
+ while (kn != NULL) {
+ kn0 = SLIST_NEXT(kn, kn_link);
+ if (kq == kn->kn_kq) {
+ kn->kn_fop->f_detach(kn);
+ *knp = kn0;
+ FILE_LOCK(kn->kn_fp);
+ FILEDESC_UNLOCK(fdp);
+ fdrop_locked(kn->kn_fp, td);
+ knote_free(kn);
+ FILEDESC_LOCK(fdp);
+ } else {
+ knp = &SLIST_NEXT(kn, kn_link);
+ }
+ kn = kn0;
+ }
+ }
+ if (fdp->fd_knhashmask != 0) {
+ for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
+ knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
+ kn = *knp;
+ while (kn != NULL) {
+ kn0 = SLIST_NEXT(kn, kn_link);
+ if (kq == kn->kn_kq) {
+ kn->kn_fop->f_detach(kn);
+ *knp = kn0;
+ /* XXX non-fd release of kn->kn_ptr */
+ FILEDESC_UNLOCK(fdp);
+ knote_free(kn);
+ FILEDESC_LOCK(fdp);
+ } else {
+ knp = &SLIST_NEXT(kn, kn_link);
+ }
+ kn = kn0;
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ free(kq, M_KQUEUE);
+ fp->f_data = NULL;
+
+ return (0);
+}
+
+static void
+kqueue_wakeup(struct kqueue *kq)
+{
+
+ if (kq->kq_state & KQ_SLEEP) {
+ kq->kq_state &= ~KQ_SLEEP;
+ wakeup(kq);
+ }
+ if (kq->kq_state & KQ_SEL) {
+ kq->kq_state &= ~KQ_SEL;
+ selwakeup(&kq->kq_sel);
+ }
+ KNOTE(&kq->kq_sel.si_note, 0);
+}
+
+/*
+ * walk down a list of knotes, activating them if their event has triggered.
+ */
+void
+knote(struct klist *list, long hint)
+{
+ struct knote *kn;
+
+ SLIST_FOREACH(kn, list, kn_selnext)
+ if (kn->kn_fop->f_event(kn, hint))
+ KNOTE_ACTIVATE(kn);
+}
+
+/*
+ * remove all knotes from a specified klist
+ */
+void
+knote_remove(struct thread *td, struct klist *list)
+{
+ struct knote *kn;
+
+ while ((kn = SLIST_FIRST(list)) != NULL) {
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ }
+}
+
+/*
+ * remove all knotes referencing a specified fd
+ */
+void
+knote_fdclose(struct thread *td, int fd)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct klist *list;
+
+ FILEDESC_LOCK(fdp);
+ list = &fdp->fd_knlist[fd];
+ FILEDESC_UNLOCK(fdp);
+ knote_remove(td, list);
+}
+
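+/*
+ * Link a knote into the owning file descriptor table, growing the per-fd
+ * knote list (or creating the hash table for non-fd knotes) as needed.
+ */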
+static void
+knote_attach(struct knote *kn, struct filedesc *fdp)
+{
+ struct klist *list, *oldlist;
+ int size, newsize;
+
+ FILEDESC_LOCK(fdp);
+
+ if (! kn->kn_fop->f_isfd) {
+ if (fdp->fd_knhashmask == 0)
+ fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
+ &fdp->fd_knhashmask);
+ list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+ goto done;
+ }
+
+ if (fdp->fd_knlistsize <= kn->kn_id) {
+retry:
+ size = fdp->fd_knlistsize;
+ while (size <= kn->kn_id)
+ size += KQEXTENT;
+ FILEDESC_UNLOCK(fdp);
+ MALLOC(list, struct klist *,
+ size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
+ FILEDESC_LOCK(fdp);
+ newsize = fdp->fd_knlistsize;
+ while (newsize <= kn->kn_id)
+ newsize += KQEXTENT;
+ if (newsize != size) {
+ FILEDESC_UNLOCK(fdp);
+ free(list, M_KQUEUE);	/* matches the MALLOC type above */
+ FILEDESC_LOCK(fdp);
+ goto retry;
+ }
+ bcopy(fdp->fd_knlist, list,
+ fdp->fd_knlistsize * sizeof(struct klist *));
+ bzero((caddr_t)list +
+ fdp->fd_knlistsize * sizeof(struct klist *),
+ (size - fdp->fd_knlistsize) * sizeof(struct klist *));
+ if (fdp->fd_knlist != NULL)
+ oldlist = fdp->fd_knlist;
+ else
+ oldlist = NULL;
+ fdp->fd_knlistsize = size;
+ fdp->fd_knlist = list;
+ FILEDESC_UNLOCK(fdp);
+ if (oldlist != NULL)
+ FREE(oldlist, M_KQUEUE);
+ FILEDESC_LOCK(fdp);
+ }
+ list = &fdp->fd_knlist[kn->kn_id];
+done:
+ FILEDESC_UNLOCK(fdp);
+ SLIST_INSERT_HEAD(list, kn, kn_link);
+ kn->kn_status = 0;
+}
+
+/*
+ * should be called at spl == 0, since we don't want to hold spl
+ * while calling fdrop and free.
+ */
+static void
+knote_drop(struct knote *kn, struct thread *td)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct klist *list;
+
+ FILEDESC_LOCK(fdp);
+ if (kn->kn_fop->f_isfd)
+ list = &fdp->fd_knlist[kn->kn_id];
+ else
+ list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+ if (kn->kn_fop->f_isfd)
+ FILE_LOCK(kn->kn_fp);
+ FILEDESC_UNLOCK(fdp);
+
+ SLIST_REMOVE(list, kn, knote, kn_link);
+ if (kn->kn_status & KN_QUEUED)
+ knote_dequeue(kn);
+ if (kn->kn_fop->f_isfd)
+ fdrop_locked(kn->kn_fp, td);
+ knote_free(kn);
+}
+
+
+static void
+knote_enqueue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+ int s = splhigh();
+
+ KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status |= KN_QUEUED;
+ kq->kq_count++;
+ splx(s);
+ kqueue_wakeup(kq);
+}
+
+static void
+knote_dequeue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+ int s = splhigh();
+
+ KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+ splx(s);
+}
+
+static void
+knote_init(void)
+{
+ knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+
+}
+SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
+
+static struct knote *
+knote_alloc(void)
+{
+ return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK));
+}
+
+static void
+knote_free(struct knote *kn)
+{
+ uma_zfree(knote_zone, kn);
+}
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
new file mode 100644
index 0000000..bc773df
--- /dev/null
+++ b/sys/kern/kern_exec.c
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/namei.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+
+#include <machine/reg.h>
+
+MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+
+static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");
+
+/*
+ * callout list for things to do at exec time
+ */
+struct execlist {
+ execlist_fn function;
+ TAILQ_ENTRY(execlist) next;
+};
+
+TAILQ_HEAD(exec_list_head, execlist);
+static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
+
+static register_t *exec_copyout_strings(struct image_params *);
+
+/* XXX This should be vm_size_t. */
+static u_long ps_strings = PS_STRINGS;
+SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
+
+/* XXX This should be vm_size_t. */
+static u_long usrstack = USRSTACK;
+SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
+
+u_long ps_arg_cache_limit = PAGE_SIZE / 16;
+SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
+ &ps_arg_cache_limit, 0, "");
+
+int ps_argsopen = 1;
+SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
+
+#ifdef __ia64__
+/* XXX HACK */
+static int regstkpages = 256;
+SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
+#endif
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+/*
+ * execve() system call.
+ *
+ * MPSAFE
+ */
+int
+execve(td, uap)
+ struct thread *td;
+ register struct execve_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct nameidata nd, *ndp;
+ struct ucred *newcred = NULL, *oldcred;
+ struct uidinfo *euip;
+ register_t *stack_base;
+ int error, len, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+ int (*img_first)(struct image_params *);
+ struct pargs *oldargs = NULL, *newargs = NULL;
+ struct procsig *oldprocsig, *newprocsig;
+#ifdef KTRACE
+ struct vnode *tracevp = NULL;
+#endif
+ struct vnode *textvp = NULL;
+
+ imgp = &image_params;
+
+ /*
+ * Lock the process and set the P_INEXEC flag to indicate that
+ * it should be left alone until we're done here. This is
+ * necessary to avoid race conditions - e.g. in ptrace() -
+ * that might allow a local user to illicitly obtain elevated
+ * privileges.
+ */
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ KASSERT((p->p_flag & P_INEXEC) == 0,
+ ("%s(): process already has P_INEXEC flag", __func__));
+ p->p_flag |= P_INEXEC;
+ PROC_UNLOCK(p);
+
+/* XXXKSE */
+/* !!!!!!!! we need to abort all the other threads of this process before we */
+/* proceed beyond this point! */
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->uap = uap;
+ imgp->attr = &attr;
+ imgp->argc = imgp->envc = 0;
+ imgp->argv0 = NULL;
+ imgp->entry_addr = 0;
+ imgp->vmspace_destroyed = 0;
+ imgp->interpreted = 0;
+ imgp->interpreter_name[0] = '\0';
+ imgp->auxargs = NULL;
+ imgp->vp = NULL;
+ imgp->firstpage = NULL;
+ imgp->ps_strings = 0;
+ imgp->auxarg_size = 0;
+
+ /*
+ * Allocate temporary demand zeroed space for argument and
+ * environment strings
+ */
+ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE);
+ if (imgp->stringbase == NULL) {
+ error = ENOMEM;
+ goto exec_fail;
+ }
+ imgp->stringp = imgp->stringbase;
+ imgp->stringspace = ARG_MAX;
+ imgp->image_header = imgp->stringbase + ARG_MAX;
+
+ /*
+ * Translate the file name. namei() returns a vnode pointer
+ * in ni_vp among other things.
+ */
+ ndp = &nd;
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_USERSPACE, uap->fname, td);
+
+interpret:
+
+ error = namei(ndp);
+ if (error) {
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
+ ARG_MAX + PAGE_SIZE);
+ goto exec_fail;
+ }
+
+ imgp->vp = ndp->ni_vp;
+ imgp->fname = uap->fname;
+
+ /*
+ * Check file permissions (also 'opens' file)
+ */
+ error = exec_check_permissions(imgp);
+ if (error) {
+ VOP_UNLOCK(imgp->vp, 0, td);
+ goto exec_fail_dealloc;
+ }
+
+ error = exec_map_first_page(imgp);
+ VOP_UNLOCK(imgp->vp, 0, td);
+ if (error)
+ goto exec_fail_dealloc;
+
+ /*
+ * If the current process has a special image activator it
+ * wants to try first, call it. For example, emulating shell
+ * scripts differently.
+ */
+ error = -1;
+ if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
+ error = img_first(imgp);
+
+ /*
+ * Loop through the list of image activators, calling each one.
+ * An activator returns -1 if there is no match, 0 on success,
+ * and an error otherwise.
+ */
+ for (i = 0; error == -1 && execsw[i]; ++i) {
+ if (execsw[i]->ex_imgact == NULL ||
+ execsw[i]->ex_imgact == img_first) {
+ continue;
+ }
+ error = (*execsw[i]->ex_imgact)(imgp);
+ }
+
+ if (error) {
+ if (error == -1)
+ error = ENOEXEC;
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Special interpreter operation: clean up and loop back up to try to
+ * activate the interpreter.
+ */
+ if (imgp->interpreted) {
+ exec_unmap_first_page(imgp);
+ /* free name buffer and old vnode */
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vrele(ndp->ni_vp);
+ /* set new name to that of the interpreter */
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_SYSSPACE, imgp->interpreter_name, td);
+ goto interpret;
+ }
+
+ /*
+ * Copy out strings (args and env) and initialize stack base
+ */
+ stack_base = exec_copyout_strings(imgp);
+
+ /*
+ * If a custom stack fixup routine is present for this process,
+ * let it do the stack setup.
+ * Otherwise stuff the argument count as the first item on the stack.
+ */
+ if (p->p_sysent->sv_fixup)
+ (*p->p_sysent->sv_fixup)(&stack_base, imgp);
+ else
+ suword(--stack_base, imgp->argc);
+
+ /*
+ * For security and other reasons, the file descriptor table cannot
+ * be shared after an exec.
+ */
+ FILEDESC_LOCK(p->p_fd);
+ if (p->p_fd->fd_refcnt > 1) {
+ struct filedesc *tmp;
+
+ tmp = fdcopy(td);
+ FILEDESC_UNLOCK(p->p_fd);
+ fdfree(td);
+ p->p_fd = tmp;
+ } else
+ FILEDESC_UNLOCK(p->p_fd);
+
+ /*
+ * Malloc things before we need locks.
+ */
+ newcred = crget();
+ euip = uifind(attr.va_uid);
+ i = imgp->endargs - imgp->stringbase;
+ if (ps_arg_cache_limit >= i + sizeof(struct pargs))
+ newargs = pargs_alloc(i);
+
+ /* close files on exec */
+ fdcloseexec(td);
+
+ /*
+ * For security and other reasons, signal handlers cannot
+ * be shared after an exec. The new process gets a copy of the old
+ * handlers. In execsigs(), the new process will have its signals
+ * reset.
+ */
+ PROC_LOCK(p);
+ mp_fixme("procsig needs a lock");
+ if (p->p_procsig->ps_refcnt > 1) {
+ oldprocsig = p->p_procsig;
+ PROC_UNLOCK(p);
+ MALLOC(newprocsig, struct procsig *, sizeof(struct procsig),
+ M_SUBPROC, M_WAITOK);
+ bcopy(oldprocsig, newprocsig, sizeof(*newprocsig));
+ newprocsig->ps_refcnt = 1;
+ oldprocsig->ps_refcnt--;
+ PROC_LOCK(p);
+ p->p_procsig = newprocsig;
+ if (p->p_sigacts == &p->p_uarea->u_sigacts)
+ panic("shared procsig but private sigacts?");
+
+ p->p_uarea->u_sigacts = *p->p_sigacts;
+ p->p_sigacts = &p->p_uarea->u_sigacts;
+ }
+ /* Stop profiling */
+ stopprofclock(p);
+
+ /* reset caught signals */
+ execsigs(p);
+
+ /* name this process - nameiexec(p, ndp) */
+ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
+ bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
+ p->p_comm[len] = 0;
+
+ /*
+ * Mark as execed, wake up the process that vforked (if any) and tell
+ * it that it now has its own resources back.
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~P_PPWAIT;
+ wakeup((caddr_t)p->p_pptr);
+ }
+
+ /*
+ * Implement image setuid/setgid.
+ *
+ * Don't honor setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ */
+ oldcred = p->p_ucred;
+ if ((((attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid) ||
+ ((attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid)) &&
+ (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+ (p->p_flag & P_TRACED) == 0) {
+ /*
+ * Turn off syscall tracing for set-id programs, except for
+ * root. Record any set-id flags first to make sure that
+ * we do not regain any tracing during a possible block.
+ */
+ setsugid(p);
+#ifdef KTRACE
+ if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) {
+ mtx_lock(&ktrace_mtx);
+ p->p_traceflag = 0;
+ tracevp = p->p_tracep;
+ p->p_tracep = NULL;
+ mtx_unlock(&ktrace_mtx);
+ }
+#endif
+ /* Make sure file descriptors 0..2 are in use. */
+ error = fdcheckstd(td);
+ if (error != 0) {
+ oldcred = NULL;
+ goto done1;
+ }
+ /*
+ * Set the new credentials.
+ */
+ crcopy(newcred, oldcred);
+ if (attr.va_mode & VSUID)
+ change_euid(newcred, euip);
+ if (attr.va_mode & VSGID)
+ change_egid(newcred, attr.va_gid);
+ setugidsafety(td);
+ /*
+ * Implement correct POSIX saved-id behavior.
+ */
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ } else {
+ if (oldcred->cr_uid == oldcred->cr_ruid &&
+ oldcred->cr_gid == oldcred->cr_rgid)
+ p->p_flag &= ~P_SUGID;
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXX: It's not clear that the existing behavior is
+ * POSIX-compliant. A number of sources indicate that the
+ * saved uid/gid should only be updated if the new ruid is
+ * not equal to the old ruid, or the new euid is not equal
+ * to the old euid and the new euid is not equal to the old
+ * ruid. The FreeBSD code always updates the saved uid/gid.
+ * Also, this code uses the new (replaced) euid and egid as
+ * the source, which may or may not be the right ones to use.
+ */
+ if (oldcred->cr_svuid != oldcred->cr_uid ||
+ oldcred->cr_svgid != oldcred->cr_gid) {
+ crcopy(newcred, oldcred);
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ }
+ }
+
+ /*
+ * Store the vp for use in procfs
+ */
+ textvp = p->p_textvp;
+ VREF(ndp->ni_vp);
+ p->p_textvp = ndp->ni_vp;
+
+ /*
+ * Notify others that we exec'd, and clear the P_INEXEC flag
+ * as we're now a bona fide freshly-execed process.
+ */
+ KNOTE(&p->p_klist, NOTE_EXEC);
+ p->p_flag &= ~P_INEXEC;
+
+ /*
+ * If tracing the process, trap to debugger so breakpoints
+ * can be set before the program executes.
+ */
+ _STOPEVENT(p, S_EXEC, 0);
+
+ if (p->p_flag & P_TRACED)
+ psignal(p, SIGTRAP);
+
+ /* clear "fork but no exec" flag, as we _are_ execing */
+ p->p_acflag &= ~AFORK;
+
+ /* Free any previous argument cache */
+ oldargs = p->p_args;
+ p->p_args = NULL;
+
+ /* Set values passed into the program in registers. */
+ setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base,
+ imgp->ps_strings);
+
+ /* Cache arguments if they fit inside our allowance */
+ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
+ bcopy(imgp->stringbase, newargs->ar_args, i);
+ p->p_args = newargs;
+ newargs = NULL;
+ }
+done1:
+ PROC_UNLOCK(p);
+
+ /*
+ * Free any resources malloc'd earlier that we didn't use.
+ */
+ uifree(euip);
+ if (newcred == NULL)
+ crfree(oldcred);
+ else
+ crfree(newcred);
+ /*
+ * Handle deferred decrement of ref counts.
+ */
+ if (textvp != NULL)
+ vrele(textvp);
+#ifdef KTRACE
+ if (tracevp != NULL)
+ vrele(tracevp);
+#endif
+ if (oldargs != NULL)
+ pargs_drop(oldargs);
+ if (newargs != NULL)
+ pargs_drop(newargs);
+
+exec_fail_dealloc:
+
+ /*
+ * free various allocated resources
+ */
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+
+ if (imgp->stringbase != NULL)
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
+ ARG_MAX + PAGE_SIZE);
+
+ if (imgp->vp) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vrele(imgp->vp);
+ }
+
+ if (error == 0)
+ goto done2;
+
+exec_fail:
+ /* we're done here, clear P_INEXEC */
+ PROC_LOCK(p);
+ p->p_flag &= ~P_INEXEC;
+ PROC_UNLOCK(p);
+
+ if (imgp->vmspace_destroyed) {
+ /* Sorry, no process image remains to return to; exit gracefully. */
+ exit1(td, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ error = 0;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
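+/*
+ * Map the first page of the executable into the kernel address space so
+ * that image activators can examine the image header.
+ */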
+int
+exec_map_first_page(imgp)
+ struct image_params *imgp;
+{
+ int rv, i;
+ int initial_pagein;
+ vm_page_t ma[VM_INITIAL_PAGEIN];
+ vm_object_t object;
+
+ GIANT_REQUIRED;
+
+ if (imgp->firstpage) {
+ exec_unmap_first_page(imgp);
+ }
+
+ VOP_GETVOBJECT(imgp->vp, &object);
+
+ ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+
+ if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
+ initial_pagein = VM_INITIAL_PAGEIN;
+ if (initial_pagein > object->size)
+ initial_pagein = object->size;
+ for (i = 1; i < initial_pagein; i++) {
+ if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
+ if ((ma[i]->flags & PG_BUSY) || ma[i]->busy)
+ break;
+ if (ma[i]->valid)
+ break;
+ vm_page_busy(ma[i]);
+ } else {
+ ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL);
+ if (ma[i] == NULL)
+ break;
+ }
+ }
+ initial_pagein = i;
+
+ rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
+ ma[0] = vm_page_lookup(object, 0);
+
+ if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) {
+ if (ma[0]) {
+ vm_page_protect(ma[0], VM_PROT_NONE);
+ vm_page_free(ma[0]);
+ }
+ return EIO;
+ }
+ }
+
+ vm_page_wire(ma[0]);
+ vm_page_wakeup(ma[0]);
+
+ pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
+ imgp->firstpage = ma[0];
+
+ return 0;
+}
+
+void
+exec_unmap_first_page(imgp)
+ struct image_params *imgp;
+{
+ GIANT_REQUIRED;
+
+ if (imgp->firstpage) {
+ pmap_qremove((vm_offset_t)imgp->image_header, 1);
+ vm_page_unwire(imgp->firstpage, 1);
+ imgp->firstpage = NULL;
+ }
+}
+
+/*
+ * Destroy the old address space and allocate a new stack.
+ * The new stack is only SGROWSIZ large because it is grown
+ * automatically in trap.c.
+ */
+int
+exec_new_vmspace(imgp)
+ struct image_params *imgp;
+{
+ int error;
+ struct execlist *ep;
+ struct proc *p = imgp->proc;
+ struct vmspace *vmspace = p->p_vmspace;
+ vm_offset_t stack_addr = USRSTACK - maxssiz;
+
+ GIANT_REQUIRED;
+
+ imgp->vmspace_destroyed = 1;
+
+ /*
+ * Perform functions registered with at_exec().
+ */
+ TAILQ_FOREACH(ep, &exec_list, next)
+ (*ep->function)(p);
+
+ /*
+ * Blow away entire process VM, if address space not shared,
+ * otherwise, create a new VM space so that other threads are
+ * not disrupted
+ */
+ if (vmspace->vm_refcnt == 1) {
+ if (vmspace->vm_shm)
+ shmexit(p);
+ pmap_remove_pages(vmspace_pmap(vmspace), 0, VM_MAXUSER_ADDRESS);
+ vm_map_remove(&vmspace->vm_map, 0, VM_MAXUSER_ADDRESS);
+ } else {
+ vmspace_exec(p);
+ vmspace = p->p_vmspace;
+ }
+
+ /* Allocate a new stack */
+ error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return (error);
+
+#ifdef __ia64__
+ {
+ /*
+ * Allocate backing store. We really need something
+ * similar to vm_map_stack which can allow the backing
+ * store to grow upwards. This will do for now.
+ */
+ vm_offset_t bsaddr;
+ bsaddr = USRSTACK - 2*maxssiz;
+ error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr,
+ regstkpages * PAGE_SIZE, 0,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr;
+ }
+#endif
+
+ /*
+ * vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
+ * VM_STACK case, but they are still used to monitor the size of the
+ * process stack so we can check the stack rlimit.
+ */
+ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
+ vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
+
+ return(0);
+}
+
+/*
+ * Copy out argument and environment strings from the old process
+ * address space into the temporary string buffer.
+ */
+int
+exec_extract_strings(imgp)
+ struct image_params *imgp;
+{
+ char **argv, **envv;
+ char *argp, *envp;
+ int error;
+ size_t length;
+
+ /*
+ * extract arguments first
+ */
+
+ argv = imgp->uap->argv;
+
+ if (argv) {
+ argp = (caddr_t) (intptr_t) fuword(argv);
+ if (argp == (caddr_t) -1)
+ return (EFAULT);
+ if (argp)
+ argv++;
+ if (imgp->argv0)
+ argp = imgp->argv0;
+ if (argp) {
+ do {
+ if (argp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(argp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->argc++;
+ } while ((argp = (caddr_t) (intptr_t) fuword(argv++)));
+ }
+ }
+
+ imgp->endargs = imgp->stringp;
+
+ /*
+ * extract environment strings
+ */
+
+ envv = imgp->uap->envv;
+
+ if (envv) {
+ while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
+ if (envp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(envp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->envc++;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Copy strings out to the new process address space, constructing
+ * new arg and env vector tables. Return a pointer to the base
+ * so that it can be used as the initial stack pointer.
+ */
+register_t *
+exec_copyout_strings(imgp)
+ struct image_params *imgp;
+{
+ int argc, envc;
+ char **vectp;
+ char *stringp, *destp;
+ register_t *stack_base;
+ struct ps_strings *arginfo;
+ int szsigcode;
+
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ arginfo = (struct ps_strings *)PS_STRINGS;
+ szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
+ destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
+ roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
+
+ /*
+ * install sigcode
+ */
+ if (szsigcode)
+ copyout(imgp->proc->p_sysent->sv_sigcode,
+ ((caddr_t)arginfo - szsigcode), szsigcode);
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs) {
+ /*
+ * 'AT_COUNT*2' is the size of the ELF Auxargs data. This is
+ * retained for compatibility.
+ */
+ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
+ : (AT_COUNT * 2);
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+ * the arg and env vector sets, and imgp->auxarg_size is room
+ * for the arguments of the runtime loader.
+ */
+ vectp = (char **) (destp - (imgp->argc + imgp->envc + 2 +
+ imgp->auxarg_size) * sizeof(char *));
+
+ } else
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+ * the arg and env vector sets
+ */
+ vectp = (char **)
+ (destp - (imgp->argc + imgp->envc + 2) * sizeof(char *));
+
+ /*
+ * vectp also becomes our initial stack base
+ */
+ stack_base = (register_t *)vectp;
+
+ stringp = imgp->stringbase;
+ argc = imgp->argc;
+ envc = imgp->envc;
+
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
+ suword(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword(vectp++, 0);
+
+ suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
+ suword(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword(vectp, 0);
+
+ return (stack_base);
+}
+
+/*
+ * Check permissions of file to execute.
+ * Called with imgp->vp locked.
+ * Return 0 for success or error code on failure.
+ */
+int
+exec_check_permissions(imgp)
+ struct image_params *imgp;
+{
+ struct vnode *vp = imgp->vp;
+ struct vattr *attr = imgp->attr;
+ struct thread *td;
+ int error;
+
+ td = curthread; /* XXXKSE */
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, attr, td->td_ucred, td);
+ if (error)
+ return (error);
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that this
+ * file resides on.
+ * 2) Ensure that at least one execute bit is on - otherwise root
+ * will always succeed, and we don't want that to happen unless the
+ * file really is executable.
+ * 3) Ensure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ ((attr->va_mode & 0111) == 0) ||
+ (attr->va_type != VREG))
+ return (EACCES);
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr->va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ */
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ if (error)
+ return (error);
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ if (vp->v_writecount)
+ return (ETXTBSY);
+
+ /*
+ * Call filesystem specific open routine (which does nothing in the
+ * general case).
+ */
+ error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
+ return (error);
+}
+
+/*
+ * Exec handler registration
+ */
+int
+exec_register(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 2; /* New slot and trailing NULL */
+
+ if (execsw)
+ for (es = execsw; *es; es++)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return ENOMEM;
+ xs = newexecsw;
+ if (execsw)
+ for (es = execsw; *es; es++)
+ *xs++ = *es;
+ *xs++ = execsw_arg;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return 0;
+}
+
+int
+exec_unregister(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 1;
+
+ if (execsw == NULL)
+ panic("unregister with no handlers left?\n");
+
+ for (es = execsw; *es; es++) {
+ if (*es == execsw_arg)
+ break;
+ }
+ if (*es == NULL)
+ return ENOENT;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return ENOMEM;
+ xs = newexecsw;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ *xs++ = *es;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return 0;
+}
+
+int
+at_exec(function)
+ execlist_fn function;
+{
+ struct execlist *ep;
+
+#ifdef INVARIANTS
+ /* Be noisy if the programmer has lost track of things */
+ if (rm_at_exec(function))
+ printf("WARNING: exec callout entry (%p) already present\n",
+ function);
+#endif
+ ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->function = function;
+ TAILQ_INSERT_TAIL(&exec_list, ep, next);
+ return (0);
+}
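+
+/*
+ * Usage sketch (foo_exec_hook is a hypothetical execlist_fn callback):
+ * a subsystem registers its hook when it initializes and removes it
+ * when it unloads.
+ *
+ *	at_exec(foo_exec_hook);
+ *	...
+ *	rm_at_exec(foo_exec_hook);
+ */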
+
+/*
+ * Scan the exec callout list for the given item and remove it.
+ * Returns the number of items removed (0 or 1)
+ */
+int
+rm_at_exec(function)
+ execlist_fn function;
+{
+ struct execlist *ep;
+
+ TAILQ_FOREACH(ep, &exec_list, next) {
+ if (ep->function == function) {
+ TAILQ_REMOVE(&exec_list, ep, next);
+ free(ep, M_ATEXEC);
+ return(1);
+ }
+ }
+ return (0);
+}
+
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
new file mode 100644
index 0000000..fab9437
--- /dev/null
+++ b/sys/kern/kern_exit.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/tty.h>
+#include <sys/wait.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sx.h>
+#include <sys/ptrace.h>
+#include <sys/acct.h> /* for acct_process() function prototype */
+#include <sys/filedesc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <sys/jail.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <sys/user.h>
+
+/* Required to be non-static for SysVR4 emulator */
+MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status");
+
+static MALLOC_DEFINE(M_ATEXIT, "atexit", "atexit callback");
+
+static int wait1(struct thread *, struct wait_args *, int);
+
+/*
+ * callout list for things to do at exit time
+ */
+struct exitlist {
+ exitlist_fn function;
+ TAILQ_ENTRY(exitlist) next;
+};
+
+TAILQ_HEAD(exit_list_head, exitlist);
+static struct exit_list_head exit_list = TAILQ_HEAD_INITIALIZER(exit_list);
+
+/*
+ * exit --
+ * Death of process.
+ *
+ * MPSAFE
+ */
+void
+sys_exit(td, uap)
+ struct thread *td;
+ struct sys_exit_args /* {
+ int rval;
+ } */ *uap;
+{
+
+ mtx_lock(&Giant);
+ exit1(td, W_EXITCODE(uap->rval, 0));
+ /* NOTREACHED */
+}
+
+/*
+ * Exit: deallocate address space and other resources, change proc state
+ * to zombie, and unlink proc from allproc and parent's lists. Save exit
+ * status and rusage for wait(). Check for child processes and orphan them.
+ */
+void
+exit1(td, rv)
+ register struct thread *td;
+ int rv;
+{
+ struct exitlist *ep;
+ struct proc *p, *nq, *q;
+ struct tty *tp;
+ struct vnode *ttyvp;
+ register struct vmspace *vm;
+ struct vnode *vtmp;
+#ifdef KTRACE
+ struct vnode *tracevp;
+#endif
+
+ GIANT_REQUIRED;
+
+ p = td->td_proc;
+ if (p == initproc) {
+ printf("init died (signal %d, exit %d)\n",
+ WTERMSIG(rv), WEXITSTATUS(rv));
+ panic("Going nowhere without my init!");
+ }
+
+ /*
+ * XXXXKSE: MUST abort all other threads before proceeding past here.
+ */
+
+ /* Are we a task leader? */
+ PROC_LOCK(p);
+ if (p == p->p_leader) {
+ q = p->p_peers;
+ while (q != NULL) {
+ PROC_LOCK(q);
+ psignal(q, SIGKILL);
+ PROC_UNLOCK(q);
+ q = q->p_peers;
+ }
+ while (p->p_peers)
+ msleep((caddr_t)p, &p->p_mtx, PWAIT, "exit1", 0);
+ }
+ PROC_UNLOCK(p);
+
+#ifdef PGINPROF
+ vmsizmon();
+#endif
+ STOPEVENT(p, S_EXIT, rv);
+ wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */
+
+ /*
+ * Check if any loadable modules need anything done at process exit.
+ * e.g. SYSV IPC stuff
+ * XXX what if one of these generates an error?
+ */
+ TAILQ_FOREACH(ep, &exit_list, next)
+ (*ep->function)(p);
+
+ stopprofclock(p);
+
+ MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
+ M_ZOMBIE, M_WAITOK);
+ /*
+ * If parent is waiting for us to exit or exec,
+ * P_PPWAIT is set; we will wakeup the parent below.
+ */
+ PROC_LOCK(p);
+ p->p_flag &= ~(P_TRACED | P_PPWAIT);
+ p->p_flag |= P_WEXIT;
+ SIGEMPTYSET(p->p_siglist);
+ PROC_UNLOCK(p);
+ if (timevalisset(&p->p_realtimer.it_value))
+ callout_stop(&p->p_itcallout);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pid.
+ */
+ funsetownlst(&p->p_sigiolst);
+
+ /*
+ * Close open files and release open-file table.
+ * This may block!
+ */
+ fdfree(td); /* XXXKSE *//* may not be the one in proc */
+
+ /*
+ * Remove ourself from our leader's peer list and wake our leader.
+ */
+ PROC_LOCK(p->p_leader);
+ if (p->p_leader->p_peers) {
+ q = p->p_leader;
+ while (q->p_peers != p)
+ q = q->p_peers;
+ q->p_peers = p->p_peers;
+ wakeup((caddr_t)p->p_leader);
+ }
+ PROC_UNLOCK(p->p_leader);
+
+ /* The next two chunks should probably be moved to vmspace_exit. */
+ vm = p->p_vmspace;
+ /*
+ * Release user portion of address space.
+ * This releases references to vnodes,
+ * which could cause I/O if the file has been unlinked.
+ * Need to do this early enough that we can still sleep.
+ * Can't free the entire vmspace as the kernel stack
+ * may be mapped within that space also.
+ */
+ if (--vm->vm_refcnt == 0) {
+ if (vm->vm_shm)
+ shmexit(p);
+ pmap_remove_pages(vmspace_pmap(vm), VM_MIN_ADDRESS,
+ VM_MAXUSER_ADDRESS);
+ (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
+ VM_MAXUSER_ADDRESS);
+ vm->vm_freer = p;
+ }
+
+ sx_xlock(&proctree_lock);
+ if (SESS_LEADER(p)) {
+ register struct session *sp;
+
+ sp = p->p_session;
+ if (sp->s_ttyvp) {
+ /*
+ * Controlling process.
+ * Signal foreground pgrp,
+ * drain controlling terminal
+ * and revoke access to controlling terminal.
+ */
+ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
+ tp = sp->s_ttyp;
+ if (sp->s_ttyp->t_pgrp) {
+ PGRP_LOCK(sp->s_ttyp->t_pgrp);
+ pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
+ PGRP_UNLOCK(sp->s_ttyp->t_pgrp);
+ }
+ /* XXX tp should be locked. */
+ sx_xunlock(&proctree_lock);
+ (void) ttywait(tp);
+ sx_xlock(&proctree_lock);
+ /*
+ * The tty could have been revoked
+ * if we blocked.
+ */
+ if (sp->s_ttyvp) {
+ ttyvp = sp->s_ttyvp;
+ SESS_LOCK(p->p_session);
+ sp->s_ttyvp = NULL;
+ SESS_UNLOCK(p->p_session);
+ sx_xunlock(&proctree_lock);
+ VOP_REVOKE(ttyvp, REVOKEALL);
+ vrele(ttyvp);
+ sx_xlock(&proctree_lock);
+ }
+ }
+ if (sp->s_ttyvp) {
+ ttyvp = sp->s_ttyvp;
+ SESS_LOCK(p->p_session);
+ sp->s_ttyvp = NULL;
+ SESS_UNLOCK(p->p_session);
+ vrele(ttyvp);
+ }
+ /*
+ * s_ttyp is not zero'd; we use this to indicate
+ * that the session once had a controlling terminal.
+ * (for logging and informational purposes)
+ */
+ }
+ SESS_LOCK(p->p_session);
+ sp->s_leader = NULL;
+ SESS_UNLOCK(p->p_session);
+ }
+ fixjobc(p, p->p_pgrp, 0);
+ sx_xunlock(&proctree_lock);
+ (void)acct_process(td);
+#ifdef KTRACE
+ /*
+ * release trace file
+ */
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ p->p_traceflag = 0; /* don't trace the vrele() */
+ tracevp = p->p_tracep;
+ p->p_tracep = NULL;
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (tracevp != NULL)
+ vrele(tracevp);
+#endif
+ /*
+ * Release reference to text vnode
+ */
+ if ((vtmp = p->p_textvp) != NULL) {
+ p->p_textvp = NULL;
+ vrele(vtmp);
+ }
+
+ /*
+ * Release our limits structure.
+ */
+ mtx_assert(&Giant, MA_OWNED);
+ if (--p->p_limit->p_refcnt == 0) {
+ FREE(p->p_limit, M_SUBPROC);
+ p->p_limit = NULL;
+ }
+
+ /*
+ * Release this thread's reference to the ucred. The actual proc
+ * reference will stay around until the proc is harvested by
+ * wait(). At this point the ucred is immutable (no other threads
+ * from this proc are around that can change it) so we leave the
+ * per-thread ucred pointer intact in case it is needed although
+ * in theory nothing should be using it at this point.
+ */
+ crfree(td->td_ucred);
+
+ /*
+ * Remove proc from allproc queue and pidhash chain.
+ * Place onto zombproc. Unlink from parent's child list.
+ */
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list);
+ LIST_INSERT_HEAD(&zombproc, p, p_list);
+ LIST_REMOVE(p, p_hash);
+ sx_xunlock(&allproc_lock);
+
+ sx_xlock(&proctree_lock);
+ q = LIST_FIRST(&p->p_children);
+ if (q != NULL) /* only need this if any child is S_ZOMB */
+ wakeup((caddr_t) initproc);
+ for (; q != NULL; q = nq) {
+ nq = LIST_NEXT(q, p_sibling);
+ PROC_LOCK(q);
+ proc_reparent(q, initproc);
+ q->p_sigparent = SIGCHLD;
+ /*
+ * Traced processes are killed
+ * since their existence means someone is screwing up.
+ */
+ if (q->p_flag & P_TRACED) {
+ q->p_flag &= ~P_TRACED;
+ psignal(q, SIGKILL);
+ }
+ PROC_UNLOCK(q);
+ }
+
+ /*
+ * Save exit status and final rusage info, adding in child rusage
+ * info and self times.
+ */
+ PROC_LOCK(p);
+ p->p_xstat = rv;
+ *p->p_ru = p->p_stats->p_ru;
+ mtx_lock_spin(&sched_lock);
+ calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
+ mtx_unlock_spin(&sched_lock);
+ ruadd(p->p_ru, &p->p_stats->p_cru);
+
+ /*
+ * Notify interested parties of our demise.
+ */
+ KNOTE(&p->p_klist, NOTE_EXIT);
+
+ /*
+ * Notify parent that we're gone. If parent has the PS_NOCLDWAIT
+ * flag set, or if the handler is set to SIG_IGN, notify process
+ * 1 instead (and hope it will handle this situation).
+ */
+ PROC_LOCK(p->p_pptr);
+ if (p->p_pptr->p_procsig->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
+ struct proc *pp;
+
+ pp = p->p_pptr;
+ PROC_UNLOCK(pp);
+ proc_reparent(p, initproc);
+ PROC_LOCK(p->p_pptr);
+ /*
+ * If this was the last child of our parent, notify
+ * parent, so in case he was wait(2)ing, he will
+ * continue.
+ */
+ if (LIST_EMPTY(&pp->p_children))
+ wakeup((caddr_t)pp);
+ }
+
+ if (p->p_sigparent && p->p_pptr != initproc)
+ psignal(p->p_pptr, p->p_sigparent);
+ else
+ psignal(p->p_pptr, SIGCHLD);
+ PROC_UNLOCK(p->p_pptr);
+
+ /*
+ * If this is a kthread, then wakeup anyone waiting for it to exit.
+ */
+ if (p->p_flag & P_KTHREAD)
+ wakeup((caddr_t)p);
+ PROC_UNLOCK(p);
+
+ /*
+ * Finally, call machine-dependent code to release the remaining
+ * resources including address space, the kernel stack and pcb.
+ * The address space is released by "vmspace_exitfree(p)" in
+ * vm_waitproc().
+ */
+ cpu_exit(td);
+
+ PROC_LOCK(p);
+ PROC_LOCK(p->p_pptr);
+ sx_xunlock(&proctree_lock);
+ mtx_lock_spin(&sched_lock);
+ while (mtx_owned(&Giant))
+ mtx_unlock(&Giant);
+
+ /*
+ * We have to wait until after releasing all locks before
+ * changing p_stat. If we block on a mutex then we will be
+ * back at SRUN when we resume and our parent will never
+ * harvest us.
+ */
+ p->p_stat = SZOMB;
+
+ wakeup(p->p_pptr);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_UNLOCK(p);
+
+ cnt.v_swtch++;
+ binuptime(PCPU_PTR(switchtime));
+ PCPU_SET(switchticks, ticks);
+
+ cpu_sched_exit(td);
+ cpu_throw();
+ panic("exit1");
+}
+
+#ifdef COMPAT_43
+/*
+ * MPSAFE. The dirty work is handled by wait1().
+ */
+int
+owait(td, uap)
+ struct thread *td;
+ register struct owait_args /* {
+ int dummy;
+ } */ *uap;
+{
+ struct wait_args w;
+
+ w.options = 0;
+ w.rusage = NULL;
+ w.pid = WAIT_ANY;
+ w.status = NULL;
+ return (wait1(td, &w, 1));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * MPSAFE. The dirty work is handled by wait1().
+ */
+int
+wait4(td, uap)
+ struct thread *td;
+ struct wait_args *uap;
+{
+
+ return (wait1(td, uap, 0));
+}
+
+/*
+ * MPSAFE
+ */
+static int
+wait1(td, uap, compat)
+ register struct thread *td;
+ register struct wait_args /* {
+ int pid;
+ int *status;
+ int options;
+ struct rusage *rusage;
+ } */ *uap;
+ int compat;
+{
+ struct rusage ru;
+ register int nfound;
+ register struct proc *p, *q, *t;
+ int status, error;
+
+ q = td->td_proc;
+ if (uap->pid == 0) {
+ PROC_LOCK(q);
+ uap->pid = -q->p_pgid;
+ PROC_UNLOCK(q);
+ }
+ if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE))
+ return (EINVAL);
+ mtx_lock(&Giant);
+loop:
+ nfound = 0;
+ sx_xlock(&proctree_lock);
+ LIST_FOREACH(p, &q->p_children, p_sibling) {
+ PROC_LOCK(p);
+ if (uap->pid != WAIT_ANY &&
+ p->p_pid != uap->pid && p->p_pgid != -uap->pid) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ /*
+ * This special case handles a kthread spawned by linux_clone
+ * (see linux_misc.c). The linux_wait4 and linux_waitpid
+ * functions need to be able to distinguish between waiting
+ * on a process and waiting on a thread. It is a thread if
+ * p_sigparent is not SIGCHLD, and the WLINUXCLONE option
+ * signifies we want to wait for threads and not processes.
+ */
+ if ((p->p_sigparent != SIGCHLD) ^
+ ((uap->options & WLINUXCLONE) != 0)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ nfound++;
+ if (p->p_stat == SZOMB) {
+ /*
+			 * Charge the child's scheduling CPU usage to the parent.
+			 * XXXKSE assume only one thread & kse & ksegrp
+			 * keep estcpu in each ksegrp
+			 * so charge it to the ksegrp that did the wait
+			 * since process estcpu is the sum of all ksegrps,
+			 * this is strictly as expected.
+			 * Assume that the child process aggregated all
+			 * the estcpu into the 'built-in' ksegrp.
+ * XXXKSE
+ */
+ if (curthread->td_proc->p_pid != 1) {
+ mtx_lock_spin(&sched_lock);
+ curthread->td_ksegrp->kg_estcpu =
+ ESTCPULIM(curthread->td_ksegrp->kg_estcpu +
+ p->p_ksegrp.kg_estcpu);
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ td->td_retval[0] = p->p_pid;
+#ifdef COMPAT_43
+ if (compat)
+ td->td_retval[1] = p->p_xstat;
+ else
+#endif
+ if (uap->status) {
+ status = p->p_xstat; /* convert to int */
+ PROC_UNLOCK(p);
+ if ((error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status)))) {
+ sx_xunlock(&proctree_lock);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ PROC_LOCK(p);
+ }
+ if (uap->rusage) {
+ bcopy(p->p_ru, &ru, sizeof(ru));
+ PROC_UNLOCK(p);
+ if ((error = copyout((caddr_t)&ru,
+ (caddr_t)uap->rusage,
+ sizeof (struct rusage)))) {
+ sx_xunlock(&proctree_lock);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ } else
+ PROC_UNLOCK(p);
+ /*
+ * If we got the child via a ptrace 'attach',
+ * we need to give it back to the old parent.
+ */
+ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
+ PROC_LOCK(p);
+ p->p_oppid = 0;
+ proc_reparent(p, t);
+ PROC_UNLOCK(p);
+ psignal(t, SIGCHLD);
+ wakeup((caddr_t)t);
+ PROC_UNLOCK(t);
+ sx_xunlock(&proctree_lock);
+ mtx_unlock(&Giant);
+ return (0);
+ }
+ /*
+ * Remove other references to this process to ensure
+ * we have an exclusive reference.
+ */
+ leavepgrp(p);
+
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list); /* off zombproc */
+ sx_xunlock(&allproc_lock);
+
+ LIST_REMOVE(p, p_sibling);
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * As a side effect of this lock, we know that
+ * all other writes to this proc are visible now, so
+ * no more locking is needed for p.
+ */
+ PROC_LOCK(p);
+ p->p_xstat = 0; /* XXX: why? */
+ PROC_UNLOCK(p);
+ PROC_LOCK(q);
+ ruadd(&q->p_stats->p_cru, p->p_ru);
+ PROC_UNLOCK(q);
+ FREE(p->p_ru, M_ZOMBIE);
+ p->p_ru = NULL;
+
+ /*
+ * Decrement the count of procs running with this uid.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
+
+ /*
+ * Free up credentials.
+ */
+ crfree(p->p_ucred);
+ p->p_ucred = NULL; /* XXX: why? */
+
+ /*
+ * Remove unused arguments
+ */
+ pargs_drop(p->p_args);
+ p->p_args = NULL;
+
+ if (--p->p_procsig->ps_refcnt == 0) {
+ if (p->p_sigacts != &p->p_uarea->u_sigacts)
+ FREE(p->p_sigacts, M_SUBPROC);
+ FREE(p->p_procsig, M_SUBPROC);
+ p->p_procsig = NULL;
+ }
+
+ /*
+ * Give vm and machine-dependent layer a chance
+ * to free anything that cpu_exit couldn't
+ * release while still running in process context.
+ */
+ vm_waitproc(p);
+ mtx_destroy(&p->p_mtx);
+ uma_zfree(proc_zone, p);
+ sx_xlock(&allproc_lock);
+ nprocs--;
+ sx_xunlock(&allproc_lock);
+ mtx_unlock(&Giant);
+ return (0);
+ }
+ if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 &&
+ (p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
+ p->p_flag |= P_WAITED;
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+#ifdef COMPAT_43
+ if (compat) {
+ td->td_retval[1] = W_STOPCODE(p->p_xstat);
+ PROC_UNLOCK(p);
+ error = 0;
+ } else
+#endif
+ if (uap->status) {
+ status = W_STOPCODE(p->p_xstat);
+ PROC_UNLOCK(p);
+ error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status));
+ } else {
+ PROC_UNLOCK(p);
+ error = 0;
+ }
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+ p->p_flag &= ~P_CONTINUED;
+ PROC_UNLOCK(p);
+
+ if (uap->status) {
+ status = SIGCONT;
+ error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status));
+ } else
+ error = 0;
+
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ PROC_UNLOCK(p);
+ }
+ if (nfound == 0) {
+ sx_xunlock(&proctree_lock);
+ mtx_unlock(&Giant);
+ return (ECHILD);
+ }
+ if (uap->options & WNOHANG) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = 0;
+ mtx_unlock(&Giant);
+ return (0);
+ }
+ PROC_LOCK(q);
+ sx_xunlock(&proctree_lock);
+ error = msleep((caddr_t)q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
+ PROC_UNLOCK(q);
+ if (error) {
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ goto loop;
+}
+
+/*
+ * Make process 'parent' the new parent of process 'child'.
+ * Must be called with an exclusive hold of proctree lock.
+ */
+void
+proc_reparent(child, parent)
+ register struct proc *child;
+ register struct proc *parent;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(child, MA_OWNED);
+ if (child->p_pptr == parent)
+ return;
+
+ LIST_REMOVE(child, p_sibling);
+ LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
+ child->p_pptr = parent;
+}
+
+/*
+ * The next two functions are to handle adding/deleting items on the
+ * exit callout list
+ *
+ * at_exit():
+ *	Take the arguments given and put them onto the exit callout list.
+ *	However, first make sure that it's not already there.
+ *	Returns 0 on success.
+ */
+
+int
+at_exit(function)
+ exitlist_fn function;
+{
+ struct exitlist *ep;
+
+#ifdef INVARIANTS
+ /* Be noisy if the programmer has lost track of things */
+ if (rm_at_exit(function))
+ printf("WARNING: exit callout entry (%p) already present\n",
+ function);
+#endif
+ ep = malloc(sizeof(*ep), M_ATEXIT, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->function = function;
+ TAILQ_INSERT_TAIL(&exit_list, ep, next);
+ return (0);
+}
+
+/*
+ * Scan the exit callout list for the given item and remove it.
+ * Returns the number of items removed (0 or 1)
+ */
+int
+rm_at_exit(function)
+ exitlist_fn function;
+{
+ struct exitlist *ep;
+
+ TAILQ_FOREACH(ep, &exit_list, next) {
+ if (ep->function == function) {
+ TAILQ_REMOVE(&exit_list, ep, next);
+ free(ep, M_ATEXIT);
+ return (1);
+ }
+ }
+ return (0);
+}
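For reference, here is a small userland sketch, not part of the patch, of the interface that wait1() above services: wait4(2) with an optional struct rusage, plus the standard status macros that unpack the p_xstat value saved in exit1().

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct rusage ru;
	int status;
	pid_t pid;

	if ((pid = fork()) == 0)
		_exit(7);			/* child exits immediately */
	if (pid == -1) {
		perror("fork");
		exit(1);
	}
	/* Passing WNOHANG here would return 0 at once while the child runs. */
	if (wait4(pid, &status, 0, &ru) == -1) {
		perror("wait4");
		exit(1);
	}
	if (WIFEXITED(status))
		printf("pid %d exited with status %d\n",
		    (int)pid, WEXITSTATUS(status));
	else if (WIFSIGNALED(status))
		printf("pid %d killed by signal %d\n",
		    (int)pid, WTERMSIG(status));
	printf("child user time: %ld.%06ld s\n",
	    (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
	return (0);
}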
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
new file mode 100644
index 0000000..016653b
--- /dev/null
+++ b/sys/kern/kern_fork.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/syscall.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/kthread.h>
+#include <sys/unistd.h>
+#include <sys/jail.h>
+#include <sys/sx.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+#include <sys/vmmeter.h>
+#include <sys/user.h>
+#include <machine/critical.h>
+
+static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
+
+/*
+ * These are the structures used to create a callout list for things to do
+ * when forking a process.
+ */
+struct forklist {
+ forklist_fn function;
+ TAILQ_ENTRY(forklist) next;
+};
+
+static struct sx fork_list_lock;
+
+TAILQ_HEAD(forklist_head, forklist);
+static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);
+
+#ifndef _SYS_SYSPROTO_H_
+struct fork_args {
+ int dummy;
+};
+#endif
+
+int forksleep; /* Place for fork1() to sleep on. */
+
+static void
+init_fork_list(void *data __unused)
+{
+
+ sx_init(&fork_list_lock, "fork list");
+}
+SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL);
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+fork(td, uap)
+ struct thread *td;
+ struct fork_args *uap;
+{
+ int error;
+ struct proc *p2;
+
+ mtx_lock(&Giant);
+ error = fork1(td, RFFDG | RFPROC, &p2);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ mtx_unlock(&Giant);
+ return error;
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+vfork(td, uap)
+ struct thread *td;
+ struct vfork_args *uap;
+{
+ int error;
+ struct proc *p2;
+
+ mtx_lock(&Giant);
+ error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ mtx_unlock(&Giant);
+ return error;
+}
+
+/*
+ * MPSAFE
+ */
+int
+rfork(td, uap)
+ struct thread *td;
+ struct rfork_args *uap;
+{
+ int error;
+ struct proc *p2;
+
+ /* Don't allow kernel only flags. */
+ if ((uap->flags & RFKERNELONLY) != 0)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = fork1(td, uap->flags, &p2);
+ if (error == 0) {
+ td->td_retval[0] = p2 ? p2->p_pid : 0;
+ td->td_retval[1] = 0;
+ }
+ mtx_unlock(&Giant);
+ return error;
+}
+
+
+int nprocs = 1; /* process 0 */
+int lastpid = 0;
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
+ "Last used PID");
+
+/*
+ * Random component to lastpid generation. We mix in a random factor to make
+ * it a little harder to predict. We sanity check the modulus value to avoid
+ * doing it in critical paths. Don't let it be too small or we pointlessly
+ * waste randomness entropy, and don't let it be impossibly large. Using a
+ * modulus that is too big causes a LOT more process table scans and slows
+ * down fork processing as the pidchecked caching is defeated.
+ */
+static int randompid = 0;
+
+static int
+sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
+{
+ int error, pid;
+
+ sx_xlock(&allproc_lock);
+ pid = randompid;
+ error = sysctl_handle_int(oidp, &pid, 0, req);
+ if (error == 0 && req->newptr != NULL) {
+ if (pid < 0 || pid > PID_MAX - 100) /* out of range */
+ pid = PID_MAX - 100;
+ else if (pid < 2) /* NOP */
+ pid = 0;
+ else if (pid < 100) /* Make it reasonable */
+ pid = 100;
+ randompid = pid;
+ }
+ sx_xunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
+
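The handler above clamps the requested modulus: values below 2 disable randomization, 2 through 99 are raised to 100, and anything above PID_MAX - 100 is capped. The knob can therefore be tuned at runtime; a userland sketch, not part of the patch, using the standard sysctlbyname(3) interface:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int newmod = 1000, oldmod;
	size_t len = sizeof(oldmod);

	/* Read the current modulus and request a new one (setting needs root). */
	if (sysctlbyname("kern.randompid", &oldmod, &len, &newmod,
	    sizeof(newmod)) == -1)
		err(1, "sysctlbyname(kern.randompid)");
	printf("kern.randompid was %d, requested %d\n", oldmod, newmod);
	return (0);
}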
+#if 0
+void
+kse_init(struct kse *kse1, struct kse *kse2)
+{
+}
+
+void
+thread_init(struct thread *thread1, struct thread *thread2)
+{
+}
+
+void
+ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2)
+{
+}
+#endif
+
+int
+fork1(td, flags, procp)
+ struct thread *td; /* parent proc */
+ int flags;
+ struct proc **procp; /* child proc */
+{
+ struct proc *p2, *pptr;
+ uid_t uid;
+ struct proc *newproc;
+ int trypid;
+ int ok;
+ static int pidchecked = 0;
+ struct forklist *ep;
+ struct filedesc *fd;
+ struct proc *p1 = td->td_proc;
+ struct thread *td2;
+ struct kse *ke2;
+ struct ksegrp *kg2;
+ struct sigacts *newsigacts;
+ struct procsig *newprocsig;
+
+ GIANT_REQUIRED;
+
+ /* Can't copy and clear */
+ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ return (EINVAL);
+
+ /*
+ * Here we don't create a new process, but we divorce
+ * certain parts of a process from itself.
+ */
+ if ((flags & RFPROC) == 0) {
+ vm_forkproc(td, NULL, NULL, flags);
+
+ /*
+ * Close all file descriptors.
+ */
+ if (flags & RFCFDG) {
+ struct filedesc *fdtmp;
+ fdtmp = fdinit(td); /* XXXKSE */
+ PROC_LOCK(p1);
+ fdfree(td); /* XXXKSE */
+ p1->p_fd = fdtmp;
+ PROC_UNLOCK(p1);
+ }
+
+ /*
+ * Unshare file descriptors (from parent.)
+ */
+ if (flags & RFFDG) {
+ FILEDESC_LOCK(p1->p_fd);
+ if (p1->p_fd->fd_refcnt > 1) {
+ struct filedesc *newfd;
+
+ newfd = fdcopy(td);
+ FILEDESC_UNLOCK(p1->p_fd);
+ PROC_LOCK(p1);
+ fdfree(td);
+ p1->p_fd = newfd;
+ PROC_UNLOCK(p1);
+ } else
+ FILEDESC_UNLOCK(p1->p_fd);
+ }
+ *procp = NULL;
+ return (0);
+ }
+
+ /* Allocate new proc. */
+ newproc = uma_zalloc(proc_zone, M_WAITOK);
+
+ /*
+ * Although process entries are dynamically created, we still keep
+ * a global limit on the maximum number we will create. Don't allow
+ * a nonprivileged user to use the last process; don't let root
+ * exceed the limit. The variable nprocs is the current number of
+ * processes, maxproc is the limit.
+ */
+ sx_xlock(&allproc_lock);
+ uid = td->td_ucred->cr_ruid;
+ if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
+ sx_xunlock(&allproc_lock);
+ uma_zfree(proc_zone, newproc);
+ tsleep(&forksleep, PUSER, "fork", hz / 2);
+ return (EAGAIN);
+ }
+ /*
+ * Increment the count of procs running with this uid. Don't allow
+ * a nonprivileged user to exceed their current limit.
+ */
+ PROC_LOCK(p1);
+ ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
+ (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
+ PROC_UNLOCK(p1);
+ if (!ok) {
+ sx_xunlock(&allproc_lock);
+ uma_zfree(proc_zone, newproc);
+ tsleep(&forksleep, PUSER, "fork", hz / 2);
+ return (EAGAIN);
+ }
+
+ /*
+ * Increment the nprocs resource before blocking can occur. There
+ * are hard-limits as to the number of processes that can run.
+ */
+ nprocs++;
+
+ /*
+ * Find an unused process ID. We remember a range of unused IDs
+ * ready to use (from lastpid+1 through pidchecked-1).
+ *
+ * If RFHIGHPID is set (used during system boot), do not allocate
+ * low-numbered pids.
+ */
+ trypid = lastpid + 1;
+ if (flags & RFHIGHPID) {
+ if (trypid < 10) {
+ trypid = 10;
+ }
+ } else {
+ if (randompid)
+ trypid += arc4random() % randompid;
+ }
+retry:
+ /*
+ * If the process ID prototype has wrapped around,
+ * restart somewhat above 0, as the low-numbered procs
+ * tend to include daemons that don't exit.
+ */
+ if (trypid >= PID_MAX) {
+ trypid = trypid % PID_MAX;
+ if (trypid < 100)
+ trypid += 100;
+ pidchecked = 0;
+ }
+ if (trypid >= pidchecked) {
+ int doingzomb = 0;
+
+ pidchecked = PID_MAX;
+ /*
+ * Scan the active and zombie procs to check whether this pid
+ * is in use. Remember the lowest pid that's greater
+ * than trypid, so we can avoid checking for a while.
+ */
+ p2 = LIST_FIRST(&allproc);
+again:
+ for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
+ PROC_LOCK(p2);
+ while (p2->p_pid == trypid ||
+ p2->p_pgrp->pg_id == trypid ||
+ p2->p_session->s_sid == trypid) {
+ trypid++;
+ if (trypid >= pidchecked) {
+ PROC_UNLOCK(p2);
+ goto retry;
+ }
+ }
+ if (p2->p_pid > trypid && pidchecked > p2->p_pid)
+ pidchecked = p2->p_pid;
+ if (p2->p_pgrp->pg_id > trypid &&
+ pidchecked > p2->p_pgrp->pg_id)
+ pidchecked = p2->p_pgrp->pg_id;
+ if (p2->p_session->s_sid > trypid &&
+ pidchecked > p2->p_session->s_sid)
+ pidchecked = p2->p_session->s_sid;
+ PROC_UNLOCK(p2);
+ }
+ if (!doingzomb) {
+ doingzomb = 1;
+ p2 = LIST_FIRST(&zombproc);
+ goto again;
+ }
+ }
+
+ /*
+ * RFHIGHPID does not mess with the lastpid counter during boot.
+ */
+ if (flags & RFHIGHPID)
+ pidchecked = 0;
+ else
+ lastpid = trypid;
+
+ p2 = newproc;
+ p2->p_stat = SIDL; /* protect against others */
+ p2->p_pid = trypid;
+ LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+ sx_xunlock(&allproc_lock);
+
+ /*
+ * Malloc things while we don't hold any locks.
+ */
+ if (flags & RFSIGSHARE) {
+ MALLOC(newsigacts, struct sigacts *,
+ sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
+ newprocsig = NULL;
+ } else {
+ newsigacts = NULL;
+ MALLOC(newprocsig, struct procsig *, sizeof(struct procsig),
+ M_SUBPROC, M_WAITOK);
+ }
+
+ /*
+ * Copy filedesc.
+ * XXX: This is busted. fd*() need to not take proc
+ * arguments or something.
+ */
+ if (flags & RFCFDG)
+ fd = fdinit(td);
+ else if (flags & RFFDG) {
+ FILEDESC_LOCK(p1->p_fd);
+ fd = fdcopy(td);
+ FILEDESC_UNLOCK(p1->p_fd);
+ } else
+ fd = fdshare(p1);
+
+ /*
+ * Make a proc table entry for the new process.
+ * Start by zeroing the section of proc that is zero-initialized,
+ * then copy the section that is copied directly from the parent.
+ */
+ td2 = thread_get(p2);
+ ke2 = &p2->p_kse;
+ kg2 = &p2->p_ksegrp;
+
+#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
+
+ bzero(&p2->p_startzero,
+ (unsigned) RANGEOF(struct proc, p_startzero, p_endzero));
+ bzero(&ke2->ke_startzero,
+ (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero));
+ bzero(&td2->td_startzero,
+ (unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
+ bzero(&kg2->kg_startzero,
+ (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero));
+
+ mtx_init(&p2->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ bcopy(&p1->p_startcopy, &p2->p_startcopy,
+ (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy));
+ bcopy(&td->td_kse->ke_startcopy, &ke2->ke_startcopy,
+ (unsigned) RANGEOF(struct kse, ke_startcopy, ke_endcopy));
+ bcopy(&td->td_startcopy, &td2->td_startcopy,
+ (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
+ bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy,
+ (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
+#undef RANGEOF
+
+ /*
+ * XXXKSE Theoretically only the running thread would get copied
+ * Others in the kernel would be 'aborted' in the child.
+	 * i.e. return E*something*
+ */
+ proc_linkup(p2, kg2, ke2, td2);
+
+ /* note.. XXXKSE no pcb or u-area yet */
+
+ /*
+ * Duplicate sub-structures as needed.
+ * Increase reference counts on shared objects.
+ * The p_stats and p_sigacts substructs are set in vm_forkproc.
+ */
+ p2->p_flag = 0;
+ mtx_lock_spin(&sched_lock);
+ p2->p_sflag = PS_INMEM;
+ if (p1->p_sflag & PS_PROFIL)
+ startprofclock(p2);
+ mtx_unlock_spin(&sched_lock);
+ p2->p_ucred = crhold(td->td_ucred);
+ td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */
+
+ /*
+ * Setup linkage for kernel based threading
+ */
+	if ((flags & RFTHREAD) != 0) {
+ /*
+ * XXX: This assumes a leader is a parent or grandparent of
+ * all processes in a task.
+ */
+ if (p1->p_leader != p1)
+ PROC_LOCK(p1->p_leader);
+ p2->p_peers = p1->p_peers;
+ p1->p_peers = p2;
+ p2->p_leader = p1->p_leader;
+ if (p1->p_leader != p1)
+ PROC_UNLOCK(p1->p_leader);
+ } else {
+ p2->p_peers = NULL;
+ p2->p_leader = p2;
+ }
+
+ pargs_hold(p2->p_args);
+
+ if (flags & RFSIGSHARE) {
+ p2->p_procsig = p1->p_procsig;
+ p2->p_procsig->ps_refcnt++;
+ if (p1->p_sigacts == &p1->p_uarea->u_sigacts) {
+ /*
+ * Set p_sigacts to the new shared structure.
+ * Note that this is updating p1->p_sigacts at the
+ * same time, since p_sigacts is just a pointer to
+ * the shared p_procsig->ps_sigacts.
+ */
+ p2->p_sigacts = newsigacts;
+ newsigacts = NULL;
+ *p2->p_sigacts = p1->p_uarea->u_sigacts;
+ }
+ } else {
+ p2->p_procsig = newprocsig;
+ newprocsig = NULL;
+ bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
+ p2->p_procsig->ps_refcnt = 1;
+ p2->p_sigacts = NULL; /* finished in vm_forkproc() */
+ }
+ if (flags & RFLINUXTHPN)
+ p2->p_sigparent = SIGUSR1;
+ else
+ p2->p_sigparent = SIGCHLD;
+
+ /* Bump references to the text vnode (for procfs) */
+ p2->p_textvp = p1->p_textvp;
+ if (p2->p_textvp)
+ VREF(p2->p_textvp);
+ p2->p_fd = fd;
+ PROC_UNLOCK(p1);
+ PROC_UNLOCK(p2);
+
+ /*
+ * If p_limit is still copy-on-write, bump refcnt,
+ * otherwise get a copy that won't be modified.
+ * (If PL_SHAREMOD is clear, the structure is shared
+ * copy-on-write.)
+ */
+ if (p1->p_limit->p_lflags & PL_SHAREMOD)
+ p2->p_limit = limcopy(p1->p_limit);
+ else {
+ p2->p_limit = p1->p_limit;
+ p2->p_limit->p_refcnt++;
+ }
+
+ sx_xlock(&proctree_lock);
+ PGRP_LOCK(p1->p_pgrp);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ /*
+ * Preserve some more flags in subprocess. PS_PROFIL has already
+ * been preserved.
+ */
+ p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
+ SESS_LOCK(p1->p_session);
+ if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
+ p2->p_flag |= P_CONTROLT;
+ SESS_UNLOCK(p1->p_session);
+ if (flags & RFPPWAIT)
+ p2->p_flag |= P_PPWAIT;
+
+ LIST_INSERT_AFTER(p1, p2, p_pglist);
+ PGRP_UNLOCK(p1->p_pgrp);
+ LIST_INIT(&p2->p_children);
+ LIST_INIT(&td2->td_contested); /* XXXKSE only 1 thread? */
+
+ callout_init(&p2->p_itcallout, 0);
+ callout_init(&td2->td_slpcallout, 1); /* XXXKSE */
+
+#ifdef KTRACE
+ /*
+ * Copy traceflag and tracefile if enabled.
+ */
+ mtx_lock(&ktrace_mtx);
+ KASSERT(p2->p_tracep == NULL, ("new process has a ktrace vnode"));
+ if (p1->p_traceflag & KTRFAC_INHERIT) {
+ p2->p_traceflag = p1->p_traceflag;
+ if ((p2->p_tracep = p1->p_tracep) != NULL)
+ VREF(p2->p_tracep);
+ }
+ mtx_unlock(&ktrace_mtx);
+#endif
+
+ /*
+ * set priority of child to be that of parent
+ * XXXKSE hey! copying the estcpu seems dodgy.. should split it..
+ */
+ mtx_lock_spin(&sched_lock);
+ p2->p_ksegrp.kg_estcpu = p1->p_ksegrp.kg_estcpu;
+ mtx_unlock_spin(&sched_lock);
+
+ /*
+ * This begins the section where we must prevent the parent
+ * from being swapped.
+ */
+ _PHOLD(p1);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Attach the new process to its parent.
+ *
+ * If RFNOWAIT is set, the newly created process becomes a child
+ * of init. This effectively disassociates the child from the
+ * parent.
+ */
+ if (flags & RFNOWAIT)
+ pptr = initproc;
+ else
+ pptr = p1;
+ p2->p_pptr = pptr;
+ LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+ PROC_UNLOCK(p2);
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * XXXKSE: In KSE, there would be a race here if one thread was
+	 * dying due to a signal (or calling exit1() for that matter) while
+ * another thread was calling fork1(). Not sure how KSE wants to work
+ * around that. The problem is that up until the point above, if p1
+ * gets killed, it won't find p2 in its list in order for it to be
+ * reparented. Alternatively, we could add a new p_flag that gets set
+ * before we reparent all the children that we check above and just
+	 * use init as our parent if that flag is set. (Either that
+ * or abort the fork if the flag is set since our parent died trying
+ * to fork us (which is evil)).
+ */
+
+ KASSERT(newprocsig == NULL, ("unused newprocsig"));
+ if (newsigacts != NULL)
+ FREE(newsigacts, M_SUBPROC);
+ /*
+ * Finish creating the child process. It will return via a different
+ * execution path later. (ie: directly into user mode)
+ */
+ vm_forkproc(td, p2, td2, flags);
+
+ if (flags == (RFFDG | RFPROC)) {
+ cnt.v_forks++;
+ cnt.v_forkpages += p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize;
+ } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
+ cnt.v_vforks++;
+ cnt.v_vforkpages += p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize;
+ } else if (p1 == &proc0) {
+ cnt.v_kthreads++;
+ cnt.v_kthreadpages += p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize;
+ } else {
+ cnt.v_rforks++;
+ cnt.v_rforkpages += p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize;
+ }
+
+ /*
+ * Both processes are set up, now check if any loadable modules want
+ * to adjust anything.
+ * What if they have an error? XXX
+ */
+ sx_slock(&fork_list_lock);
+ TAILQ_FOREACH(ep, &fork_list, next) {
+ (*ep->function)(p1, p2, flags);
+ }
+ sx_sunlock(&fork_list_lock);
+
+ /*
+ * If RFSTOPPED not requested, make child runnable and add to
+ * run queue.
+ */
+ microtime(&(p2->p_stats->p_start));
+ p2->p_acflag = AFORK;
+ if ((flags & RFSTOPPED) == 0) {
+ mtx_lock_spin(&sched_lock);
+ p2->p_stat = SRUN;
+ setrunqueue(td2);
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ /*
+ * Now can be swapped.
+ */
+ PROC_LOCK(p1);
+ _PRELE(p1);
+
+ /*
+ * tell any interested parties about the new process
+ */
+ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Preserve synchronization semantics of vfork. If waiting for
+ * child to exec or exit, set P_PPWAIT on child, and sleep on our
+ * proc (in case of exit).
+ */
+ PROC_LOCK(p2);
+ while (p2->p_flag & P_PPWAIT)
+ msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
+ PROC_UNLOCK(p2);
+
+ /*
+ * Return child proc pointer to parent.
+ */
+ *procp = p2;
+ return (0);
+}
+
+/*
+ * The next two functions are general routines to handle adding/deleting
+ * items on the fork callout list.
+ *
+ * at_fork():
+ *	Take the arguments given and put them onto the fork callout list.
+ *	However, first make sure that it's not already there.
+ * Returns 0 on success or a standard error number.
+ */
+
+int
+at_fork(function)
+ forklist_fn function;
+{
+ struct forklist *ep;
+
+#ifdef INVARIANTS
+ /* let the programmer know if he's been stupid */
+ if (rm_at_fork(function))
+ printf("WARNING: fork callout entry (%p) already present\n",
+ function);
+#endif
+ ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->function = function;
+ sx_xlock(&fork_list_lock);
+ TAILQ_INSERT_TAIL(&fork_list, ep, next);
+ sx_xunlock(&fork_list_lock);
+ return (0);
+}
+
+/*
+ * Scan the fork callout list for the given item and remove it.
+ * Returns the number of items removed (0 or 1)
+ */
+
+int
+rm_at_fork(function)
+ forklist_fn function;
+{
+ struct forklist *ep;
+
+ sx_xlock(&fork_list_lock);
+ TAILQ_FOREACH(ep, &fork_list, next) {
+ if (ep->function == function) {
+ TAILQ_REMOVE(&fork_list, ep, next);
+ sx_xunlock(&fork_list_lock);
+ free(ep, M_ATFORK);
+ return(1);
+ }
+ }
+ sx_xunlock(&fork_list_lock);
+ return (0);
+}
+
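A hypothetical module using the fork callout list above might look like the sketch below; it is not part of the patch. The callback signature matches the invocation in fork1(), namely (*ep->function)(p1, p2, flags), but the module name and event handler are made up, the at_fork()/rm_at_fork() prototypes and the forklist_fn typedef are assumed to come from the usual kernel headers, and the DECLARE_MODULE ordering constants are only illustrative.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>

static void
my_fork_hook(struct proc *p1, struct proc *p2, int flags)
{
	printf("pid %d forked pid %d (flags %#x)\n",
	    p1->p_pid, p2->p_pid, flags);
}

static int
my_fork_mod_event(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (at_fork(my_fork_hook));	/* 0 or ENOMEM */
	case MOD_UNLOAD:
		(void)rm_at_fork(my_fork_hook);
		return (0);
	default:
		return (0);			/* ignore other events here */
	}
}

static moduledata_t my_fork_mod = { "my_fork_hook", my_fork_mod_event, NULL };
DECLARE_MODULE(my_fork_hook, my_fork_mod, SI_SUB_EXEC, SI_ORDER_ANY);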
+/*
+ * Handle the return of a child process from fork1(). This function
+ * is called from the MD fork_trampoline() entry point.
+ */
+void
+fork_exit(callout, arg, frame)
+ void (*callout)(void *, struct trapframe *);
+ void *arg;
+ struct trapframe *frame;
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+
+ td->td_kse->ke_oncpu = PCPU_GET(cpuid);
+ /*
+ * Finish setting up thread glue. We need to initialize
+ * the thread into a td_critnest=1 state. Some platforms
+ * may have already partially or fully initialized td_critnest
+	 * and/or td_md.md_savecrit (when applicable).
+ *
+ * see <arch>/<arch>/critical.c
+ */
+ sched_lock.mtx_lock = (uintptr_t)td;
+ sched_lock.mtx_recurse = 0;
+ cpu_critical_fork_exit();
+ CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid,
+ p->p_comm);
+ if (PCPU_GET(switchtime.sec) == 0)
+ binuptime(PCPU_PTR(switchtime));
+ PCPU_SET(switchticks, ticks);
+ mtx_unlock_spin(&sched_lock);
+
+ /*
+ * cpu_set_fork_handler intercepts this function call to
+	 * have it call a non-returning function to stay in kernel mode.
+ * initproc has its own fork handler, but it does return.
+ */
+ KASSERT(callout != NULL, ("NULL callout in fork_exit"));
+ callout(arg, frame);
+
+ /*
+ * Check if a kernel thread misbehaved and returned from its main
+ * function.
+ */
+ PROC_LOCK(p);
+ if (p->p_flag & P_KTHREAD) {
+ PROC_UNLOCK(p);
+ mtx_lock(&Giant);
+ printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
+ p->p_comm, p->p_pid);
+ kthread_exit(0);
+ }
+ PROC_UNLOCK(p);
+ mtx_assert(&Giant, MA_NOTOWNED);
+}
+
+/*
+ * Simplified back end of syscall(), used when returning from fork()
+ * directly into user mode. Giant is not held on entry, and must not
+ * be held on return. This function is passed in to fork_exit() as the
+ * first parameter and is called when returning to a new userland process.
+ */
+void
+fork_return(td, frame)
+ struct thread *td;
+ struct trapframe *frame;
+{
+
+ userret(td, frame, 0);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSRET))
+ ktrsysret(SYS_fork, 0, 0);
+#endif
+ mtx_assert(&Giant, MA_NOTOWNED);
+}
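fork() and vfork() above are thin wrappers that pass fixed flag sets to fork1(), while rfork(2) exposes the flags directly to userland (with the kernel-only RFKERNELONLY bits rejected up front). A userland sketch, not part of the patch, with headers as in the rfork(2) manual page:

#include <sys/types.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	/* RFPROC | RFFDG is the fork() equivalent: new process, copied fd table. */
	pid = rfork(RFPROC | RFFDG);
	if (pid == -1)
		err(1, "rfork");
	if (pid == 0) {
		/* RFCFDG instead of RFFDG would have given the child a clean table. */
		printf("child %d running\n", (int)getpid());
		_exit(0);
	}
	printf("parent created pid %d\n", (int)pid);
	return (0);
}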
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
new file mode 100644
index 0000000..29194b7
--- /dev/null
+++ b/sys/kern/kern_idle.c
@@ -0,0 +1,110 @@
+/*-
+ * Copyright (c) 2000, All rights reserved. See /usr/src/COPYRIGHT
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/smp.h>
+#include <sys/unistd.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+static void idle_setup(void *dummy);
+SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL)
+
+static void idle_proc(void *dummy);
+
+/*
+ * Set up per-CPU idle process contexts.  The APs shouldn't be running or
+ * accessing their idle processes at this point, so don't bother with
+ * locking.
+ */
+static void
+idle_setup(void *dummy)
+{
+#ifdef SMP
+ struct pcpu *pc;
+#endif
+ struct proc *p;
+ int error;
+
+#ifdef SMP
+ SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+ error = kthread_create(idle_proc, NULL, &p,
+ RFSTOPPED | RFHIGHPID, "idle: cpu%d", pc->pc_cpuid);
+ pc->pc_idlethread = FIRST_THREAD_IN_PROC(p);
+ if (pc->pc_curthread == NULL) {
+ pc->pc_curthread = pc->pc_idlethread;
+ pc->pc_idlethread->td_critnest = 0;
+ }
+#else
+ error = kthread_create(idle_proc, NULL, &p,
+ RFSTOPPED | RFHIGHPID, "idle");
+ PCPU_SET(idlethread, FIRST_THREAD_IN_PROC(p));
+#endif
+ if (error)
+ panic("idle_setup: kthread_create error %d\n", error);
+
+ p->p_flag |= P_NOLOAD;
+ p->p_stat = SRUN;
+#ifdef SMP
+ }
+#endif
+}
+
+/*
+ * idle process context
+ */
+static void
+idle_proc(void *dummy)
+{
+#ifdef DIAGNOSTIC
+ int count;
+#endif
+
+ for (;;) {
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+#ifdef DIAGNOSTIC
+ count = 0;
+
+ while (count >= 0 && procrunnable() == 0) {
+#else
+ while (procrunnable() == 0) {
+#endif
+ /*
+ * This is a good place to put things to be done in
+ * the background, including sanity checks.
+ */
+
+#ifdef DIAGNOSTIC
+ if (count++ < 0)
+ CTR0(KTR_PROC, "idle_proc: timed out waiting"
+ " for a process");
+#endif
+
+#ifdef __i386__
+ cpu_idle();
+#endif
+ }
+
+ mtx_lock_spin(&sched_lock);
+ curproc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ }
+}
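kern_idle.c above and kern_intr.c below both create their service threads through kthread_create(), passing RFSTOPPED so the caller can place the thread on the run queue itself. An ordinary kernel daemon can use the default flags; a minimal sketch follows, with a made-up daemon name, wait channel and SYSINIT ordering (the SI_SUB_KTHREAD_IDLE constant is only illustrative), not part of the patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>

static struct proc *my_daemon_proc;

static void
my_daemon(void *arg)
{
	/*
	 * A kthread must not return from its main function; it should
	 * call kthread_exit(0) when shutting down.
	 */
	for (;;) {
		/* Do periodic work, then sleep for roughly a second. */
		tsleep(&my_daemon_proc, PWAIT, "mydmn", hz);
	}
}

static void
my_daemon_start(void *dummy __unused)
{
	int error;

	error = kthread_create(my_daemon, NULL, &my_daemon_proc,
	    0, "mydaemon");
	if (error)
		panic("my_daemon_start: kthread_create failed (%d)", error);
}
SYSINIT(mydaemon, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, my_daemon_start, NULL)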
diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c
new file mode 100644
index 0000000..d65dc82
--- /dev/null
+++ b/sys/kern/kern_intr.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/rtprio.h>
+#include <sys/systm.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/random.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+#include <sys/unistd.h>
+#include <sys/vmmeter.h>
+#include <machine/atomic.h>
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/stdarg.h>
+
+#include <net/netisr.h> /* prototype for legacy_setsoftnet */
+
+struct int_entropy {
+ struct proc *proc;
+ int vector;
+};
+
+void *net_ih;
+void *vm_ih;
+void *softclock_ih;
+struct ithd *clk_ithd;
+struct ithd *tty_ithd;
+
+static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
+
+static void ithread_update(struct ithd *);
+static void ithread_loop(void *);
+static void start_softintr(void *);
+static void swi_net(void *);
+
+u_char
+ithread_priority(enum intr_type flags)
+{
+ u_char pri;
+
+ flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
+ INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
+ switch (flags) {
+ case INTR_TYPE_TTY:
+ pri = PI_TTYLOW;
+ break;
+ case INTR_TYPE_BIO:
+ /*
+ * XXX We need to refine this. BSD/OS distinguishes
+ * between tape and disk priorities.
+ */
+ pri = PI_DISK;
+ break;
+ case INTR_TYPE_NET:
+ pri = PI_NET;
+ break;
+ case INTR_TYPE_CAM:
+ pri = PI_DISK; /* XXX or PI_CAM? */
+ break;
+ case INTR_TYPE_AV: /* Audio/video */
+ pri = PI_AV;
+ break;
+ case INTR_TYPE_CLK:
+ pri = PI_REALTIME;
+ break;
+ case INTR_TYPE_MISC:
+ pri = PI_DULL; /* don't care */
+ break;
+ default:
+ /* We didn't specify an interrupt level. */
+ panic("ithread_priority: no interrupt type in flags");
+ }
+
+ return pri;
+}
+
+/*
+ * Regenerate the name (p_comm) and priority of an interrupt thread.
+ */
+static void
+ithread_update(struct ithd *ithd)
+{
+ struct intrhand *ih;
+ struct thread *td;
+ struct proc *p;
+ int entropy;
+
+ mtx_assert(&ithd->it_lock, MA_OWNED);
+ td = ithd->it_td;
+ if (td == NULL)
+ return;
+ p = td->td_proc;
+
+ strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name));
+ ih = TAILQ_FIRST(&ithd->it_handlers);
+ if (ih == NULL) {
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = PRI_MAX_ITHD;
+ td->td_base_pri = PRI_MAX_ITHD;
+ mtx_unlock_spin(&sched_lock);
+ ithd->it_flags &= ~IT_ENTROPY;
+ return;
+ }
+ entropy = 0;
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = ih->ih_pri;
+ td->td_base_pri = ih->ih_pri;
+ mtx_unlock_spin(&sched_lock);
+ TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) {
+ if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 <
+ sizeof(p->p_comm)) {
+ strcat(p->p_comm, " ");
+ strcat(p->p_comm, ih->ih_name);
+ } else if (strlen(p->p_comm) + 1 == sizeof(p->p_comm)) {
+ if (p->p_comm[sizeof(p->p_comm) - 2] == '+')
+ p->p_comm[sizeof(p->p_comm) - 2] = '*';
+ else
+ p->p_comm[sizeof(p->p_comm) - 2] = '+';
+ } else
+ strcat(p->p_comm, "+");
+ if (ih->ih_flags & IH_ENTROPY)
+ entropy++;
+ }
+ if (entropy)
+ ithd->it_flags |= IT_ENTROPY;
+ else
+ ithd->it_flags &= ~IT_ENTROPY;
+ CTR2(KTR_INTR, "%s: updated %s\n", __func__, p->p_comm);
+}
+
+int
+ithread_create(struct ithd **ithread, int vector, int flags,
+ void (*disable)(int), void (*enable)(int), const char *fmt, ...)
+{
+ struct ithd *ithd;
+ struct thread *td;
+ struct proc *p;
+ int error;
+ va_list ap;
+
+ /* The only valid flag during creation is IT_SOFT. */
+ if ((flags & ~IT_SOFT) != 0)
+ return (EINVAL);
+
+ ithd = malloc(sizeof(struct ithd), M_ITHREAD, M_WAITOK | M_ZERO);
+ ithd->it_vector = vector;
+ ithd->it_disable = disable;
+ ithd->it_enable = enable;
+ ithd->it_flags = flags;
+ TAILQ_INIT(&ithd->it_handlers);
+ mtx_init(&ithd->it_lock, "ithread", NULL, MTX_DEF);
+
+ va_start(ap, fmt);
+ vsnprintf(ithd->it_name, sizeof(ithd->it_name), fmt, ap);
+ va_end(ap);
+
+ error = kthread_create(ithread_loop, ithd, &p, RFSTOPPED | RFHIGHPID,
+ "%s", ithd->it_name);
+ if (error) {
+ mtx_destroy(&ithd->it_lock);
+ free(ithd, M_ITHREAD);
+ return (error);
+ }
+ td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
+ td->td_ksegrp->kg_pri_class = PRI_ITHD;
+ td->td_priority = PRI_MAX_ITHD;
+ p->p_stat = SWAIT;
+ ithd->it_td = td;
+ td->td_ithd = ithd;
+ if (ithread != NULL)
+ *ithread = ithd;
+
+ CTR2(KTR_INTR, "%s: created %s", __func__, ithd->it_name);
+ return (0);
+}
+
+int
+ithread_destroy(struct ithd *ithread)
+{
+
+ struct thread *td;
+ struct proc *p;
+ if (ithread == NULL)
+ return (EINVAL);
+
+ td = ithread->it_td;
+ p = td->td_proc;
+ mtx_lock(&ithread->it_lock);
+ if (!TAILQ_EMPTY(&ithread->it_handlers)) {
+ mtx_unlock(&ithread->it_lock);
+ return (EINVAL);
+ }
+ ithread->it_flags |= IT_DEAD;
+ mtx_lock_spin(&sched_lock);
+ if (p->p_stat == SWAIT) {
+ p->p_stat = SRUN; /* XXXKSE */
+ setrunqueue(td);
+ }
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&ithread->it_lock);
+ CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_name);
+ return (0);
+}
+
+int
+ithread_add_handler(struct ithd* ithread, const char *name,
+ driver_intr_t handler, void *arg, u_char pri, enum intr_type flags,
+ void **cookiep)
+{
+ struct intrhand *ih, *temp_ih;
+
+ if (ithread == NULL || name == NULL || handler == NULL)
+ return (EINVAL);
+	if ((flags & INTR_FAST) != 0)
+ flags |= INTR_EXCL;
+
+ ih = malloc(sizeof(struct intrhand), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ ih->ih_name = name;
+ ih->ih_ithread = ithread;
+ ih->ih_pri = pri;
+ if (flags & INTR_FAST)
+ ih->ih_flags = IH_FAST | IH_EXCLUSIVE;
+ else if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ mtx_lock(&ithread->it_lock);
+	if ((flags & INTR_EXCL) != 0 && !TAILQ_EMPTY(&ithread->it_handlers))
+ goto fail;
+ if (!TAILQ_EMPTY(&ithread->it_handlers) &&
+ (TAILQ_FIRST(&ithread->it_handlers)->ih_flags & IH_EXCLUSIVE) != 0)
+ goto fail;
+
+ TAILQ_FOREACH(temp_ih, &ithread->it_handlers, ih_next)
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ithread->it_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ ithread_update(ithread);
+ mtx_unlock(&ithread->it_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ithread->it_name);
+ return (0);
+
+fail:
+ mtx_unlock(&ithread->it_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+}
+
+int
+ithread_remove_handler(void *cookie)
+{
+ struct intrhand *handler = (struct intrhand *)cookie;
+ struct ithd *ithread;
+#ifdef INVARIANTS
+ struct intrhand *ih;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ithread = handler->ih_ithread;
+ KASSERT(ithread != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt thread",
+ handler->ih_name));
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ithread->it_name);
+ mtx_lock(&ithread->it_lock);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ithread->it_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ithread->it_lock);
+ panic("interrupt handler \"%s\" not found in interrupt thread \"%s\"",
+ ih->ih_name, ithread->it_name);
+ok:
+#endif
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ */
+ mtx_lock_spin(&sched_lock);
+ if (ithread->it_td->td_proc->p_stat != SWAIT) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ ithread->it_need = 1;
+ } else
+ TAILQ_REMOVE(&ithread->it_handlers, handler, ih_next);
+ mtx_unlock_spin(&sched_lock);
+ if ((handler->ih_flags & IH_DEAD) != 0)
+ msleep(handler, &ithread->it_lock, PUSER, "itrmh", 0);
+ ithread_update(ithread);
+ mtx_unlock(&ithread->it_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
+
+int
+ithread_schedule(struct ithd *ithread, int do_switch)
+{
+ struct int_entropy entropy;
+ struct thread *td;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if ((ithread == NULL) || TAILQ_EMPTY(&ithread->it_handlers))
+ return (EINVAL);
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ithread->it_flags & IT_ENTROPY) {
+ entropy.vector = ithread->it_vector;
+		entropy.proc = curthread->td_proc;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ td = ithread->it_td;
+ p = td->td_proc;
+ KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name));
+ CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm,
+ ithread->it_need);
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, grab sched_lock and see if we actually need to
+ * put this thread on the runqueue. If so and the do_switch flag is
+ * true and it is safe to switch, then switch to the ithread
+ * immediately. Otherwise, set the needresched flag to guarantee
+ * that this ithread will run before any userland processes.
+ */
+ ithread->it_need = 1;
+ mtx_lock_spin(&sched_lock);
+ if (p->p_stat == SWAIT) {
+ CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid);
+ p->p_stat = SRUN;
+ setrunqueue(td); /* XXXKSE */
+ if (do_switch && curthread->td_critnest == 1 &&
+ curthread->td_proc->p_stat == SRUN) {
+ if (curthread != PCPU_GET(idlethread))
+ setrunqueue(curthread);
+ curthread->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ } else {
+ curthread->td_kse->ke_flags |= KEF_NEEDRESCHED;
+ }
+ } else {
+ CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d",
+ __func__, p->p_pid, ithread->it_need, p->p_stat);
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ return (0);
+}
+
+int
+swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler,
+ void *arg, int pri, enum intr_type flags, void **cookiep)
+{
+ struct ithd *ithd;
+ int error;
+
+ if (flags & (INTR_FAST | INTR_ENTROPY))
+ return (EINVAL);
+
+ ithd = (ithdp != NULL) ? *ithdp : NULL;
+
+ if (ithd != NULL) {
+ if ((ithd->it_flags & IT_SOFT) == 0)
+ return(EINVAL);
+ } else {
+ error = ithread_create(&ithd, pri, IT_SOFT, NULL, NULL,
+ "swi%d:", pri);
+ if (error)
+ return (error);
+
+ if (ithdp != NULL)
+ *ithdp = ithd;
+ }
+ return (ithread_add_handler(ithd, name, handler, arg,
+ (pri * RQ_PPQ) + PI_SOFT, flags, cookiep));
+}
+
+
+/*
+ * Schedule a heavyweight software interrupt process.
+ */
+void
+swi_sched(void *cookie, int flags)
+{
+ struct intrhand *ih = (struct intrhand *)cookie;
+ struct ithd *it = ih->ih_ithread;
+ int error;
+
+ atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */
+
+ CTR3(KTR_INTR, "swi_sched pid %d(%s) need=%d",
+ it->it_td->td_proc->p_pid, it->it_td->td_proc->p_comm, it->it_need);
+
+ /*
+ * Set ih_need for this handler so that if the ithread is already
+ * running it will execute this handler on the next pass. Otherwise,
+ * it will execute it the next time it runs.
+ */
+ atomic_store_rel_int(&ih->ih_need, 1);
+ if (!(flags & SWI_DELAY)) {
+ error = ithread_schedule(it, !cold);
+ KASSERT(error == 0, ("stray software interrupt"));
+ }
+}
+
+/*
+ * This is the main code for interrupt threads.
+ */
+void
+ithread_loop(void *arg)
+{
+ struct ithd *ithd; /* our thread context */
+ struct intrhand *ih; /* and our interrupt handler chain */
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread;
+ p = td->td_proc;
+ ithd = (struct ithd *)arg; /* point to myself */
+ KASSERT(ithd->it_td == td && td->td_ithd == ithd,
+ ("%s: ithread and proc linkage out of sync", __func__));
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d: (%s) exiting", __func__,
+ p->p_pid, p->p_comm);
+ td->td_ithd = NULL;
+ mtx_destroy(&ithd->it_lock);
+ mtx_lock(&Giant);
+ free(ithd, M_ITHREAD);
+ kthread_exit(0);
+ }
+
+ CTR4(KTR_INTR, "%s: pid %d: (%s) need=%d", __func__,
+ p->p_pid, p->p_comm, ithd->it_need);
+ while (ithd->it_need) {
+ /*
+ * Service interrupts. If another interrupt
+ * arrives while we are running, they will set
+ * it_need to denote that we should make
+ * another pass.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+restart:
+ TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) {
+ if (ithd->it_flags & IT_SOFT && !ih->ih_need)
+ continue;
+ atomic_store_rel_int(&ih->ih_need, 0);
+ CTR6(KTR_INTR,
+ "%s: pid %d ih=%p: %p(%p) flg=%x", __func__,
+ p->p_pid, (void *)ih,
+ (void *)ih->ih_handler, ih->ih_argument,
+ ih->ih_flags);
+
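+			/*
+			 * A handler flagged IH_DEAD is waiting to be torn
+			 * down; unlink it, wake up the waiter, and restart
+			 * the scan since the list changed underneath us.
+			 */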
+ if ((ih->ih_flags & IH_DEAD) != 0) {
+ mtx_lock(&ithd->it_lock);
+ TAILQ_REMOVE(&ithd->it_handlers, ih,
+ ih_next);
+ wakeup(ih);
+ mtx_unlock(&ithd->it_lock);
+ goto restart;
+ }
+ if ((ih->ih_flags & IH_MPSAFE) == 0)
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if ((ih->ih_flags & IH_MPSAFE) == 0)
+ mtx_unlock(&Giant);
+ }
+ }
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ mtx_assert(&Giant, MA_NOTOWNED);
+ mtx_lock_spin(&sched_lock);
+ if (!ithd->it_need) {
+ /*
+ * Should we call this earlier in the loop above?
+ */
+ if (ithd->it_enable != NULL)
+ ithd->it_enable(ithd->it_vector);
+ p->p_stat = SWAIT; /* we're idle */
+ p->p_stats->p_ru.ru_nvcsw++;
+ CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid);
+ mi_switch();
+ CTR2(KTR_INTR, "%s: pid %d: resumed", __func__, p->p_pid);
+ }
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+/*
+ * Start standard software interrupt threads
+ */
+static void
+start_softintr(void *dummy)
+{
+
+ if (swi_add(NULL, "net", swi_net, NULL, SWI_NET, 0, &net_ih) ||
+ swi_add(&clk_ithd, "clock", softclock, NULL, SWI_CLOCK,
+ INTR_MPSAFE, &softclock_ih) ||
+ swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, 0, &vm_ih))
+ panic("died while creating standard software ithreads");
+
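+	/* Keep the clock swi thread from counting toward the load average. */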
+ PROC_LOCK(clk_ithd->it_td->td_proc);
+ clk_ithd->it_td->td_proc->p_flag |= P_NOLOAD;
+ PROC_UNLOCK(clk_ithd->it_td->td_proc);
+}
+SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL)
+
+void
+legacy_setsoftnet(void)
+{
+ swi_sched(net_ih, 0);
+}
+
+/*
+ * XXX: This should really be in the network code somewhere and installed
+ * via a SI_SUB_SOFTINTR, SI_ORDER_MIDDLE sysinit.
+ */
+void (*netisrs[32])(void);
+volatile unsigned int netisr; /* scheduling bits for network */
+
+int
+register_netisr(num, handler)
+ int num;
+ netisr_t *handler;
+{
+
+ if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) {
+ printf("register_netisr: bad isr number: %d\n", num);
+ return (EINVAL);
+ }
+ netisrs[num] = handler;
+ return (0);
+}
+
+int
+unregister_netisr(num)
+ int num;
+{
+
+ if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) {
+ printf("unregister_netisr: bad isr number: %d\n", num);
+ return (EINVAL);
+ }
+ netisrs[num] = NULL;
+ return (0);
+}
+
+#ifdef DEVICE_POLLING
+ void netisr_pollmore(void);
+#endif
+
+static void
+swi_net(void *dummy)
+{
+ u_int bits;
+ int i;
+
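+	/*
+	 * With DEVICE_POLLING the handler loops, re-reading the netisr bits
+	 * until they drain and giving the polling code another pass via
+	 * netisr_pollmore() whenever the poll bit was set.
+	 */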
+#ifdef DEVICE_POLLING
+ for (;;) {
+ int pollmore;
+#endif
+ bits = atomic_readandclear_int(&netisr);
+#ifdef DEVICE_POLLING
+ if (bits == 0)
+ return;
+ pollmore = bits & (1 << NETISR_POLL);
+#endif
+ while ((i = ffs(bits)) != 0) {
+ i--;
+ if (netisrs[i] != NULL)
+ netisrs[i]();
+ else
+ printf("swi_net: unregistered isr number: %d.\n", i);
+ bits &= ~(1 << i);
+ }
+#ifdef DEVICE_POLLING
+ if (pollmore)
+ netisr_pollmore();
+ }
+#endif
+}
+
+/*
+ * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
+ * The data for this is machine dependent, and the declarations are in machine
+ * dependent code. The layout of intrnames and intrcnt however is machine
+ * independent.
+ *
+ * We do not know the length of intrcnt and intrnames at compile time, so
+ * calculate things at run time.
+ */
+static int
+sysctl_intrnames(SYSCTL_HANDLER_ARGS)
+{
+ return (sysctl_handle_opaque(oidp, intrnames, eintrnames - intrnames,
+ req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrnames, "", "Interrupt Names");
+
+static int
+sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
+{
+ return (sysctl_handle_opaque(oidp, intrcnt,
+ (char *)eintrcnt - (char *)intrcnt, req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrcnt, "", "Interrupt Counts");
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
new file mode 100644
index 0000000..cf3b03c
--- /dev/null
+++ b/sys/kern/kern_jail.c
@@ -0,0 +1,256 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+
+MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
+
+SYSCTL_DECL(_security);
+SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
+ "Jail rules");
+
+mp_fixme("these variables need a lock")
+
+int jail_set_hostname_allowed = 1;
+SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
+ &jail_set_hostname_allowed, 0,
+ "Processes in jail can set their hostnames");
+
+int jail_socket_unixiproute_only = 1;
+SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
+ &jail_socket_unixiproute_only, 0,
+ "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
+
+int jail_sysvipc_allowed = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
+ &jail_sysvipc_allowed, 0,
+ "Processes in jail can use System V IPC primitives");
+
+/*
+ * MPSAFE
+ */
+int
+jail(td, uap)
+ struct thread *td;
+ struct jail_args /* {
+ syscallarg(struct jail *) jail;
+ } */ *uap;
+{
+ struct proc *p = td->td_proc;
+ int error;
+ struct prison *pr;
+ struct jail j;
+ struct chroot_args ca;
+ struct ucred *newcred = NULL, *oldcred;
+
+ error = copyin(uap->jail, &j, sizeof j);
+ if (error)
+ return (error);
+ if (j.version != 0)
+ return (EINVAL);
+
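+	/*
+	 * Allocate and initialize the prison before swapping credentials so
+	 * that the error paths below can simply free it again.
+	 */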
+ MALLOC(pr, struct prison *, sizeof *pr , M_PRISON, M_WAITOK | M_ZERO);
+ mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
+ pr->pr_securelevel = securelevel;
+ error = copyinstr(j.hostname, &pr->pr_host, sizeof pr->pr_host, 0);
+ if (error)
+ goto bail;
+ ca.path = j.path;
+ error = chroot(td, &ca);
+ if (error)
+ goto bail;
+ newcred = crget();
+ pr->pr_ip = j.ip_number;
+ PROC_LOCK(p);
+ /* Implicitly fail if already in jail. */
+ error = suser_cred(p->p_ucred, 0);
+ if (error)
+ goto badcred;
+ oldcred = p->p_ucred;
+ crcopy(newcred, oldcred);
+ p->p_ucred = newcred;
+ p->p_ucred->cr_prison = pr;
+ pr->pr_ref = 1;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+badcred:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+bail:
+ FREE(pr, M_PRISON);
+ return (error);
+}
+
+void
+prison_free(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref--;
+ if (pr->pr_ref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ mtx_destroy(&pr->pr_mtx);
+ if (pr->pr_linux != NULL)
+ FREE(pr->pr_linux, M_PRISON);
+ FREE(pr, M_PRISON);
+ return;
+ }
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_hold(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref++;
+ mtx_unlock(&pr->pr_mtx);
+}
+
+u_int32_t
+prison_getip(struct ucred *cred)
+{
+
+ return (cred->cr_prison->pr_ip);
+}
+
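+/*
+ * Rewrite *ip to the prison's address when it is INADDR_ANY or
+ * INADDR_LOOPBACK.  A non-zero flag means *ip is already in host byte
+ * order; otherwise it is converted from network order first.  Returns
+ * non-zero if the address does not match the prison's address.
+ */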
+int
+prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
+{
+ u_int32_t tmp;
+
+ if (!jailed(cred))
+ return (0);
+ if (flag)
+ tmp = *ip;
+ else
+ tmp = ntohl(*ip);
+ if (tmp == INADDR_ANY) {
+ if (flag)
+ *ip = cred->cr_prison->pr_ip;
+ else
+ *ip = htonl(cred->cr_prison->pr_ip);
+ return (0);
+ }
+ if (tmp == INADDR_LOOPBACK) {
+ if (flag)
+ *ip = cred->cr_prison->pr_ip;
+ else
+ *ip = htonl(cred->cr_prison->pr_ip);
+ return (0);
+ }
+ if (cred->cr_prison->pr_ip != tmp)
+ return (1);
+ return (0);
+}
+
+void
+prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
+{
+ u_int32_t tmp;
+
+ if (!jailed(cred))
+ return;
+ if (flag)
+ tmp = *ip;
+ else
+ tmp = ntohl(*ip);
+ if (tmp == INADDR_LOOPBACK) {
+ if (flag)
+ *ip = cred->cr_prison->pr_ip;
+ else
+ *ip = htonl(cred->cr_prison->pr_ip);
+ return;
+ }
+ return;
+}
+
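+/*
+ * Return non-zero when a jailed process may not use the given socket
+ * address: either a non-INET address while jail_socket_unixiproute_only
+ * is set, or an IPv4 address other than the prison's own.
+ */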
+int
+prison_if(struct ucred *cred, struct sockaddr *sa)
+{
+ struct sockaddr_in *sai = (struct sockaddr_in*) sa;
+ int ok;
+
+ if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
+ ok = 1;
+ else if (sai->sin_family != AF_INET)
+ ok = 0;
+ else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
+ ok = 1;
+ else
+ ok = 0;
+ return (ok);
+}
+
+/*
+ * Return 0 if jails permit the process holding cred1 to frob the process
+ * holding cred2, otherwise ESRCH.
+ */
+int
+prison_check(cred1, cred2)
+ struct ucred *cred1, *cred2;
+{
+
+ if (jailed(cred1)) {
+ if (!jailed(cred2))
+ return (ESRCH);
+ if (cred2->cr_prison != cred1->cr_prison)
+ return (ESRCH);
+ }
+
+ return (0);
+}
+
+/*
+ * Return 1 if the passed credential is in a jail, otherwise 0.
+ */
+int
+jailed(cred)
+ struct ucred *cred;
+{
+
+ return (cred->cr_prison != NULL);
+}
+
+/*
+ * Return the correct hostname for the passed credential.
+ */
+void
+getcredhostname(cred, buf, size)
+ struct ucred *cred;
+ char *buf;
+ size_t size;
+{
+
+ if (jailed(cred)) {
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ strncpy(buf, cred->cr_prison->pr_host, size);
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+ }
+ else
+ strncpy(buf, hostname, size);
+ buf[size - 1] = '\0';
+}
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
new file mode 100644
index 0000000..a456a86
--- /dev/null
+++ b/sys/kern/kern_kthread.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Start a kernel process. This is called after a fork() call in
+ * mi_startup() in the file kern/init_main.c.
+ *
+ * This function is used to start "internal" daemons and intended
+ * to be called from SYSINIT().
+ */
+void
+kproc_start(udata)
+ const void *udata;
+{
+ const struct kproc_desc *kp = udata;
+ int error;
+
+ error = kthread_create((void (*)(void *))kp->func, NULL,
+ kp->global_procpp, 0, "%s", kp->arg0);
+ if (error)
+ panic("kproc_start: %s: error %d", kp->arg0, error);
+}
+
+/*
+ * Create a kernel process/thread/whatever. It shares its address space
+ * with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newpp is the return value pointing to the thread's struct proc.
+ * flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.).
+ */
+int
+kthread_create(void (*func)(void *), void *arg,
+ struct proc **newpp, int flags, const char *fmt, ...)
+{
+ int error;
+ va_list ap;
+ struct proc *p2;
+
+ if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */)
+ panic("kthread_create called too soon");
+
+ error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags,
+ &p2);
+ if (error)
+ return error;
+
+ /* save a global descriptor, if desired */
+ if (newpp != NULL)
+ *newpp = p2;
+
+ /* this is a non-swapped system process */
+ PROC_LOCK(p2);
+ p2->p_flag |= P_SYSTEM | P_KTHREAD;
+ p2->p_procsig->ps_flag |= PS_NOCLDWAIT;
+ _PHOLD(p2);
+ PROC_UNLOCK(p2);
+
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap);
+ va_end(ap);
+
+	/* call the process's main()... */
+ cpu_set_fork_handler(FIRST_THREAD_IN_PROC(p2), func, arg);
+
+ /* Delay putting it on the run queue until now. */
+ mtx_lock_spin(&sched_lock);
+ p2->p_sflag |= PS_INMEM;
+ if (!(flags & RFSTOPPED)) {
+ p2->p_stat = SRUN;
+ setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ return 0;
+}
+
+void
+kthread_exit(int ecode)
+{
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread;
+ p = td->td_proc;
+ sx_xlock(&proctree_lock);
+ PROC_LOCK(p);
+ proc_reparent(p, initproc);
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ exit1(td, W_EXITCODE(ecode, 0));
+}
+
+/*
+ * Advise a kernel process to suspend (or resume) in its main loop.
+ * Participation is voluntary.
+ */
+int
+kthread_suspend(struct proc *p, int timo)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGADDSET(p->p_siglist, SIGSTOP);
+ wakeup(p);
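+	/* PDROP leaves the proc lock released when msleep() returns. */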
+ return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo);
+}
+
+int
+kthread_resume(struct proc *p)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGDELSET(p->p_siglist, SIGSTOP);
+ PROC_UNLOCK(p);
+ wakeup(&p->p_siglist);
+ return (0);
+}
+
+void
+kthread_suspend_check(struct proc *p)
+{
+ PROC_LOCK(p);
+ while (SIGISMEMBER(p->p_siglist, SIGSTOP)) {
+ wakeup(&p->p_siglist);
+ msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "ktsusp", 0);
+ }
+ PROC_UNLOCK(p);
+}
diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c
new file mode 100644
index 0000000..719d5e4
--- /dev/null
+++ b/sys/kern/kern_ktr.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2000
+ * John Baldwin <jhb@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This module holds the global variables used by KTR and the ktr_tracepoint()
+ * function that does the actual tracing.
+ */
+
+#include "opt_ddb.h"
+#include "opt_ktr.h"
+
+#include <sys/param.h>
+#include <sys/cons.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/libkern.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+
+#include <machine/cpu.h>
+#ifdef __sparc64__
+#include <machine/ktr.h>
+#endif
+
+#include <ddb/ddb.h>
+
+#ifndef KTR_ENTRIES
+#define KTR_ENTRIES 1024
+#endif
+
+#ifndef KTR_MASK
+#define KTR_MASK (KTR_GEN)
+#endif
+
+#ifndef KTR_CPUMASK
+#define KTR_CPUMASK (~0)
+#endif
+
+#ifndef KTR_TIME
+#define KTR_TIME get_cyclecount()
+#endif
+
+#ifndef KTR_CPU
+#define KTR_CPU PCPU_GET(cpuid)
+#endif
+
+SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
+
+int ktr_cpumask = KTR_CPUMASK;
+TUNABLE_INT("debug.ktr.cpumask", &ktr_cpumask);
+SYSCTL_INT(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RW, &ktr_cpumask, 0, "");
+
+int ktr_mask = KTR_MASK;
+TUNABLE_INT("debug.ktr.mask", &ktr_mask);
+SYSCTL_INT(_debug_ktr, OID_AUTO, mask, CTLFLAG_RW, &ktr_mask, 0, "");
+
+int ktr_entries = KTR_ENTRIES;
+SYSCTL_INT(_debug_ktr, OID_AUTO, entries, CTLFLAG_RD, &ktr_entries, 0, "");
+
+int ktr_version = KTR_VERSION;
+SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, "");
+
+volatile int ktr_idx = 0;
+struct ktr_entry ktr_buf[KTR_ENTRIES];
+
+#ifdef KTR_VERBOSE
+int ktr_verbose = KTR_VERBOSE;
+TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
+SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, "");
+#endif
+
+void
+ktr_tracepoint(u_int mask, const char *file, int line, const char *format,
+ u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5,
+ u_long arg6)
+{
+ struct ktr_entry *entry;
+ int newindex, saveindex;
+#ifdef KTR_VERBOSE
+ struct thread *td;
+#endif
+ int cpu;
+
+ if (panicstr)
+ return;
+ if ((ktr_mask & mask) == 0)
+ return;
+ cpu = KTR_CPU;
+ if (((1 << cpu) & ktr_cpumask) == 0)
+ return;
+#ifdef KTR_VERBOSE
+ td = curthread;
+ if (td->td_inktr)
+ return;
+ td->td_inktr++;
+#endif
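+	/*
+	 * Claim a slot in the circular trace buffer without a lock by
+	 * advancing ktr_idx with a compare-and-swap; KTR_ENTRIES must be a
+	 * power of two for the index mask below to work.
+	 */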
+ do {
+ saveindex = ktr_idx;
+ newindex = (saveindex + 1) & (KTR_ENTRIES - 1);
+ } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0);
+ entry = &ktr_buf[saveindex];
+ entry->ktr_timestamp = KTR_TIME;
+ entry->ktr_cpu = cpu;
+ entry->ktr_file = file;
+ entry->ktr_line = line;
+#ifdef KTR_VERBOSE
+ if (ktr_verbose) {
+#ifdef SMP
+ printf("cpu%d ", cpu);
+#endif
+ if (ktr_verbose > 1) {
+ printf("%s.%d\t", entry->ktr_file,
+ entry->ktr_line);
+ }
+ printf(format, arg1, arg2, arg3, arg4, arg5, arg6);
+ printf("\n");
+ }
+#endif
+ entry->ktr_desc = format;
+ entry->ktr_parms[0] = arg1;
+ entry->ktr_parms[1] = arg2;
+ entry->ktr_parms[2] = arg3;
+ entry->ktr_parms[3] = arg4;
+ entry->ktr_parms[4] = arg5;
+ entry->ktr_parms[5] = arg6;
+#ifdef KTR_VERBOSE
+ td->td_inktr--;
+#endif
+}
+
+#ifdef DDB
+
+struct tstate {
+ int cur;
+ int first;
+};
+static struct tstate tstate;
+static int db_ktr_verbose;
+static int db_mach_vtrace(void);
+
+#define NUM_LINES_PER_PAGE 18
+
+DB_SHOW_COMMAND(ktr, db_ktr_all)
+{
+ int c, lines;
+
+ lines = NUM_LINES_PER_PAGE;
+ tstate.cur = (ktr_idx - 1) & (KTR_ENTRIES - 1);
+ tstate.first = -1;
+ if (strcmp(modif, "v") == 0)
+ db_ktr_verbose = 1;
+ else
+ db_ktr_verbose = 0;
+ while (db_mach_vtrace())
+ if (--lines == 0) {
+ db_printf("--More--");
+ c = cngetc();
+ db_printf("\r");
+ switch (c) {
+ case '\n': /* one more line */
+ lines = 1;
+ break;
+ case ' ': /* one more page */
+ lines = NUM_LINES_PER_PAGE;
+ break;
+ default:
+ db_printf("\n");
+ return;
+ }
+ }
+}
+
+static int
+db_mach_vtrace(void)
+{
+ struct ktr_entry *kp;
+
+ if (tstate.cur == tstate.first) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ kp = &ktr_buf[tstate.cur];
+
+ /* Skip over unused entries. */
+ if (kp->ktr_desc == NULL) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ db_printf("%d: ", tstate.cur);
+#ifdef SMP
+ db_printf("cpu%d ", kp->ktr_cpu);
+#endif
+ if (db_ktr_verbose) {
+ db_printf("%10.10lld %s.%d\t", (long long)kp->ktr_timestamp,
+ kp->ktr_file, kp->ktr_line);
+ }
+ db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1],
+ kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4],
+ kp->ktr_parms[5]);
+ db_printf("\n");
+
+ if (tstate.first == -1)
+ tstate.first = tstate.cur;
+
+ if (--tstate.cur < 0)
+ tstate.cur = KTR_ENTRIES - 1;
+
+ return (1);
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 0000000..b71f695
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/ktrace.h>
+#include <sys/sema.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
+
+#ifdef KTRACE
+
+#ifndef KTRACE_REQUEST_POOL
+#define KTRACE_REQUEST_POOL 100
+#endif
+
+struct ktr_request {
+ struct ktr_header ktr_header;
+ struct ucred *ktr_cred;
+ struct vnode *ktr_vp;
+ union {
+ struct ktr_syscall ktr_syscall;
+ struct ktr_sysret ktr_sysret;
+ struct ktr_genio ktr_genio;
+ struct ktr_psig ktr_psig;
+ struct ktr_csw ktr_csw;
+ } ktr_data;
+ int ktr_synchronous;
+ STAILQ_ENTRY(ktr_request) ktr_list;
+};
+
+static int data_lengths[] = {
+ 0, /* none */
+ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
+ sizeof(struct ktr_sysret), /* KTR_SYSRET */
+ 0, /* KTR_NAMEI */
+ sizeof(struct ktr_genio), /* KTR_GENIO */
+ sizeof(struct ktr_psig), /* KTR_PSIG */
+ sizeof(struct ktr_csw), /* KTR_CSW */
+ 0 /* KTR_USER */
+};
+
+static STAILQ_HEAD(, ktr_request) ktr_todo;
+static STAILQ_HEAD(, ktr_request) ktr_free;
+
+static uint ktr_requestpool = KTRACE_REQUEST_POOL;
+TUNABLE_INT("kern.ktrace_request_pool", &ktr_requestpool);
+
+static int print_message = 1;
+struct mtx ktrace_mtx;
+static struct sema ktrace_sema;
+
+static void ktrace_init(void *dummy);
+static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
+static uint ktrace_resize_pool(uint newsize);
+static struct ktr_request *ktr_getrequest(int type);
+static void ktr_submitrequest(struct ktr_request *req);
+static void ktr_freerequest(struct ktr_request *req);
+static void ktr_loop(void *dummy);
+static void ktr_writerequest(struct ktr_request *req);
+static int ktrcanset(struct thread *,struct proc *);
+static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
+static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+
+static void
+ktrace_init(void *dummy)
+{
+ struct ktr_request *req;
+ int i;
+
+ mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
+ sema_init(&ktrace_sema, 0, "ktrace");
+ STAILQ_INIT(&ktr_todo);
+ STAILQ_INIT(&ktr_free);
+ for (i = 0; i < ktr_requestpool; i++) {
+ req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ }
+ kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, "ktrace");
+}
+SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
+
+static int
+sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
+{
+ struct thread *td;
+ uint newsize, oldsize, wantsize;
+ int error;
+
+ /* Handle easy read-only case first to avoid warnings from GCC. */
+ if (!req->newptr) {
+ mtx_lock(&ktrace_mtx);
+ oldsize = ktr_requestpool;
+ mtx_unlock(&ktrace_mtx);
+ return (SYSCTL_OUT(req, &oldsize, sizeof(uint)));
+ }
+
+ error = SYSCTL_IN(req, &wantsize, sizeof(uint));
+ if (error)
+ return (error);
+ td = curthread;
+ td->td_inktrace = 1;
+ mtx_lock(&ktrace_mtx);
+ oldsize = ktr_requestpool;
+ newsize = ktrace_resize_pool(wantsize);
+ mtx_unlock(&ktrace_mtx);
+ td->td_inktrace = 0;
+ error = SYSCTL_OUT(req, &oldsize, sizeof(uint));
+ if (error)
+ return (error);
+ if (newsize != wantsize)
+ return (ENOSPC);
+ return (0);
+}
+SYSCTL_PROC(_kern, OID_AUTO, ktrace_request_pool, CTLTYPE_UINT|CTLFLAG_RW,
+ &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
+
+static uint
+ktrace_resize_pool(uint newsize)
+{
+ struct ktr_request *req;
+
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ print_message = 1;
+ if (newsize == ktr_requestpool)
+ return (newsize);
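+	/*
+	 * Resize one request at a time, dropping ktrace_mtx around each
+	 * malloc() and free() since both may sleep.
+	 */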
+ if (newsize < ktr_requestpool)
+ /* Shrink pool down to newsize if possible. */
+ while (ktr_requestpool > newsize) {
+ req = STAILQ_FIRST(&ktr_free);
+ if (req == NULL)
+ return (ktr_requestpool);
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ ktr_requestpool--;
+ mtx_unlock(&ktrace_mtx);
+ free(req, M_KTRACE);
+ mtx_lock(&ktrace_mtx);
+ }
+ else
+ /* Grow pool up to newsize. */
+ while (ktr_requestpool < newsize) {
+ mtx_unlock(&ktrace_mtx);
+ req = malloc(sizeof(struct ktr_request), M_KTRACE,
+ M_WAITOK);
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ ktr_requestpool++;
+ }
+ return (ktr_requestpool);
+}
+
+static struct ktr_request *
+ktr_getrequest(int type)
+{
+ struct ktr_request *req;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int pm;
+
+ td->td_inktrace = 1;
+ mtx_lock(&ktrace_mtx);
+ if (!KTRCHECK(td, type)) {
+ mtx_unlock(&ktrace_mtx);
+ td->td_inktrace = 0;
+ return (NULL);
+ }
+ req = STAILQ_FIRST(&ktr_free);
+ if (req != NULL) {
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ req->ktr_header.ktr_type = type;
+ KASSERT(p->p_tracep != NULL, ("ktrace: no trace vnode"));
+ req->ktr_vp = p->p_tracep;
+ VREF(p->p_tracep);
+ mtx_unlock(&ktrace_mtx);
+ microtime(&req->ktr_header.ktr_time);
+ req->ktr_header.ktr_pid = p->p_pid;
+ bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1);
+ req->ktr_cred = crhold(td->td_ucred);
+ req->ktr_header.ktr_buffer = NULL;
+ req->ktr_header.ktr_len = 0;
+ req->ktr_synchronous = 0;
+ } else {
+ pm = print_message;
+ print_message = 0;
+ mtx_unlock(&ktrace_mtx);
+ if (pm)
+ printf("Out of ktrace request objects.\n");
+ td->td_inktrace = 0;
+ }
+ return (req);
+}
+
+static void
+ktr_submitrequest(struct ktr_request *req)
+{
+
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list);
+ sema_post(&ktrace_sema);
+ if (req->ktr_synchronous) {
+ /*
+ * For a synchronous request, we wait for the ktrace thread
+ * to get to our item in the todo list and wake us up. Then
+ * we write the request out ourselves and wake the ktrace
+ * thread back up.
+ */
+ msleep(req, &ktrace_mtx, curthread->td_priority, "ktrsync", 0);
+ mtx_unlock(&ktrace_mtx);
+ ktr_writerequest(req);
+ mtx_lock(&ktrace_mtx);
+ wakeup(req);
+ }
+ mtx_unlock(&ktrace_mtx);
+ curthread->td_inktrace = 0;
+}
+
+static void
+ktr_freerequest(struct ktr_request *req)
+{
+
+ crfree(req->ktr_cred);
+ mtx_lock(&Giant);
+ vrele(req->ktr_vp);
+ mtx_unlock(&Giant);
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ mtx_unlock(&ktrace_mtx);
+}
+
+static void
+ktr_loop(void *dummy)
+{
+ struct ktr_request *req;
+ struct thread *td;
+ struct ucred *cred;
+
+ /* Only cache these values once. */
+ td = curthread;
+ cred = td->td_ucred;
+ for (;;) {
+ sema_wait(&ktrace_sema);
+ mtx_lock(&ktrace_mtx);
+ req = STAILQ_FIRST(&ktr_todo);
+ STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list);
+ KASSERT(req != NULL, ("got a NULL request"));
+ if (req->ktr_synchronous) {
+ wakeup(req);
+ msleep(req, &ktrace_mtx, curthread->td_priority,
+ "ktrwait", 0);
+ mtx_unlock(&ktrace_mtx);
+ } else {
+ mtx_unlock(&ktrace_mtx);
+ /*
+ * It is not enough just to pass the cached cred
+ * to the VOP's in ktr_writerequest(). Some VFS
+ * operations use curthread->td_ucred, so we need
+ * to modify our thread's credentials as well.
+ * Evil.
+ */
+ td->td_ucred = req->ktr_cred;
+ ktr_writerequest(req);
+ td->td_ucred = cred;
+ }
+ ktr_freerequest(req);
+ }
+}
+
+/*
+ * MPSAFE
+ */
+void
+ktrsyscall(code, narg, args)
+ int code, narg;
+ register_t args[];
+{
+ struct ktr_request *req;
+ struct ktr_syscall *ktp;
+ size_t buflen;
+
+ req = ktr_getrequest(KTR_SYSCALL);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_syscall;
+ ktp->ktr_code = code;
+ ktp->ktr_narg = narg;
+ buflen = sizeof(register_t) * narg;
+ if (buflen > 0) {
+ req->ktr_header.ktr_buffer = malloc(buflen, M_KTRACE, M_WAITOK);
+ bcopy(args, req->ktr_header.ktr_buffer, buflen);
+ req->ktr_header.ktr_len = buflen;
+ }
+ ktr_submitrequest(req);
+}
+
+/*
+ * MPSAFE
+ */
+void
+ktrsysret(code, error, retval)
+ int code, error;
+ register_t retval;
+{
+ struct ktr_request *req;
+ struct ktr_sysret *ktp;
+
+ req = ktr_getrequest(KTR_SYSRET);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_sysret;
+ ktp->ktr_code = code;
+ ktp->ktr_error = error;
+ ktp->ktr_retval = retval; /* what about val2 ? */
+ ktr_submitrequest(req);
+}
+
+void
+ktrnamei(path)
+ char *path;
+{
+ struct ktr_request *req;
+ int namelen;
+
+ req = ktr_getrequest(KTR_NAMEI);
+ if (req == NULL)
+ return;
+ namelen = strlen(path);
+ if (namelen > 0) {
+ req->ktr_header.ktr_len = namelen;
+ req->ktr_header.ktr_buffer = malloc(namelen, M_KTRACE,
+ M_WAITOK);
+ bcopy(path, req->ktr_header.ktr_buffer, namelen);
+ }
+ ktr_submitrequest(req);
+}
+
+/*
+ * Since the uio may not stay valid, we can not hand off this request to
+ * the thread and need to process it synchronously. However, we wish to
+ * keep the relative order of records in a trace file correct, so we
+ * do put this request on the queue (if it isn't empty) and then block.
+ * The ktrace thread wakes us back up when it is time for this event to
+ * be posted and blocks until we have completed writing out the event
+ * and woken it back up.
+ */
+void
+ktrgenio(fd, rw, uio, error)
+ int fd;
+ enum uio_rw rw;
+ struct uio *uio;
+ int error;
+{
+ struct ktr_request *req;
+ struct ktr_genio *ktg;
+
+ if (error)
+ return;
+ req = ktr_getrequest(KTR_GENIO);
+ if (req == NULL)
+ return;
+ ktg = &req->ktr_data.ktr_genio;
+ ktg->ktr_fd = fd;
+ ktg->ktr_rw = rw;
+ req->ktr_header.ktr_buffer = uio;
+ uio->uio_offset = 0;
+ uio->uio_rw = UIO_WRITE;
+ req->ktr_synchronous = 1;
+ ktr_submitrequest(req);
+}
+
+void
+ktrpsig(sig, action, mask, code)
+ int sig;
+ sig_t action;
+ sigset_t *mask;
+ int code;
+{
+ struct ktr_request *req;
+ struct ktr_psig *kp;
+
+ req = ktr_getrequest(KTR_PSIG);
+ if (req == NULL)
+ return;
+ kp = &req->ktr_data.ktr_psig;
+ kp->signo = (char)sig;
+ kp->action = action;
+ kp->mask = *mask;
+ kp->code = code;
+ ktr_submitrequest(req);
+}
+
+void
+ktrcsw(out, user)
+ int out, user;
+{
+ struct ktr_request *req;
+ struct ktr_csw *kc;
+
+ req = ktr_getrequest(KTR_CSW);
+ if (req == NULL)
+ return;
+ kc = &req->ktr_data.ktr_csw;
+ kc->out = out;
+ kc->user = user;
+ ktr_submitrequest(req);
+}
+#endif
+
+/* Interface and common routines */
+
+/*
+ * ktrace system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ktrace_args {
+ char *fname;
+ int ops;
+ int facs;
+ int pid;
+};
+#endif
+/* ARGSUSED */
+int
+ktrace(td, uap)
+ struct thread *td;
+ register struct ktrace_args *uap;
+{
+#ifdef KTRACE
+ register struct vnode *vp = NULL;
+ register struct proc *p;
+ struct pgrp *pg;
+ int facs = uap->facs & ~KTRFAC_ROOT;
+ int ops = KTROP(uap->ops);
+ int descend = uap->ops & KTRFLAG_DESCEND;
+ int ret = 0;
+ int flags, error = 0;
+ struct nameidata nd;
+
+ td->td_inktrace = 1;
+ if (ops != KTROP_CLEAR) {
+ /*
+ * an operation which requires a file argument.
+ */
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td);
+ flags = FREAD | FWRITE | O_NOFOLLOW;
+ error = vn_open(&nd, &flags, 0);
+ if (error) {
+ td->td_inktrace = 0;
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ VOP_UNLOCK(vp, 0, td);
+ if (vp->v_type != VREG) {
+ (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+ td->td_inktrace = 0;
+ return (EACCES);
+ }
+ }
+ /*
+ * Clear all uses of the tracefile.
+ */
+ if (ops == KTROP_CLEARFILE) {
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (p->p_tracep == vp) {
+ if (ktrcanset(td, p)) {
+ mtx_lock(&ktrace_mtx);
+ p->p_tracep = NULL;
+ p->p_traceflag = 0;
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ (void) vn_close(vp, FREAD|FWRITE,
+ td->td_ucred, td);
+ } else {
+ PROC_UNLOCK(p);
+ error = EPERM;
+ }
+ } else
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ goto done;
+ }
+ /*
+ * need something to (un)trace (XXX - why is this here?)
+ */
+ if (!facs) {
+ error = EINVAL;
+ goto done;
+ }
+ /*
+ * do it
+ */
+ if (uap->pid < 0) {
+ /*
+ * by process group
+ */
+ sx_slock(&proctree_lock);
+ pg = pgfind(-uap->pid);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ error = ESRCH;
+ goto done;
+ }
+ /*
+ * ktrops() may call vrele(). Lock pg_members
+ * by the proctree_lock rather than pg_mtx.
+ */
+ PGRP_UNLOCK(pg);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist)
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ sx_sunlock(&proctree_lock);
+ } else {
+ /*
+ * by pid
+ */
+ p = pfind(uap->pid);
+ if (p == NULL) {
+ error = ESRCH;
+ goto done;
+ }
+ PROC_UNLOCK(p);
+ /* XXX: UNLOCK above has a race */
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ }
+ if (!ret)
+ error = EPERM;
+done:
+ if (vp != NULL)
+ (void) vn_close(vp, FWRITE, td->td_ucred, td);
+ td->td_inktrace = 0;
+ return (error);
+#else
+ return ENOSYS;
+#endif
+}
+
+/*
+ * utrace system call
+ */
+/* ARGSUSED */
+int
+utrace(td, uap)
+ struct thread *td;
+ register struct utrace_args *uap;
+{
+
+#ifdef KTRACE
+ struct ktr_request *req;
+ register caddr_t cp;
+
+ if (uap->len > KTR_USER_MAXLEN)
+ return (EINVAL);
+ req = ktr_getrequest(KTR_USER);
+ if (req == NULL)
+ return (0);
+ MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK);
+ if (!copyin(uap->addr, cp, uap->len)) {
+ req->ktr_header.ktr_buffer = cp;
+ req->ktr_header.ktr_len = uap->len;
+ ktr_submitrequest(req);
+ } else {
+ ktr_freerequest(req);
+ td->td_inktrace = 0;
+ }
+ return (0);
+#else
+ return (ENOSYS);
+#endif
+}
+
+#ifdef KTRACE
+static int
+ktrops(td, p, ops, facs, vp)
+ struct thread *td;
+ struct proc *p;
+ int ops, facs;
+ struct vnode *vp;
+{
+ struct vnode *tracevp = NULL;
+
+ PROC_LOCK(p);
+ if (!ktrcanset(td, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ mtx_lock(&ktrace_mtx);
+ if (ops == KTROP_SET) {
+ if (p->p_tracep != vp) {
+ /*
+ * if trace file already in use, relinquish below
+ */
+ tracevp = p->p_tracep;
+ VREF(vp);
+ p->p_tracep = vp;
+ }
+ p->p_traceflag |= facs;
+ if (td->td_ucred->cr_uid == 0)
+ p->p_traceflag |= KTRFAC_ROOT;
+ } else {
+ /* KTROP_CLEAR */
+ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
+ /* no more tracing */
+ p->p_traceflag = 0;
+ tracevp = p->p_tracep;
+ p->p_tracep = NULL;
+ }
+ }
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (tracevp != NULL)
+ vrele(tracevp);
+
+ return (1);
+}
+
+static int
+ktrsetchildren(td, top, ops, facs, vp)
+ struct thread *td;
+ struct proc *top;
+ int ops, facs;
+ struct vnode *vp;
+{
+ register struct proc *p;
+ register int ret = 0;
+
+ p = top;
+ sx_slock(&proctree_lock);
+ for (;;) {
+ ret |= ktrops(td, p, ops, facs, vp);
+ /*
+ * If this process has children, descend to them next,
+ * otherwise do any siblings, and if done with this level,
+ * follow back up the tree (but not past top).
+ */
+ if (!LIST_EMPTY(&p->p_children))
+ p = LIST_FIRST(&p->p_children);
+ else for (;;) {
+ if (p == top) {
+ sx_sunlock(&proctree_lock);
+ return (ret);
+ }
+ if (LIST_NEXT(p, p_sibling)) {
+ p = LIST_NEXT(p, p_sibling);
+ break;
+ }
+ p = p->p_pptr;
+ }
+ }
+ /*NOTREACHED*/
+}
+
+static void
+ktr_writerequest(struct ktr_request *req)
+{
+ struct ktr_header *kth;
+ struct vnode *vp;
+ struct uio *uio = NULL;
+ struct proc *p;
+ struct thread *td;
+ struct ucred *cred;
+ struct uio auio;
+ struct iovec aiov[3];
+ struct mount *mp;
+ int datalen, buflen, vrele_count;
+ int error;
+
+ vp = req->ktr_vp;
+ /*
+ * If vp is NULL, the vp has been cleared out from under this
+ * request, so just drop it.
+ */
+ if (vp == NULL)
+ return;
+ kth = &req->ktr_header;
+ datalen = data_lengths[kth->ktr_type];
+ buflen = kth->ktr_len;
+ cred = req->ktr_cred;
+ td = curthread;
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ aiov[0].iov_base = (caddr_t)kth;
+ aiov[0].iov_len = sizeof(struct ktr_header);
+ auio.uio_resid = sizeof(struct ktr_header);
+ auio.uio_iovcnt = 1;
+ auio.uio_td = td;
+ if (datalen != 0) {
+ aiov[1].iov_base = (caddr_t)&req->ktr_data;
+ aiov[1].iov_len = datalen;
+ auio.uio_resid += datalen;
+ auio.uio_iovcnt++;
+ kth->ktr_len += datalen;
+ }
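+	/*
+	 * For most records ktr_buffer is a flat buffer written out after the
+	 * header; for KTR_GENIO it instead holds the uio describing the I/O,
+	 * which is written with a second VOP_WRITE() below.
+	 */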
+ if (buflen != 0) {
+ KASSERT(kth->ktr_buffer != NULL, ("ktrace: nothing to write"));
+ aiov[auio.uio_iovcnt].iov_base = kth->ktr_buffer;
+ aiov[auio.uio_iovcnt].iov_len = buflen;
+ auio.uio_resid += buflen;
+ auio.uio_iovcnt++;
+ } else
+ uio = kth->ktr_buffer;
+ KASSERT((uio == NULL) ^ (kth->ktr_type == KTR_GENIO),
+ ("ktrace: uio and genio mismatch"));
+ if (uio != NULL)
+ kth->ktr_len += uio->uio_resid;
+ mtx_lock(&Giant);
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ (void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
+ error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
+ if (error == 0 && uio != NULL) {
+ (void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
+ error = VOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, cred);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ mtx_unlock(&Giant);
+ if (buflen != 0)
+ free(kth->ktr_buffer, M_KTRACE);
+ if (!error)
+ return;
+ /*
+ * If error encountered, give up tracing on this vnode. We defer
+ * all the vrele()'s on the vnode until after we are finished walking
+ * the various lists to avoid needlessly holding locks.
+ */
+ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
+ error);
+ vrele_count = 0;
+ /*
+ * First, clear this vnode from being used by any processes in the
+ * system.
+ * XXX - If one process gets an EPERM writing to the vnode, should
+ * we really do this? Other processes might have suitable
+ * credentials for the operation.
+ */
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (p->p_tracep == vp) {
+ mtx_lock(&ktrace_mtx);
+ p->p_tracep = NULL;
+ p->p_traceflag = 0;
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ /*
+ * Second, clear this vnode from any pending requests.
+ */
+ mtx_lock(&ktrace_mtx);
+ STAILQ_FOREACH(req, &ktr_todo, ktr_list) {
+ if (req->ktr_vp == vp) {
+ req->ktr_vp = NULL;
+ vrele_count++;
+ }
+ }
+ mtx_unlock(&ktrace_mtx);
+ mtx_lock(&Giant);
+ while (vrele_count-- > 0)
+ vrele(vp);
+ mtx_unlock(&Giant);
+}
+
+/*
+ * Return true if caller has permission to set the ktracing state
+ * of target. Essentially, the target can't possess any
+ * more permissions than the caller. KTRFAC_ROOT signifies that
+ * root previously set the tracing status on the target process, and
+ * so, only root may further change it.
+ */
+static int
+ktrcanset(td, targetp)
+ struct thread *td;
+ struct proc *targetp;
+{
+
+ PROC_LOCK_ASSERT(targetp, MA_OWNED);
+ if (targetp->p_traceflag & KTRFAC_ROOT &&
+ suser_cred(td->td_ucred, PRISON_ROOT))
+ return (0);
+
+ if (p_candebug(td, targetp) != 0)
+ return (0);
+
+ return (1);
+}
+
+#endif /* KTRACE */
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
new file mode 100644
index 0000000..a506726
--- /dev/null
+++ b/sys/kern/kern_linker.c
@@ -0,0 +1,1812 @@
+/*-
+ * Copyright (c) 1997-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+#include <sys/fcntl.h>
+#include <sys/libkern.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/sysctl.h>
+
+#include "linker_if.h"
+
+#ifdef KLD_DEBUG
+int kld_debug = 0;
+#endif
+
+/*
+ * static char *linker_search_path(const char *name, struct mod_depend
+ * *verinfo);
+ */
+static const char *linker_basename(const char *path);
+static int linker_load_module(const char *kldname, const char *modname,
+ struct linker_file *parent, struct mod_depend *verinfo,
+ struct linker_file **lfpp);
+
+/* Metadata from the static kernel */
+SET_DECLARE(modmetadata_set, struct mod_metadata);
+
+MALLOC_DEFINE(M_LINKER, "linker", "kernel linker");
+
+linker_file_t linker_kernel_file;
+
+static struct mtx kld_mtx; /* kernel linker mutex */
+
+static linker_class_list_t classes;
+static linker_file_list_t linker_files;
+static int next_file_id = 1;
+static int linker_no_more_classes = 0;
+
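+/*
+ * Pick the next file id that is not already in use by a loaded file,
+ * rescanning the list from the start whenever a collision is found.
+ */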
+#define LINKER_GET_NEXT_FILE_ID(a) do { \
+ linker_file_t lftmp; \
+ \
+retry: \
+ mtx_lock(&kld_mtx); \
+ TAILQ_FOREACH(lftmp, &linker_files, link) { \
+ if (next_file_id == lftmp->id) { \
+ next_file_id++; \
+ mtx_unlock(&kld_mtx); \
+ goto retry; \
+ } \
+ } \
+ (a) = next_file_id; \
+ mtx_unlock(&kld_mtx); /* Hold for safe read of id variable */ \
+} while(0)
+
+
+/* XXX wrong name; we're looking at version provision tags here, not modules */
+typedef TAILQ_HEAD(, modlist) modlisthead_t;
+struct modlist {
+ TAILQ_ENTRY(modlist) link; /* chain together all modules */
+ linker_file_t container;
+ const char *name;
+ int version;
+};
+typedef struct modlist *modlist_t;
+static modlisthead_t found_modules;
+
+static modlist_t modlist_lookup2(const char *name,
+ struct mod_depend *verinfo);
+
+static char *
+linker_strdup(const char *str)
+{
+ char *result;
+
+ if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL)
+ strcpy(result, str);
+ return (result);
+}
+
+static void
+linker_init(void *arg)
+{
+
+ mtx_init(&kld_mtx, "kernel linker", NULL, MTX_DEF);
+ TAILQ_INIT(&classes);
+ TAILQ_INIT(&linker_files);
+}
+
+SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0)
+
+static void
+linker_stop_class_add(void *arg)
+{
+
+ linker_no_more_classes = 1;
+}
+
+SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL)
+
+int
+linker_add_class(linker_class_t lc)
+{
+
+ /*
+	 * We disallow any class registration past SI_ORDER_ANY
+ * of SI_SUB_KLD.
+ */
+ if (linker_no_more_classes == 1)
+ return (EPERM);
+ kobj_class_compile((kobj_class_t) lc);
+ TAILQ_INSERT_TAIL(&classes, lc, link);
+ return (0);
+}
+
+static void
+linker_file_sysinit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
+ lf->filename));
+
+ if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0)
+ return;
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+}
+
+static void
+linker_file_sysuninit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
+ lf->filename));
+
+ if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop,
+ NULL) != 0)
+ return;
+
+ /*
+ * Perform a reverse bubble sort of the system initialization objects
+ * by their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem > (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order >= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+}
+
+static void
+linker_file_register_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+ KLD_DPF(FILE,
+ ("linker_file_register_sysctls: registering SYSCTLs for %s\n",
+ lf->filename));
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_register_oid(*oidp);
+}
+
+static void
+linker_file_unregister_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+	KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs"
+ " for %s\n", lf->filename));
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_unregister_oid(*oidp);
+}
+
+static int
+linker_file_register_modules(linker_file_t lf)
+{
+ struct mod_metadata **start, **stop, **mdp;
+ const moduledata_t *moddata;
+ int error;
+
+ KLD_DPF(FILE, ("linker_file_register_modules: registering modules"
+ " in %s\n", lf->filename));
+
+ if (linker_file_lookup_set(lf, "modmetadata_set", &start,
+ &stop, 0) != 0) {
+ /*
+ * This fallback should be unnecessary, but if we get booted
+ * from boot2 instead of loader and we are missing our
+ * metadata then we have to try the best we can.
+ */
+ if (lf == linker_kernel_file) {
+ start = SET_BEGIN(modmetadata_set);
+ stop = SET_LIMIT(modmetadata_set);
+ } else
+ return (0);
+ }
+ for (mdp = start; mdp < stop; mdp++) {
+ if ((*mdp)->md_type != MDT_MODULE)
+ continue;
+ moddata = (*mdp)->md_data;
+ KLD_DPF(FILE, ("Registering module %s in %s\n",
+ moddata->name, lf->filename));
+ error = module_register(moddata, lf);
+ if (error)
+ printf("Module %s failed to register: %d\n",
+ moddata->name, error);
+ }
+ return (0);
+}
+
+static void
+linker_init_kernel_modules(void)
+{
+
+ linker_file_register_modules(linker_kernel_file);
+}
+
+SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0)
+
+int
+linker_load_file(const char *filename, linker_file_t *result)
+{
+ linker_class_t lc;
+ linker_file_t lf;
+ int foundfile, error = 0;
+
+ /* Refuse to load modules if securelevel raised */
+ if (securelevel > 0)
+ return (EPERM);
+
+ lf = linker_find_file_by_name(filename);
+ if (lf) {
+ KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
+ " incrementing refs\n", filename));
+ *result = lf;
+ lf->refs++;
+ goto out;
+ }
+ lf = NULL;
+ foundfile = 0;
+
+ /*
+ * We do not need to protect (lock) classes here because there is
+ * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY)
+ * and there is no class deregistration mechanism at this time.
+ */
+ TAILQ_FOREACH(lc, &classes, link) {
+ KLD_DPF(FILE, ("linker_load_file: trying to load %s\n",
+ filename));
+ error = LINKER_LOAD_FILE(lc, filename, &lf);
+ /*
+ * If we got something other than ENOENT, then it exists but
+ * we cannot load it for some other reason.
+ */
+ if (error != ENOENT)
+ foundfile = 1;
+ if (lf) {
+ linker_file_register_modules(lf);
+ linker_file_register_sysctls(lf);
+ linker_file_sysinit(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+ *result = lf;
+ error = 0;
+ goto out;
+ }
+ }
+ /*
+ * Less than ideal, but tells the user whether it failed to load or
+ * the module was not found.
+ */
+ if (foundfile)
+ /* Format not recognized (or unloadable). */
+ error = ENOEXEC;
+ else
+ error = ENOENT; /* Nothing found */
+out:
+ return (error);
+}
+
+int
+linker_reference_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t *result)
+{
+ modlist_t mod;
+
+ if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
+ *result = mod->container;
+ (*result)->refs++;
+ return (0);
+ }
+
+ return (linker_load_module(NULL, modname, NULL, verinfo, result));
+}
+
+linker_file_t
+linker_find_file_by_name(const char *filename)
+{
+ linker_file_t lf = 0;
+ char *koname;
+
+ koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
+ if (koname == NULL)
+ goto out;
+ sprintf(koname, "%s.ko", filename);
+
+ mtx_lock(&kld_mtx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (strcmp(lf->filename, koname) == 0)
+ break;
+ if (strcmp(lf->filename, filename) == 0)
+ break;
+ }
+ mtx_unlock(&kld_mtx);
+out:
+ if (koname)
+ free(koname, M_LINKER);
+ return (lf);
+}
+
+linker_file_t
+linker_find_file_by_id(int fileid)
+{
+ linker_file_t lf = 0;
+
+ mtx_lock(&kld_mtx);
+ TAILQ_FOREACH(lf, &linker_files, link)
+ if (lf->id == fileid)
+ break;
+ mtx_unlock(&kld_mtx);
+ return (lf);
+}
+
+linker_file_t
+linker_make_file(const char *pathname, linker_class_t lc)
+{
+ linker_file_t lf;
+ const char *filename;
+
+ lf = NULL;
+ filename = linker_basename(pathname);
+
+ KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename));
+ lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
+ if (lf == NULL)
+ goto out;
+ lf->refs = 1;
+ lf->userrefs = 0;
+ lf->flags = 0;
+ lf->filename = linker_strdup(filename);
+ LINKER_GET_NEXT_FILE_ID(lf->id);
+ lf->ndeps = 0;
+ lf->deps = NULL;
+ STAILQ_INIT(&lf->common);
+ TAILQ_INIT(&lf->modules);
+ mtx_lock(&kld_mtx);
+ TAILQ_INSERT_TAIL(&linker_files, lf, link);
+ mtx_unlock(&kld_mtx);
+out:
+ return (lf);
+}
+
+int
+linker_file_unload(linker_file_t file)
+{
+ module_t mod, next;
+ modlist_t ml, nextml;
+ struct common_symbol *cp;
+ int error, i;
+
+ error = 0;
+
+ /* Refuse to unload modules if securelevel raised. */
+ if (securelevel > 0)
+ return (EPERM);
+
+ KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
+ if (file->refs == 1) {
+ KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
+ " informing modules\n"));
+
+ /*
+ * Inform any modules associated with this file.
+ */
+ MOD_XLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+ next = module_getfnext(mod);
+ MOD_XUNLOCK;
+
+ /*
+ * Give the module a chance to veto the unload.
+ */
+ if ((error = module_unload(mod)) != 0) {
+ KLD_DPF(FILE, ("linker_file_unload: module %x"
+ " vetoes unload\n", mod));
+ goto out;
+ } else
+ MOD_XLOCK;
+ module_release(mod);
+ }
+ MOD_XUNLOCK;
+ }
+ file->refs--;
+ if (file->refs > 0) {
+ goto out;
+ }
+ for (ml = TAILQ_FIRST(&found_modules); ml; ml = nextml) {
+ nextml = TAILQ_NEXT(ml, link);
+ if (ml->container == file)
+ TAILQ_REMOVE(&found_modules, ml, link);
+ }
+
+ /*
+ * Don't try to run SYSUNINITs if we are unloaded due to a
+ * link error.
+ */
+ if (file->flags & LINKER_FILE_LINKED) {
+ linker_file_sysuninit(file);
+ linker_file_unregister_sysctls(file);
+ }
+ mtx_lock(&kld_mtx);
+ TAILQ_REMOVE(&linker_files, file, link);
+ mtx_unlock(&kld_mtx);
+
+ if (file->deps) {
+ for (i = 0; i < file->ndeps; i++)
+ linker_file_unload(file->deps[i]);
+ free(file->deps, M_LINKER);
+ file->deps = NULL;
+ }
+ for (cp = STAILQ_FIRST(&file->common); cp;
+ cp = STAILQ_FIRST(&file->common)) {
+ STAILQ_REMOVE(&file->common, cp, common_symbol, link);
+ free(cp, M_LINKER);
+ }
+
+ LINKER_UNLOAD(file);
+ if (file->filename) {
+ free(file->filename, M_LINKER);
+ file->filename = NULL;
+ }
+ kobj_delete((kobj_t) file, M_LINKER);
+out:
+ return (error);
+}
+
+int
+linker_file_add_dependency(linker_file_t file, linker_file_t dep)
+{
+ linker_file_t *newdeps;
+
+ newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (newdeps == NULL)
+ return (ENOMEM);
+
+ if (file->deps) {
+ bcopy(file->deps, newdeps,
+ file->ndeps * sizeof(linker_file_t *));
+ free(file->deps, M_LINKER);
+ }
+ file->deps = newdeps;
+ file->deps[file->ndeps] = dep;
+ file->ndeps++;
+ return (0);
+}
+
+/*
+ * Locate a linker set and its contents. This is a helper function to avoid
+ * linker_if.h exposure elsewhere. Note: firstp and lastp are really void ***
+ */
+int
+linker_file_lookup_set(linker_file_t file, const char *name,
+ void *firstp, void *lastp, int *countp)
+{
+
+ return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp));
+}
+
+caddr_t
+linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ caddr_t address;
+ size_t common_size = 0;
+ int i;
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n",
+ file, name, deps));
+
+ if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) {
+ LINKER_SYMBOL_VALUES(file, sym, &symval);
+ if (symval.value == 0)
+ /*
+ * For commons, first look them up in the
+ * dependencies and only allocate space if not found
+ * there.
+ */
+ common_size = symval.size;
+ else {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol"
+ ".value=%x\n", symval.value));
+ return (symval.value);
+ }
+ }
+ if (deps) {
+ for (i = 0; i < file->ndeps; i++) {
+ address = linker_file_lookup_symbol(file->deps[i],
+ name, 0);
+ if (address) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " deps value=%x\n", address));
+ return (address);
+ }
+ }
+ }
+ if (common_size > 0) {
+ /*
+ * This is a common symbol which was not found in the
+ * dependencies. We maintain a simple common symbol table in
+ * the file object.
+ */
+ struct common_symbol *cp;
+
+ STAILQ_FOREACH(cp, &file->common, link) {
+ if (strcmp(cp->name, name) == 0) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " old common value=%x\n", cp->address));
+ return (cp->address);
+ }
+ }
+ /*
+ * Round the symbol size up to align.
+ */
+ common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
+ cp = malloc(sizeof(struct common_symbol)
+ + common_size + strlen(name) + 1, M_LINKER,
+ M_WAITOK | M_ZERO);
+ if (cp == NULL) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n"));
+ return (0);
+ }
+ cp->address = (caddr_t)(cp + 1);
+ cp->name = cp->address + common_size;
+ strcpy(cp->name, name);
+ bzero(cp->address, common_size);
+ STAILQ_INSERT_TAIL(&file->common, cp, link);
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: new common"
+ " value=%x\n", cp->address));
+ return (cp->address);
+ }
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
+ return (0);
+}
+
+#ifdef DDB
+/*
+ * DDB Helpers. DDB has to look across multiple files with their own symbol
+ * tables and string tables.
+ *
+ * Note that we do not obey list locking protocols here. We really don't need
+ * DDB to hang because somebody's got the lock held. We'll take the chance
+ * that the files list is inconsistent instead.
+ */
+
+int
+linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
+{
+ linker_file_t lf;
+ c_linker_sym_t best, es;
+ u_long diff, bestdiff, off;
+
+ best = 0;
+ off = (uintptr_t)value;
+ bestdiff = off;
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0)
+ continue;
+ if (es != 0 && diff < bestdiff) {
+ best = es;
+ bestdiff = diff;
+ }
+ if (bestdiff == 0)
+ break;
+ }
+ if (best) {
+ *sym = best;
+ *diffp = bestdiff;
+ return (0);
+ } else {
+ *sym = 0;
+ *diffp = off;
+ return (ENOENT);
+ }
+}
+
+int
+linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+#endif
+
+/*
+ * Syscalls.
+ */
+/*
+ * MPSAFE
+ */
+int
+kldload(struct thread *td, struct kldload_args *uap)
+{
+ char *kldname, *modname;
+ char *pathname = NULL;
+ linker_file_t lf;
+ int error = 0;
+
+ td->td_retval[0] = -1;
+
+ mtx_lock(&Giant);
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ goto out;
+
+ if ((error = suser(td)) != 0)
+ goto out;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN,
+ NULL)) != 0)
+ goto out;
+
+ /*
+ * If the path contains no '/' and no '.' (i.e. it is not a
+ * qualified file name such as kldname.ko or kldname.ver.ko),
+ * treat it as a module (interface) name; otherwise treat it
+ * as a KLD file name.
+ */
+ if (index(pathname, '/') || index(pathname, '.')) {
+ kldname = pathname;
+ modname = NULL;
+ } else {
+ kldname = NULL;
+ modname = pathname;
+ }
+ error = linker_load_module(kldname, modname, NULL, NULL, &lf);
+ if (error)
+ goto out;
+
+ lf->userrefs++;
+ td->td_retval[0] = lf->id;
+out:
+ if (pathname)
+ free(pathname, M_TEMP);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldunload(struct thread *td, struct kldunload_args *uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ goto out;
+
+ if ((error = suser(td)) != 0)
+ goto out;
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
+ if (lf->userrefs == 0) {
+ printf("kldunload: attempt to unload file that was"
+ " loaded by the kernel\n");
+ error = EBUSY;
+ goto out;
+ }
+ lf->userrefs--;
+ error = linker_file_unload(lf);
+ if (error)
+ lf->userrefs++;
+ } else
+ error = ENOENT;
+out:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldfind(struct thread *td, struct kldfind_args *uap)
+{
+ char *pathname;
+ const char *filename;
+ linker_file_t lf;
+ int error = 0;
+
+ mtx_lock(&Giant);
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN,
+ NULL)) != 0)
+ goto out;
+
+ filename = linker_basename(pathname);
+ lf = linker_find_file_by_name(filename);
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ error = ENOENT;
+out:
+ if (pathname)
+ free(pathname, M_TEMP);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldnext(struct thread *td, struct kldnext_args *uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ if (SCARG(uap, fileid) == 0) {
+ mtx_lock(&kld_mtx);
+ if (TAILQ_FIRST(&linker_files))
+ td->td_retval[0] = TAILQ_FIRST(&linker_files)->id;
+ else
+ td->td_retval[0] = 0;
+ mtx_unlock(&kld_mtx);
+ goto out;
+ }
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ if (TAILQ_NEXT(lf, link))
+ td->td_retval[0] = TAILQ_NEXT(lf, link)->id;
+ else
+ td->td_retval[0] = 0;
+ } else
+ error = ENOENT;
+out:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldstat(struct thread *td, struct kldstat_args *uap)
+{
+ linker_file_t lf;
+ int error = 0;
+ int namelen, version;
+ struct kld_file_stat *stat;
+
+ mtx_lock(&Giant);
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ stat = SCARG(uap, stat);
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
+ goto out;
+ if (version != sizeof(struct kld_file_stat)) {
+ error = EINVAL;
+ goto out;
+ }
+ namelen = strlen(lf->filename) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ if ((error = copyout(lf->filename, &stat->name[0], namelen)) != 0)
+ goto out;
+ if ((error = copyout(&lf->refs, &stat->refs, sizeof(int))) != 0)
+ goto out;
+ if ((error = copyout(&lf->id, &stat->id, sizeof(int))) != 0)
+ goto out;
+ if ((error = copyout(&lf->address, &stat->address,
+ sizeof(caddr_t))) != 0)
+ goto out;
+ if ((error = copyout(&lf->size, &stat->size, sizeof(size_t))) != 0)
+ goto out;
+
+ td->td_retval[0] = 0;
+out:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
+{
+ linker_file_t lf;
+ module_t mp;
+ int error = 0;
+
+ mtx_lock(&Giant);
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ MOD_SLOCK;
+ mp = TAILQ_FIRST(&lf->modules);
+ if (mp != NULL)
+ td->td_retval[0] = module_getid(mp);
+ else
+ td->td_retval[0] = 0;
+ MOD_SUNLOCK;
+ } else
+ error = ENOENT;
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+kldsym(struct thread *td, struct kldsym_args *uap)
+{
+ char *symstr = NULL;
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ linker_file_t lf;
+ struct kld_sym_lookup lookup;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ if ((error = copyin(SCARG(uap, data), &lookup, sizeof(lookup))) != 0)
+ goto out;
+ if (lookup.version != sizeof(lookup) ||
+ SCARG(uap, cmd) != KLDSYM_LOOKUP) {
+ error = EINVAL;
+ goto out;
+ }
+ symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ if (SCARG(uap, fileid) != 0) {
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t) symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, SCARG(uap, data),
+ sizeof(lookup));
+ } else
+ error = ENOENT;
+ } else {
+ mtx_lock(&kld_mtx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t)symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, SCARG(uap, data),
+ sizeof(lookup));
+ break;
+ }
+ }
+ mtx_unlock(&kld_mtx);
+ if (lf == NULL)
+ error = ENOENT;
+ }
+out:
+ if (symstr)
+ free(symstr, M_TEMP);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Preloaded module support
+ */
+
+static modlist_t
+modlist_lookup(const char *name, int ver)
+{
+ modlist_t mod;
+
+ TAILQ_FOREACH(mod, &found_modules, link) {
+ if (strcmp(mod->name, name) == 0 &&
+ (ver == 0 || mod->version == ver))
+ return (mod);
+ }
+ return (NULL);
+}
+
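+/*
+ * Like modlist_lookup(), but honor the version bounds in "verinfo": return
+ * an exact match on md_ver_preferred if one exists, otherwise the highest
+ * version that falls within [md_ver_minimum, md_ver_maximum].
+ */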
+static modlist_t
+modlist_lookup2(const char *name, struct mod_depend *verinfo)
+{
+ modlist_t mod, bestmod;
+ int ver;
+
+ if (verinfo == NULL)
+ return (modlist_lookup(name, 0));
+ bestmod = NULL;
+ for (mod = TAILQ_FIRST(&found_modules); mod;
+ mod = TAILQ_NEXT(mod, link)) {
+ if (strcmp(mod->name, name) != 0)
+ continue;
+ ver = mod->version;
+ if (ver == verinfo->md_ver_preferred)
+ return (mod);
+ if (ver >= verinfo->md_ver_minimum &&
+ ver <= verinfo->md_ver_maximum &&
+ (bestmod == NULL || ver > bestmod->version))
+ bestmod = mod;
+ }
+ return (bestmod);
+}
+
+static modlist_t
+modlist_newmodule(const char *modname, int version, linker_file_t container)
+{
+ modlist_t mod;
+
+ mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO);
+ if (mod == NULL)
+ panic("no memory for module list");
+ mod->container = container;
+ mod->name = modname;
+ mod->version = version;
+ TAILQ_INSERT_TAIL(&found_modules, mod, link);
+ return (mod);
+}
+
+/*
+ * This routine is cheap and nasty but will work for data pointers: the
+ * metadata of a preloaded file holds unrelocated values, so adding the
+ * file's load address yields a usable kernel pointer.
+ */
+static void *
+linker_reloc_ptr(linker_file_t lf, const void *offset)
+{
+ return (lf->address + (uintptr_t)offset);
+}
+
+/*
+ * Dereference MDT_VERSION metadata into module name and version
+ */
+static void
+linker_mdt_version(linker_file_t lf, struct mod_metadata *mp,
+ const char **modname, int *version)
+{
+ struct mod_version *mvp;
+
+ if (modname)
+ *modname = linker_reloc_ptr(lf, mp->md_cval);
+ if (version) {
+ mvp = linker_reloc_ptr(lf, mp->md_data);
+ *version = mvp->mv_version;
+ }
+}
+
+/*
+ * Dereference MDT_DEPEND metadata into module name and mod_depend structure
+ */
+static void
+linker_mdt_depend(linker_file_t lf, struct mod_metadata *mp,
+ const char **modname, struct mod_depend **verinfo)
+{
+
+ if (modname)
+ *modname = linker_reloc_ptr(lf, mp->md_cval);
+ if (verinfo)
+ *verinfo = linker_reloc_ptr(lf, mp->md_data);
+}
+
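+/*
+ * Register every MDT_VERSION record found in the metadata set of "lf" in
+ * found_modules, complaining about duplicates. "preload" is nonzero when
+ * the metadata pointers can be used directly; otherwise they are first
+ * relocated with linker_reloc_ptr().
+ */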
+static void
+linker_addmodules(linker_file_t lf, struct mod_metadata **start,
+ struct mod_metadata **stop, int preload)
+{
+ struct mod_metadata *mp, **mdp;
+ const char *modname;
+ int ver;
+
+ for (mdp = start; mdp < stop; mdp++) {
+ if (preload)
+ mp = *mdp;
+ else
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ if (preload) {
+ modname = mp->md_cval;
+ ver = ((struct mod_version *)mp->md_data)->mv_version;
+ } else
+ linker_mdt_version(lf, mp, &modname, &ver);
+ if (modlist_lookup(modname, ver) != NULL) {
+ printf("module %s already present!\n", modname);
+ /* XXX what can we do? this is a build error. :-( */
+ continue;
+ }
+ modlist_newmodule(modname, ver, lf);
+ }
+}
+
+static void
+linker_preload(void *arg)
+{
+ caddr_t modptr;
+ const char *modname, *nmodname;
+ char *modtype;
+ linker_file_t lf;
+ linker_class_t lc;
+ int error;
+ linker_file_list_t loaded_files;
+ linker_file_list_t depended_files;
+ struct mod_metadata *mp, *nmp;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_depend *verinfo;
+ int nver;
+ int resolves;
+ modlist_t mod;
+ struct sysinit **si_start, **si_stop;
+
+ TAILQ_INIT(&loaded_files);
+ TAILQ_INIT(&depended_files);
+ TAILQ_INIT(&found_modules);
+ error = 0;
+
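+ /*
+ * Walk the images handed to us by the boot loader and ask each
+ * registered linker class to link any image it recognizes.
+ */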
+ modptr = NULL;
+ while ((modptr = preload_search_next_name(modptr)) != NULL) {
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ if (modname == NULL) {
+ printf("Preloaded module at %p does not have a"
+ " name!\n", modptr);
+ continue;
+ }
+ if (modtype == NULL) {
+ printf("Preloaded module at %p does not have a type!\n",
+ modptr);
+ continue;
+ }
+ printf("Preloaded %s \"%s\" at %p.\n", modtype, modname,
+ modptr);
+ lf = NULL;
+ TAILQ_FOREACH(lc, &classes, link) {
+ error = LINKER_LINK_PRELOAD(lc, modname, &lf);
+ if (error) {
+ lf = NULL;
+ break;
+ }
+ }
+ if (lf)
+ TAILQ_INSERT_TAIL(&loaded_files, lf, loaded);
+ }
+
+ /*
+ * First get a list of stuff in the kernel.
+ */
+ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start,
+ &stop, NULL) == 0)
+ linker_addmodules(linker_kernel_file, start, stop, 1);
+
+ /*
+ * This is a once-off, kinky bubble sort used to resolve the
+ * relocation dependency requirements of the preloaded files.
+ */
+restart:
+ TAILQ_FOREACH(lf, &loaded_files, loaded) {
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ /*
+ * First, look to see if we would successfully link with this
+ * stuff.
+ */
+ resolves = 1; /* unless we know otherwise */
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ linker_mdt_depend(lf, mp, &modname, &verinfo);
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = linker_reloc_ptr(lf, *nmdp);
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ linker_mdt_version(lf, nmp, &nmodname,
+ NULL);
+ nmodname = linker_reloc_ptr(lf,
+ nmp->md_cval);
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop) /* it's a self reference */
+ continue;
+
+ /*
+ * ok, the module isn't here yet, we
+ * are not finished
+ */
+ if (modlist_lookup2(modname, verinfo) == NULL)
+ resolves = 0;
+ }
+ }
+ /*
+ * OK, if we found all of our dependencies, we can link. So,
+ * "provide" the modules inside and add the file to the end of
+ * the link order list.
+ */
+ if (resolves) {
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ linker_mdt_version(lf, mp,
+ &modname, &nver);
+ if (modlist_lookup(modname,
+ nver) != NULL) {
+ printf("module %s already"
+ " present!\n", modname);
+ TAILQ_REMOVE(&loaded_files,
+ lf, loaded);
+ linker_file_unload(lf);
+ /* we changed tailq next ptr */
+ goto restart;
+ }
+ modlist_newmodule(modname, nver, lf);
+ }
+ }
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ TAILQ_INSERT_TAIL(&depended_files, lf, loaded);
+ /*
+ * Since we provided modules, we need to restart the
+ * sort so that the previous files that depend on us
+ * have a chance. Also, we've busted the tailq next
+ * pointer with the REMOVE.
+ */
+ goto restart;
+ }
+ }
+
+ /*
+ * At this point, anything still on loaded_files could not have its
+ * dependencies resolved, so unload it. Pull entries off the head of
+ * the list instead of using TAILQ_FOREACH, since linker_file_unload()
+ * frees the entry we would otherwise use to advance.
+ */
+ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
+ printf("KLD file %s is missing dependencies\n", lf->filename);
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ linker_file_unload(lf);
+ }
+
+ /*
+ * We made it. Finish off the linking in the order we determined.
+ */
+ TAILQ_FOREACH(lf, &depended_files, loaded) {
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf,
+ linker_kernel_file);
+ if (error)
+ panic("cannot add dependency");
+ }
+ lf->userrefs++; /* so we can (try to) kldunload it */
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ linker_mdt_depend(lf, mp, &modname, &verinfo);
+ mod = modlist_lookup2(modname, verinfo);
+ mod->container->refs++;
+ error = linker_file_add_dependency(lf,
+ mod->container);
+ if (error)
+ panic("cannot add dependency");
+ }
+ }
+ /*
+ * Now do relocation etc using the symbol search paths
+ * established by the dependencies
+ */
+ error = LINKER_LINK_PRELOAD_FINISH(lf);
+ if (error) {
+ printf("KLD file %s - could not finalize loading\n",
+ lf->filename);
+ linker_file_unload(lf);
+ continue;
+ }
+ linker_file_register_modules(lf);
+ if (linker_file_lookup_set(lf, "sysinit_set", &si_start,
+ &si_stop, NULL) == 0)
+ sysinit_add(si_start, si_stop);
+ linker_file_register_sysctls(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+ }
+ /* woohoo! we made it! */
+}
+
+SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0)
+
+/*
+ * Search for a not-loaded module by name.
+ *
+ * Modules may be found in the following locations:
+ *
+ * - preloaded (result is just the module name)
+ * - on disk (result is the full path to the module)
+ *
+ * If the module name is qualified in any way (contains a path, etc.) then
+ * we simply return a copy of it.
+ *
+ * The search path can be manipulated via sysctl. Note that we use the ';'
+ * character as a separator to be consistent with the bootloader.
+ */
+
+static char linker_hintfile[] = "linker.hints";
+static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules;/modules";
+
+SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
+ sizeof(linker_path), "module load search path");
+
+TUNABLE_STR("module_path", linker_path, sizeof(linker_path));
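+/*
+ * For example, the search path can be changed at run time with something
+ * like "sysctl kern.module_path=/boot/kernel;/boot/modules", or preset by
+ * the boot loader through the "module_path" tunable.
+ */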
+
+static char *linker_ext_list[] = {
+ "",
+ ".ko",
+ NULL
+};
+
+/*
+ * Check whether the file actually exists, either with or without one of the
+ * extensions listed in linker_ext_list. (This should probably be made
+ * generic for the rest of the kernel.)
+ */
+static char *
+linker_lookup_file(const char *path, int pathlen, const char *name,
+ int namelen, struct vattr *vap)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ char *result, **cpp, *sep;
+ int error, len, extlen, reclen, flags;
+ enum vtype type;
+
+ extlen = 0;
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ len = strlen(*cpp);
+ if (len > extlen)
+ extlen = len;
+ }
+ extlen++; /* trailing '\0' */
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+
+ reclen = pathlen + strlen(sep) + namelen + extlen + 1;
+ result = malloc(reclen, M_LINKER, M_WAITOK);
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep,
+ namelen, name, *cpp);
+ /*
+ * Attempt to open the file, and return the path if
+ * we succeed and it's a regular file.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0);
+ if (error == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ type = nd.ni_vp->v_type;
+ if (vap)
+ VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (type == VREG)
+ return (result);
+ }
+ }
+ free(result, M_LINKER);
+ return (NULL);
+}
+
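+/*
+ * Round "ptr" up to the next sizeof(int)-aligned offset from "base"; used
+ * below when parsing the packed records of the linker.hints file.
+ */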
+#define INT_ALIGN(base, ptr) ptr = \
+ (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1))
+
+/*
+ * Look up the KLD which contains the requested module in the "linker.hints"
+ * file. If a version specification is available, try to find the best KLD;
+ * otherwise just find the latest one.
+ *
+ * XXX: Vnode locking here is hosed; lock should be held for calls to
+ * VOP_GETATTR() and vn_rdwr().
+ */
+static char *
+linker_hints_lookup(const char *path, int pathlen, const char *modname,
+ int modnamelen, struct mod_depend *verinfo)
+{
+ struct thread *td = curthread; /* XXX */
+ struct ucred *cred = td ? td->td_ucred : NULL;
+ struct nameidata nd;
+ struct vattr vattr, mattr;
+ u_char *hints = NULL;
+ u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
+ int error, ival, bestver, *intp, reclen, found, flags, clen, blen;
+
+ result = NULL;
+ bestver = found = 0;
+
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+ reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen +
+ strlen(sep) + 1;
+ pathbuf = malloc(reclen, M_LINKER, M_WAITOK);
+ snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
+ linker_hintfile);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ goto bad;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ if (nd.ni_vp->v_type != VREG)
+ goto bad;
+ best = cp = NULL;
+ error = VOP_GETATTR(nd.ni_vp, &vattr, cred, td);
+ if (error)
+ goto bad;
+ /*
+ * XXX: we need to limit this number to some reasonable value
+ */
+ if (vattr.va_size > 100 * 1024) {
+ printf("hints file too large %ld\n", (long)vattr.va_size);
+ goto bad;
+ }
+ hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
+ if (hints == NULL)
+ goto bad;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, &reclen, td);
+ if (error)
+ goto bad;
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ nd.ni_vp = NULL;
+ if (reclen != 0) {
+ printf("can't read %d\n", reclen);
+ goto bad;
+ }
+ intp = (int *)hints;
+ ival = *intp++;
+ if (ival != LINKER_HINTS_VERSION) {
+ printf("hints file version mismatch %d\n", ival);
+ goto bad;
+ }
+ bufend = hints + vattr.va_size;
+ recptr = (u_char *)intp;
+ clen = blen = 0;
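+ /*
+ * Each hints record consists of a record length (int), a record type
+ * (int) and a type-specific payload; for MDT_VERSION records the
+ * payload is a counted module-name string, an int-aligned version
+ * number and a counted KLD file-name string.
+ */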
+ while (recptr < bufend && !found) {
+ intp = (int *)recptr;
+ reclen = *intp++;
+ ival = *intp++;
+ cp = (char *)intp;
+ switch (ival) {
+ case MDT_VERSION:
+ clen = *cp++;
+ if (clen != modnamelen || bcmp(cp, modname, clen) != 0)
+ break;
+ cp += clen;
+ INT_ALIGN(hints, cp);
+ ival = *(int *)cp;
+ cp += sizeof(int);
+ clen = *cp++;
+ if (verinfo == NULL ||
+ ival == verinfo->md_ver_preferred) {
+ found = 1;
+ break;
+ }
+ if (ival >= verinfo->md_ver_minimum &&
+ ival <= verinfo->md_ver_maximum &&
+ ival > bestver) {
+ bestver = ival;
+ best = cp;
+ blen = clen;
+ }
+ break;
+ default:
+ break;
+ }
+ recptr += reclen + sizeof(int);
+ }
+ /*
+ * Finally, check whether the KLD file is actually in place.
+ */
+ if (found)
+ result = linker_lookup_file(path, pathlen, cp, clen, &mattr);
+ else if (best)
+ result = linker_lookup_file(path, pathlen, best, blen, &mattr);
+
+ /*
+ * The KLD is newer than the hints file. What should we do now?
+ */
+ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >))
+ printf("warning: KLD '%s' is newer than the linker.hints"
+ " file\n", result);
+bad:
+ if (hints)
+ free(hints, M_TEMP);
+ if (nd.ni_vp != NULL)
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ /*
+ * If nothing was found, or the hints file is absent, fall back to
+ * the old way of using "kldname[.ko]" as the module name.
+ */
+ if (!found && !bestver && result == NULL)
+ result = linker_lookup_file(path, pathlen, modname,
+ modnamelen, NULL);
+ return (result);
+}
+
+/*
+ * Look up the KLD which contains the requested module, searching all
+ * directories in the linker path.
+ */
+static char *
+linker_search_module(const char *modname, int modnamelen,
+ struct mod_depend *verinfo)
+{
+ char *cp, *ep, *result;
+
+ /*
+ * traverse the linker path
+ */
+ for (cp = linker_path; *cp; cp = ep + 1) {
+ /* find the end of this component */
+ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++);
+ result = linker_hints_lookup(cp, ep - cp, modname,
+ modnamelen, verinfo);
+ if (result != NULL)
+ return (result);
+ if (*ep == 0)
+ break;
+ }
+ return (NULL);
+}
+
+/*
+ * Search for a KLD file by name in all directories listed in linker_path.
+ */
+static char *
+linker_search_kld(const char *name)
+{
+ char *cp, *ep, *result, **cpp;
+ int extlen, len;
+
+ /* qualified at all? */
+ if (index(name, '/'))
+ return (linker_strdup(name));
+
+ extlen = 0;
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ len = strlen(*cpp);
+ if (len > extlen)
+ extlen = len;
+ }
+ extlen++; /* trailing '\0' */
+
+ /* traverse the linker path */
+ len = strlen(name);
+ for (ep = linker_path; *ep; ep++) {
+ cp = ep;
+ /* find the end of this component */
+ for (; *ep != 0 && *ep != ';'; ep++);
+ result = linker_lookup_file(cp, ep - cp, name, len, NULL);
+ if (result != NULL)
+ return (result);
+ }
+ return (NULL);
+}
+
+static const char *
+linker_basename(const char *path)
+{
+ const char *filename;
+
+ filename = rindex(path, '/');
+ if (filename == NULL)
+ return path;
+ if (filename[1])
+ filename++;
+ return (filename);
+}
+
+/*
+ * Find a file which contains the given module and load it; if "parent" is
+ * not NULL, register parent's dependency on the newly loaded file.
+ */
+static int
+linker_load_module(const char *kldname, const char *modname,
+ struct linker_file *parent, struct mod_depend *verinfo,
+ struct linker_file **lfpp)
+{
+ linker_file_t lfdep;
+ const char *filename;
+ char *pathname;
+ int error;
+
+ if (modname == NULL) {
+ /*
+ * We have to load the KLD file itself.
+ */
+ KASSERT(verinfo == NULL, ("linker_load_module: verinfo"
+ " is not NULL"));
+ pathname = linker_search_kld(kldname);
+ } else {
+ if (modlist_lookup2(modname, verinfo) != NULL)
+ return (EEXIST);
+ if (kldname != NULL)
+ pathname = linker_strdup(kldname);
+ else if (rootvnode == NULL)
+ pathname = NULL;
+ else
+ /*
+ * Need to find a KLD with the required module.
+ */
+ pathname = linker_search_module(modname,
+ strlen(modname), verinfo);
+ }
+ if (pathname == NULL)
+ return (ENOENT);
+
+ /*
+ * Can't load more than one file with the same basename. XXX:
+ * Actually it should be possible to have multiple KLDs with
+ * the same basename but different paths, because they can
+ * provide different versions of the same modules.
+ */
+ filename = linker_basename(pathname);
+ if (linker_find_file_by_name(filename)) {
+ error = EEXIST;
+ goto out;
+ }
+ do {
+ error = linker_load_file(pathname, &lfdep);
+ if (error)
+ break;
+ if (modname && verinfo &&
+ modlist_lookup2(modname, verinfo) == NULL) {
+ linker_file_unload(lfdep);
+ error = ENOENT;
+ break;
+ }
+ if (parent) {
+ error = linker_file_add_dependency(parent, lfdep);
+ if (error)
+ break;
+ }
+ if (lfpp)
+ *lfpp = lfdep;
+ } while (0);
+out:
+ if (pathname)
+ free(pathname, M_LINKER);
+ return (error);
+}
+
+/*
+ * This routine is responsible for finding the dependencies of files loaded
+ * via userland-initiated kldload(2) calls.
+ */
+int
+linker_load_dependencies(linker_file_t lf)
+{
+ linker_file_t lfdep;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_metadata *mp, *nmp;
+ struct mod_depend *verinfo;
+ modlist_t mod;
+ const char *modname, *nmodname;
+ int ver, error = 0, count;
+
+ /*
+ * All files are dependent on /kernel.
+ */
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf, linker_kernel_file);
+ if (error)
+ return (error);
+ }
+ if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop,
+ &count) != 0)
+ return (0);
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ linker_mdt_version(lf, mp, &modname, &ver);
+ mod = modlist_lookup(modname, ver);
+ if (mod != NULL) {
+ printf("interface %s.%d already present in the KLD"
+ " '%s'!\n", modname, ver,
+ mod->container->filename);
+ return (EEXIST);
+ }
+ }
+
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = linker_reloc_ptr(lf, *mdp);
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ linker_mdt_depend(lf, mp, &modname, &verinfo);
+ nmodname = NULL;
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = linker_reloc_ptr(lf, *nmdp);
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ nmodname = linker_reloc_ptr(lf, nmp->md_cval);
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop)/* early exit, it's a self reference */
+ continue;
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod) { /* woohoo, it's loaded already */
+ lfdep = mod->container;
+ lfdep->refs++;
+ error = linker_file_add_dependency(lf, lfdep);
+ if (error)
+ break;
+ continue;
+ }
+ error = linker_load_module(NULL, modname, lf, verinfo, NULL);
+ if (error) {
+ printf("KLD %s: depends on %s - not available\n",
+ lf->filename, modname);
+ break;
+ }
+ }
+
+ if (error)
+ return (error);
+ linker_addmodules(lf, start, stop, 0);
+ return (error);
+}
+
+static int
+sysctl_kern_function_list_iterate(const char *name, void *opaque)
+{
+ struct sysctl_req *req;
+
+ req = opaque;
+ return (SYSCTL_OUT(req, name, strlen(name) + 1));
+}
+
+/*
+ * Export a nul-separated, double-nul-terminated list of all function names
+ * in the kernel.
+ */
+static int
+sysctl_kern_function_list(SYSCTL_HANDLER_ARGS)
+{
+ linker_file_t lf;
+ int error;
+
+ mtx_lock(&kld_mtx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ error = LINKER_EACH_FUNCTION_NAME(lf,
+ sysctl_kern_function_list_iterate, req);
+ if (error) {
+ mtx_unlock(&kld_mtx);
+ return (error);
+ }
+ }
+ mtx_unlock(&kld_mtx);
+ return (SYSCTL_OUT(req, "", 1));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLFLAG_RD,
+ NULL, 0, sysctl_kern_function_list, "", "kernel function list");
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
new file mode 100644
index 0000000..5189bb7
--- /dev/null
+++ b/sys/kern/kern_lock.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright (c) 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Copyright (C) 1997
+ * John S. Dyson. All rights reserved.
+ *
+ * This code contains ideas from software contributed to Berkeley by
+ * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating
+ * System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+/*
+ * Locking primitives implementation.
+ * Locks provide shared/exclusive synchronization.
+ */
+
+#define LOCK_WAIT_TIME 100
+#define LOCK_SAMPLE_WAIT 7
+
+#if defined(DIAGNOSTIC)
+#define LOCK_INLINE
+#else
+#define LOCK_INLINE __inline
+#endif
+
+#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \
+ LK_SHARE_NONZERO | LK_WAIT_NONZERO)
+
+/*
+ * Mutex array variables. Rather than each lockmgr lock having its own mutex,
+ * share a fixed (at boot time) number of mutexes across all lockmgr locks in
+ * order to keep sizeof(struct lock) down.
+ */
+int lock_mtx_valid;
+static struct mtx lock_mtx;
+
+static int acquire(struct lock *lkp, int extflags, int wanted);
+static int apause(struct lock *lkp, int flags);
+static int acquiredrain(struct lock *lkp, int extflags);
+
+static void
+lockmgr_init(void *dummy __unused)
+{
+ /*
+ * Initialize the lockmgr protection mutex if it hasn't already been
+ * done. Unless something changes about kernel startup order, VM
+ * initialization will always cause this mutex to already be
+ * initialized in a call to lockinit().
+ */
+ if (lock_mtx_valid == 0) {
+ mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF);
+ lock_mtx_valid = 1;
+ }
+}
+SYSINIT(lmgrinit, SI_SUB_LOCK, SI_ORDER_FIRST, lockmgr_init, NULL)
+
+static LOCK_INLINE void
+sharelock(struct lock *lkp, int incr) {
+ lkp->lk_flags |= LK_SHARE_NONZERO;
+ lkp->lk_sharecount += incr;
+}
+
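+/*
+ * Drop "decr" shared references; when the last one goes away, clear
+ * LK_SHARE_NONZERO and wake up anyone waiting to upgrade or to take the
+ * lock exclusively.
+ */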
+static LOCK_INLINE void
+shareunlock(struct lock *lkp, int decr) {
+
+ KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr"));
+
+ if (lkp->lk_sharecount == decr) {
+ lkp->lk_flags &= ~LK_SHARE_NONZERO;
+ if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) {
+ wakeup(lkp);
+ }
+ lkp->lk_sharecount = 0;
+ } else {
+ lkp->lk_sharecount -= decr;
+ }
+}
+
+/*
+ * This is the waitloop optimization: on SMP, spin briefly (dropping the
+ * interlock while sampling) in the hope that the wanted flags clear before
+ * we commit to sleeping in acquire(). Returns 0 if they cleared, 1 if not.
+ */
+static int
+apause(struct lock *lkp, int flags)
+{
+#ifdef SMP
+ int i, lock_wait;
+#endif
+
+ if ((lkp->lk_flags & flags) == 0)
+ return 0;
+#ifdef SMP
+ for (lock_wait = LOCK_WAIT_TIME; lock_wait > 0; lock_wait--) {
+ mtx_unlock(lkp->lk_interlock);
+ for (i = LOCK_SAMPLE_WAIT; i > 0; i--)
+ if ((lkp->lk_flags & flags) == 0)
+ break;
+ mtx_lock(lkp->lk_interlock);
+ if ((lkp->lk_flags & flags) == 0)
+ return 0;
+ }
+#endif
+ return 1;
+}
+
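+/*
+ * Sleep on the lock until none of the flags in "wanted" remain set,
+ * honoring LK_NOWAIT, LK_SLEEPFAIL and LK_TIMELOCK in "extflags". Called
+ * with the interlock held; msleep() drops and reacquires it while waiting.
+ */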
+static int
+acquire(struct lock *lkp, int extflags, int wanted) {
+ int s, error;
+
+ CTR3(KTR_LOCKMGR,
+ "acquire(): lkp == %p, extflags == 0x%x, wanted == 0x%x\n",
+ lkp, extflags, wanted);
+
+ if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) {
+ return EBUSY;
+ }
+
+ if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) {
+ error = apause(lkp, wanted);
+ if (error == 0)
+ return 0;
+ }
+
+ s = splhigh();
+ while ((lkp->lk_flags & wanted) != 0) {
+ lkp->lk_flags |= LK_WAIT_NONZERO;
+ lkp->lk_waitcount++;
+ error = msleep(lkp, lkp->lk_interlock, lkp->lk_prio,
+ lkp->lk_wmesg,
+ ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0));
+ if (lkp->lk_waitcount == 1) {
+ lkp->lk_flags &= ~LK_WAIT_NONZERO;
+ lkp->lk_waitcount = 0;
+ } else {
+ lkp->lk_waitcount--;
+ }
+ if (error) {
+ splx(s);
+ return error;
+ }
+ if (extflags & LK_SLEEPFAIL) {
+ splx(s);
+ return ENOLCK;
+ }
+ }
+ splx(s);
+ return 0;
+}
+
+/*
+ * Set, change, or release a lock.
+ *
+ * Shared requests increment the shared count. Exclusive requests set the
+ * LK_WANT_EXCL flag (preventing further shared locks), and wait for already
+ * accepted shared locks and shared-to-exclusive upgrades to go away.
+ */
+int
+#ifndef DEBUG_LOCKS
+lockmgr(lkp, flags, interlkp, td)
+#else
+debuglockmgr(lkp, flags, interlkp, td, name, file, line)
+#endif
+ struct lock *lkp;
+ u_int flags;
+ struct mtx *interlkp;
+ struct thread *td;
+#ifdef DEBUG_LOCKS
+ const char *name; /* Name of lock function */
+ const char *file; /* Name of file call is from */
+ int line; /* Line number in file */
+#endif
+{
+ int error;
+ pid_t pid;
+ int extflags, lockflags;
+
+ CTR5(KTR_LOCKMGR,
+ "lockmgr(): lkp == %p (lk_wmesg == \"%s\"), flags == 0x%x, "
+ "interlkp == %p, td == %p", lkp, lkp->lk_wmesg, flags, interlkp, td);
+
+ error = 0;
+ if (td == NULL)
+ pid = LK_KERNPROC;
+ else
+ pid = td->td_proc->p_pid;
+
+ mtx_lock(lkp->lk_interlock);
+ if (flags & LK_INTERLOCK) {
+ mtx_assert(interlkp, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(interlkp);
+ }
+
+ if (panicstr != NULL) {
+ mtx_unlock(lkp->lk_interlock);
+ return (0);
+ }
+
+ extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK;
+
+ switch (flags & LK_TYPE_MASK) {
+
+ case LK_SHARED:
+ /*
+ * If we are not the exclusive lock holder, we have to block
+ * while there is an exclusive lock holder or while an
+ * exclusive lock request or upgrade request is in progress.
+ *
+ * However, if TDF_DEADLKTREAT is set, we override exclusive
+ * lock requests or upgrade requests (but not the exclusive
+ * lock itself).
+ */
+ if (lkp->lk_lockholder != pid) {
+ lockflags = LK_HAVE_EXCL;
+ mtx_lock_spin(&sched_lock);
+ if (td != NULL && !(td->td_flags & TDF_DEADLKTREAT))
+ lockflags |= LK_WANT_EXCL | LK_WANT_UPGRADE;
+ mtx_unlock_spin(&sched_lock);
+ error = acquire(lkp, extflags, lockflags);
+ if (error)
+ break;
+ sharelock(lkp, 1);
+#if defined(DEBUG_LOCKS)
+ lkp->lk_slockholder = pid;
+ lkp->lk_sfilename = file;
+ lkp->lk_slineno = line;
+ lkp->lk_slockername = name;
+#endif
+ break;
+ }
+ /*
+ * We hold an exclusive lock, so downgrade it to shared.
+ * An alternative would be to fail with EDEADLK.
+ */
+ sharelock(lkp, 1);
+ /* fall into downgrade */
+
+ case LK_DOWNGRADE:
+ KASSERT(lkp->lk_lockholder == pid && lkp->lk_exclusivecount != 0,
+ ("lockmgr: not holding exclusive lock "
+ "(owner pid (%d) != pid (%d), exlcnt (%d) != 0",
+ lkp->lk_lockholder, pid, lkp->lk_exclusivecount));
+ sharelock(lkp, lkp->lk_exclusivecount);
+ lkp->lk_exclusivecount = 0;
+ lkp->lk_flags &= ~LK_HAVE_EXCL;
+ lkp->lk_lockholder = LK_NOPROC;
+ if (lkp->lk_waitcount)
+ wakeup((void *)lkp);
+ break;
+
+ case LK_EXCLUPGRADE:
+ /*
+ * If another process is ahead of us to get an upgrade,
+ * then we want to fail rather than have an intervening
+ * exclusive access.
+ */
+ if (lkp->lk_flags & LK_WANT_UPGRADE) {
+ shareunlock(lkp, 1);
+ error = EBUSY;
+ break;
+ }
+ /* fall into normal upgrade */
+
+ case LK_UPGRADE:
+ /*
+ * Upgrade a shared lock to an exclusive one. If another
+ * shared lock has already requested an upgrade to an
+ * exclusive lock, our shared lock is released and an
+ * exclusive lock is requested (which will be granted
+ * after the upgrade). If we return an error, the file
+ * will always be unlocked.
+ */
+ if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0))
+ panic("lockmgr: upgrade exclusive lock");
+ shareunlock(lkp, 1);
+ /*
+ * If we are just polling, check to see if we will block.
+ */
+ if ((extflags & LK_NOWAIT) &&
+ ((lkp->lk_flags & LK_WANT_UPGRADE) ||
+ lkp->lk_sharecount > 1)) {
+ error = EBUSY;
+ break;
+ }
+ if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) {
+ /*
+ * We are first shared lock to request an upgrade, so
+ * request upgrade and wait for the shared count to
+ * drop to zero, then take exclusive lock.
+ */
+ lkp->lk_flags |= LK_WANT_UPGRADE;
+ error = acquire(lkp, extflags, LK_SHARE_NONZERO);
+ lkp->lk_flags &= ~LK_WANT_UPGRADE;
+
+ if (error)
+ break;
+ lkp->lk_flags |= LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+ if (lkp->lk_exclusivecount != 0)
+ panic("lockmgr: non-zero exclusive count");
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ break;
+ }
+ /*
+ * Someone else has requested upgrade. Release our shared
+ * lock, awaken upgrade requestor if we are the last shared
+ * lock, then request an exclusive lock.
+ */
+ if ( (lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) ==
+ LK_WAIT_NONZERO)
+ wakeup((void *)lkp);
+ /* fall into exclusive request */
+
+ case LK_EXCLUSIVE:
+ if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) {
+ /*
+ * Recursive lock.
+ */
+ if ((extflags & (LK_NOWAIT | LK_CANRECURSE)) == 0)
+ panic("lockmgr: locking against myself");
+ if ((extflags & LK_CANRECURSE) != 0) {
+ lkp->lk_exclusivecount++;
+ break;
+ }
+ }
+ /*
+ * If we are just polling, check to see if we will sleep.
+ */
+ if ((extflags & LK_NOWAIT) &&
+ (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) {
+ error = EBUSY;
+ break;
+ }
+ /*
+ * Try to acquire the want_exclusive flag.
+ */
+ error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL));
+ if (error)
+ break;
+ lkp->lk_flags |= LK_WANT_EXCL;
+ /*
+ * Wait for shared locks and upgrades to finish.
+ */
+ error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO);
+ lkp->lk_flags &= ~LK_WANT_EXCL;
+ if (error)
+ break;
+ lkp->lk_flags |= LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+ if (lkp->lk_exclusivecount != 0)
+ panic("lockmgr: non-zero exclusive count");
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ break;
+
+ case LK_RELEASE:
+ if (lkp->lk_exclusivecount != 0) {
+ if (lkp->lk_lockholder != pid &&
+ lkp->lk_lockholder != LK_KERNPROC) {
+ panic("lockmgr: pid %d, not %s %d unlocking",
+ pid, "exclusive lock holder",
+ lkp->lk_lockholder);
+ }
+ if (lkp->lk_exclusivecount == 1) {
+ lkp->lk_flags &= ~LK_HAVE_EXCL;
+ lkp->lk_lockholder = LK_NOPROC;
+ lkp->lk_exclusivecount = 0;
+ } else {
+ lkp->lk_exclusivecount--;
+ }
+ } else if (lkp->lk_flags & LK_SHARE_NONZERO)
+ shareunlock(lkp, 1);
+ if (lkp->lk_flags & LK_WAIT_NONZERO)
+ wakeup((void *)lkp);
+ break;
+
+ case LK_DRAIN:
+ /*
+ * Check that we do not already hold the lock, as it can
+ * never drain if we do. Unfortunately, we have no way to
+ * check for holding a shared lock, but at least we can
+ * check for an exclusive one.
+ */
+ if (lkp->lk_lockholder == pid)
+ panic("lockmgr: draining against myself");
+
+ error = acquiredrain(lkp, extflags);
+ if (error)
+ break;
+ lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ break;
+
+ default:
+ mtx_unlock(lkp->lk_interlock);
+ panic("lockmgr: unknown locktype request %d",
+ flags & LK_TYPE_MASK);
+ /* NOTREACHED */
+ }
+ if ((lkp->lk_flags & LK_WAITDRAIN) &&
+ (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE |
+ LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) {
+ lkp->lk_flags &= ~LK_WAITDRAIN;
+ wakeup((void *)&lkp->lk_flags);
+ }
+ mtx_unlock(lkp->lk_interlock);
+ return (error);
+}
+
+static int
+acquiredrain(struct lock *lkp, int extflags) {
+ int error;
+
+ if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) {
+ return EBUSY;
+ }
+
+ error = apause(lkp, LK_ALL);
+ if (error == 0)
+ return 0;
+
+ while (lkp->lk_flags & LK_ALL) {
+ lkp->lk_flags |= LK_WAITDRAIN;
+ error = msleep(&lkp->lk_flags, lkp->lk_interlock, lkp->lk_prio,
+ lkp->lk_wmesg,
+ ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0));
+ if (error)
+ return error;
+ if (extflags & LK_SLEEPFAIL) {
+ return ENOLCK;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Initialize a lock; required before use.
+ */
+void
+lockinit(lkp, prio, wmesg, timo, flags)
+ struct lock *lkp;
+ int prio;
+ const char *wmesg;
+ int timo;
+ int flags;
+{
+ CTR5(KTR_LOCKMGR, "lockinit(): lkp == %p, prio == %d, wmesg == \"%s\", "
+ "timo == %d, flags = 0x%x\n", lkp, prio, wmesg, timo, flags);
+
+ if (lock_mtx_valid == 0) {
+ mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF);
+ lock_mtx_valid = 1;
+ }
+ /*
+ * XXX cleanup - make sure mtxpool is always initialized before
+ * this is ever called.
+ */
+ if (mtx_pool_valid) {
+ mtx_lock(&lock_mtx);
+ lkp->lk_interlock = mtx_pool_alloc();
+ mtx_unlock(&lock_mtx);
+ } else {
+ lkp->lk_interlock = &lock_mtx;
+ }
+ lkp->lk_flags = (flags & LK_EXTFLG_MASK);
+ lkp->lk_sharecount = 0;
+ lkp->lk_waitcount = 0;
+ lkp->lk_exclusivecount = 0;
+ lkp->lk_prio = prio;
+ lkp->lk_wmesg = wmesg;
+ lkp->lk_timo = timo;
+ lkp->lk_lockholder = LK_NOPROC;
+}
+
+/*
+ * Destroy a lock.
+ */
+void
+lockdestroy(lkp)
+ struct lock *lkp;
+{
+ CTR2(KTR_LOCKMGR, "lockdestroy(): lkp == %p (lk_wmesg == \"%s\")",
+ lkp, lkp->lk_wmesg);
+}
+
+/*
+ * Determine the status of a lock.
+ */
+int
+lockstatus(lkp, td)
+ struct lock *lkp;
+ struct thread *td;
+{
+ int lock_type = 0;
+
+ mtx_lock(lkp->lk_interlock);
+ if (lkp->lk_exclusivecount != 0) {
+ if (td == NULL || lkp->lk_lockholder == td->td_proc->p_pid)
+ lock_type = LK_EXCLUSIVE;
+ else
+ lock_type = LK_EXCLOTHER;
+ } else if (lkp->lk_sharecount != 0)
+ lock_type = LK_SHARED;
+ mtx_unlock(lkp->lk_interlock);
+ return (lock_type);
+}
+
+/*
+ * Determine the number of holders of a lock.
+ */
+int
+lockcount(lkp)
+ struct lock *lkp;
+{
+ int count;
+
+ mtx_lock(lkp->lk_interlock);
+ count = lkp->lk_exclusivecount + lkp->lk_sharecount;
+ mtx_unlock(lkp->lk_interlock);
+ return (count);
+}
+
+/*
+ * Print out information about state of a lock. Used by VOP_PRINT
+ * routines to display status about contained locks.
+ */
+void
+lockmgr_printinfo(lkp)
+ struct lock *lkp;
+{
+
+ if (lkp->lk_sharecount)
+ printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg,
+ lkp->lk_sharecount);
+ else if (lkp->lk_flags & LK_HAVE_EXCL)
+ printf(" lock type %s: EXCL (count %d) by pid %d",
+ lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder);
+ if (lkp->lk_waitcount > 0)
+ printf(" with %d pending", lkp->lk_waitcount);
+}
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
new file mode 100644
index 0000000..c1cadb1
--- /dev/null
+++ b/sys/kern/kern_lockf.c
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Scooter Morris at Genentech Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
+ * $FreeBSD$
+ */
+
+#include "opt_debug_lockf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/lockf.h>
+
+#include <machine/limits.h>
+
+/*
+ * This variable controls the maximum number of processes that will
+ * be checked in doing deadlock detection.
+ */
+static int maxlockdepth = MAXDEPTH;
+
+#ifdef LOCKF_DEBUG
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+
+
+static int lockf_debug = 0;
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+#endif
+
+MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
+
+#define NOLOCKF (struct lockf *)0
+#define SELF 0x1
+#define OTHERS 0x2
+static int lf_clearlock(struct lockf *);
+static int lf_findoverlap(struct lockf *,
+ struct lockf *, int, struct lockf ***, struct lockf **);
+static struct lockf *
+ lf_getblock(struct lockf *);
+static int lf_getlock(struct lockf *, struct flock *);
+static int lf_setlock(struct lockf *);
+static void lf_split(struct lockf *, struct lockf *);
+static void lf_wakelock(struct lockf *);
+
+/*
+ * Advisory record locking support
+ */
+int
+lf_advlock(ap, head, size)
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ caddr_t a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap;
+ struct lockf **head;
+ u_quad_t size;
+{
+ register struct flock *fl = ap->a_fl;
+ register struct lockf *lock;
+ off_t start, end, oadd;
+ int error;
+
+ /*
+ * Convert the flock structure into a start and end.
+ */
+ switch (fl->l_whence) {
+
+ case SEEK_SET:
+ case SEEK_CUR:
+ /*
+ * Caller is responsible for adding any necessary offset
+ * when SEEK_CUR is used.
+ */
+ start = fl->l_start;
+ break;
+
+ case SEEK_END:
+ if (size > OFF_MAX ||
+ (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
+ return (EOVERFLOW);
+ start = size + fl->l_start;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (start < 0)
+ return (EINVAL);
+ if (fl->l_len < 0) {
+ if (start == 0)
+ return (EINVAL);
+ end = start - 1;
+ start += fl->l_len;
+ if (start < 0)
+ return (EINVAL);
+ } else if (fl->l_len == 0)
+ end = -1;
+ else {
+ oadd = fl->l_len - 1;
+ if (oadd > OFF_MAX - start)
+ return (EOVERFLOW);
+ end = start + oadd;
+ }
+ /*
+ * Avoid the common case of unlocking when inode has no locks.
+ */
+ if (*head == (struct lockf *)0) {
+ if (ap->a_op != F_SETLK) {
+ fl->l_type = F_UNLCK;
+ return (0);
+ }
+ }
+ /*
+ * Create the lockf structure
+ */
+ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+ lock->lf_start = start;
+ lock->lf_end = end;
+ lock->lf_id = ap->a_id;
+ /*
+ * XXX The problem is that VTOI is ufs specific, so it will
+ * break LOCKF_DEBUG for all filesystems other than UFS because
+ * it casts the vnode->data ptr to struct inode *.
+ */
+/* lock->lf_inode = VTOI(ap->a_vp); */
+ lock->lf_inode = (struct inode *)0;
+ lock->lf_type = fl->l_type;
+ lock->lf_head = head;
+ lock->lf_next = (struct lockf *)0;
+ TAILQ_INIT(&lock->lf_blkhd);
+ lock->lf_flags = ap->a_flags;
+ /*
+ * Do the requested operation.
+ */
+ switch(ap->a_op) {
+ case F_SETLK:
+ return (lf_setlock(lock));
+
+ case F_UNLCK:
+ error = lf_clearlock(lock);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ case F_GETLK:
+ error = lf_getlock(lock, fl);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ default:
+ free(lock, M_LOCKF);
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Set a byte-range lock.
+ */
+static int
+lf_setlock(lock)
+ register struct lockf *lock;
+{
+ register struct lockf *block;
+ struct lockf **head = lock->lf_head;
+ struct lockf **prev, *overlap, *ltmp;
+ static char lockstr[] = "lockf";
+ int ovcase, priority, needtolink, error;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ /*
+ * Set the priority
+ */
+ priority = PLOCK;
+ if (lock->lf_type == F_WRLCK)
+ priority += 4;
+ priority |= PCATCH;
+ /*
+ * Scan lock list for this file looking for locks that would block us.
+ */
+ while ((block = lf_getblock(lock))) {
+ /*
+ * Free the structure and return if nonblocking.
+ */
+ if ((lock->lf_flags & F_WAIT) == 0) {
+ FREE(lock, M_LOCKF);
+ return (EAGAIN);
+ }
+ /*
+ * We are blocked. Since flock style locks cover
+ * the whole file, there is no chance for deadlock.
+ * For byte-range locks we must check for deadlock.
+ *
+ * Deadlock detection is done by looking through the
+ * wait channels to see if there are any cycles that
+ * involve us. MAXDEPTH is set just to make sure we
+ * do not go off into neverland.
+ */
+ if ((lock->lf_flags & F_POSIX) &&
+ (block->lf_flags & F_POSIX)) {
+ register struct proc *wproc;
+ struct thread *td;
+ register struct lockf *waitblock;
+ int i = 0;
+
+ /* The block is waiting on something */
+ /* XXXKSE this is not complete under threads */
+ wproc = (struct proc *)block->lf_id;
+ mtx_lock_spin(&sched_lock);
+ FOREACH_THREAD_IN_PROC(wproc, td) {
+ while (td->td_wchan &&
+ (td->td_wmesg == lockstr) &&
+ (i++ < maxlockdepth)) {
+ waitblock = (struct lockf *)td->td_wchan;
+ /* Get the owner of the blocking lock */
+ waitblock = waitblock->lf_next;
+ if ((waitblock->lf_flags & F_POSIX) == 0)
+ break;
+ wproc = (struct proc *)waitblock->lf_id;
+ if (wproc == (struct proc *)lock->lf_id) {
+ mtx_unlock_spin(&sched_lock);
+ free(lock, M_LOCKF);
+ return (EDEADLK);
+ }
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+ }
+ /*
+ * For flock type locks, we must first remove
+ * any shared locks that we hold before we sleep
+ * waiting for an exclusive lock.
+ */
+ if ((lock->lf_flags & F_FLOCK) &&
+ lock->lf_type == F_WRLCK) {
+ lock->lf_type = F_UNLCK;
+ (void) lf_clearlock(lock);
+ lock->lf_type = F_WRLCK;
+ }
+ /*
+ * Add our lock to the blocked list and sleep until we're free.
+ * Remember who blocked us (for deadlock detection).
+ */
+ lock->lf_next = block;
+ TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: blocking on", block);
+ lf_printlist("lf_setlock", block);
+ }
+#endif /* LOCKF_DEBUG */
+ error = tsleep(lock, priority, lockstr, 0);
+ /*
+ * We may have been awakened by a signal and/or by a
+ * debugger continuing us (in which cases we must remove
+ * ourselves from the blocked list) and/or by another
+ * process releasing a lock (in which case we have
+ * already been removed from the blocked list and our
+ * lf_next field set to NOLOCKF).
+ */
+ if (lock->lf_next) {
+ TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
+ lock->lf_next = NOLOCKF;
+ }
+ if (error) {
+ free(lock, M_LOCKF);
+ return (error);
+ }
+ }
+ /*
+ * No blocks!! Add the lock. Note that we will
+ * downgrade or upgrade any overlapping locks this
+ * process already owns.
+ *
+ * Skip over locks owned by other processes.
+ * Handle any locks that overlap and are owned by ourselves.
+ */
+ prev = head;
+ block = *head;
+ needtolink = 1;
+ for (;;) {
+ ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
+ if (ovcase)
+ block = overlap->lf_next;
+ /*
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ switch (ovcase) {
+ case 0: /* no overlap */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ break;
+
+ case 1: /* overlap == lock */
+ /*
+ * If downgrading lock, others may be
+ * able to acquire it.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK)
+ lf_wakelock(overlap);
+ overlap->lf_type = lock->lf_type;
+ FREE(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+
+ case 2: /* overlap contains lock */
+ /*
+ * Check for common starting point and different types.
+ */
+ if (overlap->lf_type == lock->lf_type) {
+ free(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+ }
+ if (overlap->lf_start == lock->lf_start) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ overlap->lf_start = lock->lf_end + 1;
+ } else
+ lf_split(overlap, lock);
+ lf_wakelock(overlap);
+ break;
+
+ case 3: /* lock contains overlap */
+ /*
+ * If downgrading lock, others may be able to
+ * acquire it, otherwise take the list.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK) {
+ lf_wakelock(overlap);
+ } else {
+ while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
+ ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
+ TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
+ lf_block);
+ TAILQ_INSERT_TAIL(&lock->lf_blkhd,
+ ltmp, lf_block);
+ ltmp->lf_next = lock;
+ }
+ }
+ /*
+ * Add the new lock if necessary and delete the overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap->lf_next;
+ prev = &lock->lf_next;
+ needtolink = 0;
+ } else
+ *prev = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ /*
+ * Add lock after overlap on the list.
+ */
+ lock->lf_next = overlap->lf_next;
+ overlap->lf_next = lock;
+ overlap->lf_end = lock->lf_start - 1;
+ prev = &lock->lf_next;
+ lf_wakelock(overlap);
+ needtolink = 0;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ /*
+ * Add the new lock before overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ overlap->lf_start = lock->lf_end + 1;
+ lf_wakelock(overlap);
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: got the lock", lock);
+ lf_printlist("lf_setlock", lock);
+ }
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
+
+/*
+ * Remove a byte-range lock on an inode.
+ *
+ * Generally, find the lock (or an overlap to that lock)
+ * and remove it (or shrink it), then wakeup anyone we can.
+ */
+static int
+lf_clearlock(unlock)
+ register struct lockf *unlock;
+{
+ struct lockf **head = unlock->lf_head;
+ register struct lockf *lf = *head;
+ struct lockf *overlap, **prev;
+ int ovcase;
+
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (unlock->lf_type != F_UNLCK)
+ panic("lf_clearlock: bad type");
+ if (lockf_debug & 1)
+ lf_print("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ prev = head;
+ while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) {
+ /*
+ * Wakeup the list of locks to be retried.
+ */
+ lf_wakelock(overlap);
+
+ switch (ovcase) {
+
+ case 1: /* overlap == lock */
+ *prev = overlap->lf_next;
+ FREE(overlap, M_LOCKF);
+ break;
+
+ case 2: /* overlap contains lock: split it */
+ if (overlap->lf_start == unlock->lf_start) {
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ lf_split(overlap, unlock);
+ overlap->lf_next = unlock->lf_next;
+ break;
+
+ case 3: /* lock contains overlap */
+ *prev = overlap->lf_next;
+ lf = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ overlap->lf_end = unlock->lf_start - 1;
+ prev = &overlap->lf_next;
+ lf = overlap->lf_next;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_printlist("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
+
+/*
+ * Check whether there is a blocking lock,
+ * and if so return its process identifier.
+ */
+static int
+lf_getlock(lock, fl)
+ register struct lockf *lock;
+ register struct flock *fl;
+{
+ register struct lockf *block;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_getlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ if ((block = lf_getblock(lock))) {
+ fl->l_type = block->lf_type;
+ fl->l_whence = SEEK_SET;
+ fl->l_start = block->lf_start;
+ if (block->lf_end == -1)
+ fl->l_len = 0;
+ else
+ fl->l_len = block->lf_end - block->lf_start + 1;
+ if (block->lf_flags & F_POSIX)
+ fl->l_pid = ((struct proc *)(block->lf_id))->p_pid;
+ else
+ fl->l_pid = -1;
+ } else {
+ fl->l_type = F_UNLCK;
+ }
+ return (0);
+}
+
+/*
+ * Walk the list of locks for an inode and
+ * return the first blocking lock.
+ */
+static struct lockf *
+lf_getblock(lock)
+ register struct lockf *lock;
+{
+ struct lockf **prev, *overlap, *lf = *(lock->lf_head);
+ int ovcase;
+
+ prev = lock->lf_head;
+ while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) {
+ /*
+ * We've found an overlap, see if it blocks us
+ */
+ if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
+ return (overlap);
+ /*
+ * Nope, point to the next one on the list and
+ * see if it blocks us
+ */
+ lf = overlap->lf_next;
+ }
+ return (NOLOCKF);
+}
+
+/*
+ * Walk the list of locks for an inode to
+ * find an overlapping lock (if any).
+ *
+ * NOTE: this returns only the FIRST overlapping lock. There
+ * may be more than one.
+ */
+static int
+lf_findoverlap(lf, lock, type, prev, overlap)
+ register struct lockf *lf;
+ struct lockf *lock;
+ int type;
+ struct lockf ***prev;
+ struct lockf **overlap;
+{
+ off_t start, end;
+
+ *overlap = lf;
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_findoverlap: looking for overlap in", lock);
+#endif /* LOCKF_DEBUG */
+ start = lock->lf_start;
+ end = lock->lf_end;
+ while (lf != NOLOCKF) {
+ if (((type & SELF) && lf->lf_id != lock->lf_id) ||
+ ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("\tchecking", lf);
+#endif /* LOCKF_DEBUG */
+ /*
+ * OK, check for overlap
+ *
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ if ((lf->lf_end != -1 && start > lf->lf_end) ||
+ (end != -1 && lf->lf_start > end)) {
+ /* Case 0 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("no overlap\n");
+#endif /* LOCKF_DEBUG */
+ if ((type & SELF) && end != -1 && lf->lf_start > end)
+ return (0);
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+ if ((lf->lf_start == start) && (lf->lf_end == end)) {
+ /* Case 1 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap == lock\n");
+#endif /* LOCKF_DEBUG */
+ return (1);
+ }
+ if ((lf->lf_start <= start) &&
+ (end != -1) &&
+ ((lf->lf_end >= end) || (lf->lf_end == -1))) {
+ /* Case 2 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap contains lock\n");
+#endif /* LOCKF_DEBUG */
+ return (2);
+ }
+ if (start <= lf->lf_start &&
+ (end == -1 ||
+ (lf->lf_end != -1 && end >= lf->lf_end))) {
+ /* Case 3 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("lock contains overlap\n");
+#endif /* LOCKF_DEBUG */
+ return (3);
+ }
+ if ((lf->lf_start < start) &&
+ ((lf->lf_end >= start) || (lf->lf_end == -1))) {
+ /* Case 4 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap starts before lock\n");
+#endif /* LOCKF_DEBUG */
+ return (4);
+ }
+ if ((lf->lf_start > start) &&
+ (end != -1) &&
+ ((lf->lf_end > end) || (lf->lf_end == -1))) {
+ /* Case 5 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap ends after lock\n");
+#endif /* LOCKF_DEBUG */
+ return (5);
+ }
+ panic("lf_findoverlap: default");
+ }
+ return (0);
+}
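
The six overlap cases enumerated in the comments above boil down to interval comparisons on (lf_start, lf_end) pairs, with -1 standing for "extends to end of file". As a sanity aid, here is a standalone userland sketch that restates the same tests; the helper name classify_overlap and the sample ranges are invented for illustration and are not part of this file.

#include <stdio.h>

/* Interval classification mirroring lf_findoverlap(); -1 means "to EOF". */
static int
classify_overlap(long lf_start, long lf_end, long start, long end)
{
	if ((lf_end != -1 && start > lf_end) ||
	    (end != -1 && lf_start > end))
		return (0);		/* no overlap */
	if (lf_start == start && lf_end == end)
		return (1);		/* overlap == lock */
	if (lf_start <= start && end != -1 &&
	    (lf_end >= end || lf_end == -1))
		return (2);		/* overlap contains lock */
	if (start <= lf_start &&
	    (end == -1 || (lf_end != -1 && end >= lf_end)))
		return (3);		/* lock contains overlap */
	if (lf_start < start &&
	    (lf_end >= start || lf_end == -1))
		return (4);		/* overlap starts before lock */
	if (lf_start > start && end != -1 &&
	    (lf_end > end || lf_end == -1))
		return (5);		/* overlap ends after lock */
	return (-1);			/* unreachable for well-formed ranges */
}

int
main(void)
{
	/* Existing lock on bytes 100-199, new request on 150-299: case 4. */
	printf("%d\n", classify_overlap(100, 199, 150, 299));
	/* Existing lock on 100-EOF, new request on 150-199: case 2. */
	printf("%d\n", classify_overlap(100, -1, 150, 199));
	return (0);
}
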
+
+/*
+ * Split a lock and a contained region into
+ * two or three locks as necessary.
+ */
+static void
+lf_split(lock1, lock2)
+ register struct lockf *lock1;
+ register struct lockf *lock2;
+{
+ register struct lockf *splitlock;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2) {
+ lf_print("lf_split", lock1);
+ lf_print("splitting from", lock2);
+ }
+#endif /* LOCKF_DEBUG */
+ /*
+	 * Check to see if splitting into only two pieces.
+ */
+ if (lock1->lf_start == lock2->lf_start) {
+ lock1->lf_start = lock2->lf_end + 1;
+ lock2->lf_next = lock1;
+ return;
+ }
+ if (lock1->lf_end == lock2->lf_end) {
+ lock1->lf_end = lock2->lf_start - 1;
+ lock2->lf_next = lock1->lf_next;
+ lock1->lf_next = lock2;
+ return;
+ }
+ /*
+ * Make a new lock consisting of the last part of
+ * the encompassing lock
+ */
+ MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+ bcopy(lock1, splitlock, sizeof *splitlock);
+ splitlock->lf_start = lock2->lf_end + 1;
+ TAILQ_INIT(&splitlock->lf_blkhd);
+ lock1->lf_end = lock2->lf_start - 1;
+ /*
+ * OK, now link it in
+ */
+ splitlock->lf_next = lock1->lf_next;
+ lock2->lf_next = splitlock;
+ lock1->lf_next = lock2;
+}
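
When the contained region touches neither end of the enclosing lock, lf_split() above produces three pieces: the front of the old lock, the middle (the new lock), and a freshly allocated tail. A minimal userland sketch of just the boundary arithmetic, with list linkage omitted and an invented struct range standing in for struct lockf:

#include <stdio.h>

struct range { long start, end; };

int
main(void)
{
	struct range lock1 = { 0, 999 };	/* existing lock */
	struct range lock2 = { 300, 599 };	/* contained region */
	struct range tail;

	/* Tail piece: everything after the contained region. */
	tail.start = lock2.end + 1;
	tail.end = lock1.end;
	/* Front piece: everything before the contained region. */
	lock1.end = lock2.start - 1;

	/* Prints: front 0-299, middle 300-599, tail 600-999 */
	printf("front %ld-%ld, middle %ld-%ld, tail %ld-%ld\n",
	    lock1.start, lock1.end, lock2.start, lock2.end,
	    tail.start, tail.end);
	return (0);
}
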
+
+/*
+ * Wakeup a blocklist
+ */
+static void
+lf_wakelock(listhead)
+ struct lockf *listhead;
+{
+ register struct lockf *wakelock;
+
+ while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
+ wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
+ TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
+ wakelock->lf_next = NOLOCKF;
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_wakelock: awakening", wakelock);
+#endif /* LOCKF_DEBUG */
+ wakeup(wakelock);
+ }
+}
+
+#ifdef LOCKF_DEBUG
+/*
+ * Print out a lock.
+ */
+void
+lf_print(tag, lock)
+ char *tag;
+ register struct lockf *lock;
+{
+
+ printf("%s: lock %p for ", tag, (void *)lock);
+ if (lock->lf_flags & F_POSIX)
+ printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)lock->lf_id);
+ if (lock->lf_inode != (struct inode *)0)
+ /* XXX no %qd in kernel. Truncate. */
+ printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld",
+ (u_long)lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev),
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)lock->lf_start, (long)lock->lf_end);
+ else
+ printf(" %s, start %ld, end %ld",
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)lock->lf_start, (long)lock->lf_end);
+ if (!TAILQ_EMPTY(&lock->lf_blkhd))
+ printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
+ else
+ printf("\n");
+}
+
+void
+lf_printlist(tag, lock)
+ char *tag;
+ struct lockf *lock;
+{
+ register struct lockf *lf, *blk;
+
+ if (lock->lf_inode == (struct inode *)0)
+ return;
+
+ printf("%s: Lock list for ino %lu on dev <%d, %d>:\n",
+ tag, (u_long)lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev));
+ for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) {
+ printf("\tlock %p for ",(void *)lf);
+ if (lf->lf_flags & F_POSIX)
+ printf("proc %ld",
+ (long)((struct proc *)lf->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)lf->lf_id);
+ /* XXX no %qd in kernel. Truncate. */
+ printf(", %s, start %ld, end %ld",
+ lf->lf_type == F_RDLCK ? "shared" :
+ lf->lf_type == F_WRLCK ? "exclusive" :
+ lf->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)lf->lf_start, (long)lf->lf_end);
+ TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
+ printf("\n\t\tlock request %p for ", (void *)blk);
+ if (blk->lf_flags & F_POSIX)
+ printf("proc %ld",
+ (long)((struct proc *)blk->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)blk->lf_id);
+ /* XXX no %qd in kernel. Truncate. */
+ printf(", %s, start %ld, end %ld",
+ blk->lf_type == F_RDLCK ? "shared" :
+ blk->lf_type == F_WRLCK ? "exclusive" :
+ blk->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)blk->lf_start,
+ (long)blk->lf_end);
+ if (!TAILQ_EMPTY(&blk->lf_blkhd))
+ panic("lf_printlist: bad list");
+ }
+ printf("\n");
+ }
+}
+#endif /* LOCKF_DEBUG */
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
new file mode 100644
index 0000000..c7bec3e
--- /dev/null
+++ b/sys/kern/kern_malloc.c
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <vm/uma_dbg.h>
+
+#if defined(INVARIANTS) && defined(__i386__)
+#include <machine/cpu.h>
+#endif
+
+/*
+ * When realloc() is called, if the new size is sufficiently smaller than
+ * the old size, realloc() will allocate a new, smaller block to avoid
+ * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
+ * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
+ */
+#ifndef REALLOC_FRACTION
+#define REALLOC_FRACTION 1 /* new block if <= half the size */
+#endif
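
Worked example of the threshold above, assuming the default REALLOC_FRACTION of 1 and, purely for illustration, a MINALLOCSIZE of 16: shrinking a 1024-byte block to 512 bytes or less gets a new allocation, while 513 to 1024 bytes reuses the old block. The sketch below mirrors the reuse test that appears later in realloc():

#include <stdio.h>

#define REALLOC_FRACTION	1	/* default from the file above */
#define MINALLOCSIZE		16	/* assumed value, illustration only */

/* Mirror of the reuse test in realloc(): nonzero means keep the old block. */
static int
reuse_block(unsigned long newsize, unsigned long alloc)
{
	return (newsize <= alloc &&
	    (newsize > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE));
}

int
main(void)
{
	printf("%d\n", reuse_block(513, 1024));	/* 1: keep the old block */
	printf("%d\n", reuse_block(512, 1024));	/* 0: allocate a smaller one */
	printf("%d\n", reuse_block(8, 16));	/* 1: already at minimum size */
	return (0);
}
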
+
+MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
+MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
+MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
+
+MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
+MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
+
+static void kmeminit(void *);
+SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)
+
+static MALLOC_DEFINE(M_FREE, "free", "should be on free list");
+
+static struct malloc_type *kmemstatistics;
+static char *kmembase;
+static char *kmemlimit;
+
+#define KMEM_ZSHIFT 4
+#define KMEM_ZBASE 16
+#define KMEM_ZMASK (KMEM_ZBASE - 1)
+
+#define KMEM_ZMAX 65536
+#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT)
+static u_int8_t kmemsize[KMEM_ZSIZE + 1];
+
+/* These won't be powers of two for long */
+struct {
+ int kz_size;
+ char *kz_name;
+ uma_zone_t kz_zone;
+} kmemzones[] = {
+ {16, "16", NULL},
+ {32, "32", NULL},
+ {64, "64", NULL},
+ {128, "128", NULL},
+ {256, "256", NULL},
+ {512, "512", NULL},
+ {1024, "1024", NULL},
+ {2048, "2048", NULL},
+ {4096, "4096", NULL},
+ {8192, "8192", NULL},
+ {16384, "16384", NULL},
+ {32768, "32768", NULL},
+ {65536, "65536", NULL},
+ {0, NULL},
+};
+
+u_int vm_kmem_size;
+
+/*
+ * The malloc_mtx protects the kmemstatistics linked list as well as the
+ * mallochash.
+ */
+
+struct mtx malloc_mtx;
+
+#ifdef MALLOC_PROFILE
+uint64_t krequests[KMEM_ZSIZE + 1];
+
+static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
+#endif
+
+static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS);
+
+/*
+ * malloc:
+ *
+ * Allocate a block of memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ */
+void *
+malloc(size, type, flags)
+ unsigned long size;
+ struct malloc_type *type;
+ int flags;
+{
+ int indx;
+ caddr_t va;
+ uma_zone_t zone;
+ register struct malloc_type *ksp = type;
+
+#if 0
+ if (size == 0)
+ Debugger("zero size malloc");
+#endif
+ if (!(flags & M_NOWAIT))
+ KASSERT(curthread->td_intr_nesting_level == 0,
+ ("malloc(M_WAITOK) in interrupt context"));
+ if (size <= KMEM_ZMAX) {
+ if (size & KMEM_ZMASK)
+ size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
+ indx = kmemsize[size >> KMEM_ZSHIFT];
+ zone = kmemzones[indx].kz_zone;
+#ifdef MALLOC_PROFILE
+ krequests[size >> KMEM_ZSHIFT]++;
+#endif
+ va = uma_zalloc(zone, flags);
+ mtx_lock(&ksp->ks_mtx);
+ if (va == NULL)
+ goto out;
+
+ ksp->ks_size |= 1 << indx;
+ size = zone->uz_size;
+ } else {
+ size = roundup(size, PAGE_SIZE);
+ zone = NULL;
+ va = uma_large_malloc(size, flags);
+ mtx_lock(&ksp->ks_mtx);
+ if (va == NULL)
+ goto out;
+ }
+ ksp->ks_memuse += size;
+ ksp->ks_inuse++;
+out:
+ ksp->ks_calls++;
+ if (ksp->ks_memuse > ksp->ks_maxused)
+ ksp->ks_maxused = ksp->ks_memuse;
+
+ mtx_unlock(&ksp->ks_mtx);
+ return ((void *) va);
+}
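
For requests of KMEM_ZMAX bytes or less, malloc() above rounds the size up to a multiple of KMEM_ZBASE and indexes the kmemsize[] table (filled in by kmeminit() further down) to choose a zone. A userland sketch of just that mapping, using the same constants and the zone sizes from kmemzones[]; everything outside the table build and the lookup is omitted:

#include <stdio.h>

#define KMEM_ZSHIFT	4
#define KMEM_ZBASE	16
#define KMEM_ZMASK	(KMEM_ZBASE - 1)
#define KMEM_ZMAX	65536
#define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)

static const int zsize[] = { 16, 32, 64, 128, 256, 512, 1024, 2048,
    4096, 8192, 16384, 32768, 65536 };
static unsigned char kmemsize[KMEM_ZSIZE + 1];

int
main(void)
{
	unsigned long size;
	int i, indx;

	/* Build the size -> zone-index table, as kmeminit() does. */
	for (i = 0, indx = 0;
	    indx < (int)(sizeof(zsize) / sizeof(zsize[0])); indx++)
		for (; i <= zsize[indx]; i += KMEM_ZBASE)
			kmemsize[i >> KMEM_ZSHIFT] = (unsigned char)indx;

	/* Round and look up a 100-byte request, as malloc() does. */
	size = 100;
	if (size & KMEM_ZMASK)
		size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;	/* 100 -> 112 */
	indx = kmemsize[size >> KMEM_ZSHIFT];
	printf("request 100 -> rounded %lu -> %d-byte zone\n",
	    size, zsize[indx]);	/* 128-byte zone */
	return (0);
}
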
+
+/*
+ * free:
+ *
+ * Free a block of memory allocated by malloc.
+ *
+ * This routine may not block.
+ */
+void
+free(addr, type)
+ void *addr;
+ struct malloc_type *type;
+{
+ uma_slab_t slab;
+ void *mem;
+ u_long size;
+ register struct malloc_type *ksp = type;
+
+ /* free(NULL, ...) does nothing */
+ if (addr == NULL)
+ return;
+
+ size = 0;
+
+ mem = (void *)((u_long)addr & (~UMA_SLAB_MASK));
+ mtx_lock(&malloc_mtx);
+ slab = hash_sfind(mallochash, mem);
+ mtx_unlock(&malloc_mtx);
+
+ if (slab == NULL)
+ panic("free: address %p(%p) has not been allocated.\n",
+ addr, mem);
+
+ if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
+#ifdef INVARIANTS
+ struct malloc_type **mtp = addr;
+#endif
+ size = slab->us_zone->uz_size;
+#ifdef INVARIANTS
+ /*
+ * Cache a pointer to the malloc_type that most recently freed
+ * this memory here. This way we know who is most likely to
+ * have stepped on it later.
+ *
+ * This code assumes that size is a multiple of 8 bytes for
+ * 64 bit machines
+ */
+ mtp = (struct malloc_type **)
+ ((unsigned long)mtp & ~UMA_ALIGN_PTR);
+ mtp += (size - sizeof(struct malloc_type *)) /
+ sizeof(struct malloc_type *);
+ *mtp = type;
+#endif
+ uma_zfree_arg(slab->us_zone, addr, slab);
+ } else {
+ size = slab->us_size;
+ uma_large_free(slab);
+ }
+ mtx_lock(&ksp->ks_mtx);
+ ksp->ks_memuse -= size;
+ ksp->ks_inuse--;
+ mtx_unlock(&ksp->ks_mtx);
+}
+
+/*
+ * realloc: change the size of a memory block
+ */
+void *
+realloc(addr, size, type, flags)
+ void *addr;
+ unsigned long size;
+ struct malloc_type *type;
+ int flags;
+{
+ uma_slab_t slab;
+ unsigned long alloc;
+ void *newaddr;
+
+ /* realloc(NULL, ...) is equivalent to malloc(...) */
+ if (addr == NULL)
+ return (malloc(size, type, flags));
+
+ mtx_lock(&malloc_mtx);
+ slab = hash_sfind(mallochash,
+ (void *)((u_long)addr & ~(UMA_SLAB_MASK)));
+ mtx_unlock(&malloc_mtx);
+
+ /* Sanity check */
+ KASSERT(slab != NULL,
+ ("realloc: address %p out of range", (void *)addr));
+
+ /* Get the size of the original block */
+ if (slab->us_zone)
+ alloc = slab->us_zone->uz_size;
+ else
+ alloc = slab->us_size;
+
+ /* Reuse the original block if appropriate */
+ if (size <= alloc
+ && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
+ return (addr);
+
+ /* Allocate a new, bigger (or smaller) block */
+ if ((newaddr = malloc(size, type, flags)) == NULL)
+ return (NULL);
+
+ /* Copy over original contents */
+ bcopy(addr, newaddr, min(size, alloc));
+ free(addr, type);
+ return (newaddr);
+}
+
+/*
+ * reallocf: same as realloc() but free memory on failure.
+ */
+void *
+reallocf(addr, size, type, flags)
+ void *addr;
+ unsigned long size;
+ struct malloc_type *type;
+ int flags;
+{
+ void *mem;
+
+ if ((mem = realloc(addr, size, type, flags)) == NULL)
+ free(addr, type);
+ return (mem);
+}
+
+/*
+ * Initialize the kernel memory allocator
+ */
+/* ARGSUSED*/
+static void
+kmeminit(dummy)
+ void *dummy;
+{
+ u_int8_t indx;
+ u_long npg;
+ u_long mem_size;
+ void *hashmem;
+ u_long hashsize;
+ int highbit;
+ int bits;
+ int i;
+
+ mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
+
+ /*
+ * Try to auto-tune the kernel memory size, so that it is
+ * more applicable for a wider range of machine sizes.
+ * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while
+ * a VM_KMEM_SIZE of 12MB is a fair compromise. The
+ * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
+ * available, and on an X86 with a total KVA space of 256MB,
+ * try to keep VM_KMEM_SIZE_MAX at 80MB or below.
+ *
+ * Note that the kmem_map is also used by the zone allocator,
+ * so make sure that there is enough space.
+ */
+ vm_kmem_size = VM_KMEM_SIZE;
+ mem_size = cnt.v_page_count * PAGE_SIZE;
+
+#if defined(VM_KMEM_SIZE_SCALE)
+ if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size)
+ vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE;
+#endif
+
+#if defined(VM_KMEM_SIZE_MAX)
+ if (vm_kmem_size >= VM_KMEM_SIZE_MAX)
+ vm_kmem_size = VM_KMEM_SIZE_MAX;
+#endif
+
+ /* Allow final override from the kernel environment */
+ TUNABLE_INT_FETCH("kern.vm.kmem.size", &vm_kmem_size);
+
+ /*
+ * Limit kmem virtual size to twice the physical memory.
+ * This allows for kmem map sparseness, but limits the size
+ * to something sane. Be careful to not overflow the 32bit
+ * ints while doing the check.
+ */
+ if ((vm_kmem_size / 2) > (cnt.v_page_count * PAGE_SIZE))
+ vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE;
+
+ /*
+ * In mbuf_init(), we set up submaps for mbufs and clusters, in which
+ * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
+ * respectively. Mathematically, this means that what we do here may
+ * amount to slightly more address space than we need for the submaps,
+ * but it never hurts to have an extra page in kmem_map.
+ */
+ npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt *
+ sizeof(u_int) + vm_kmem_size) / PAGE_SIZE;
+
+ kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
+ (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
+ kmem_map->system_map = 1;
+
+ hashsize = npg * sizeof(void *);
+
+ highbit = 0;
+ bits = 0;
+ /* The hash size must be a power of two */
+ for (i = 0; i < 8 * sizeof(hashsize); i++)
+ if (hashsize & (1 << i)) {
+ highbit = i;
+ bits++;
+ }
+ if (bits > 1)
+ hashsize = 1 << (highbit);
+
+ hashmem = (void *)kmem_alloc(kernel_map, (vm_size_t)hashsize);
+ uma_startup2(hashmem, hashsize / sizeof(void *));
+
+ for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
+ int size = kmemzones[indx].kz_size;
+ char *name = kmemzones[indx].kz_name;
+
+ kmemzones[indx].kz_zone = uma_zcreate(name, size,
+#ifdef INVARIANTS
+ mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
+
+ for (;i <= size; i+= KMEM_ZBASE)
+ kmemsize[i >> KMEM_ZSHIFT] = indx;
+
+ }
+}
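
The bit-counting loop near the end of kmeminit() above rounds hashsize down to a power of two whenever more than one bit is set, e.g. 5000 becomes 4096. A compact userland restatement; the function name is invented for illustration:

#include <stdio.h>

/* Round v down to the largest power of two <= v, as kmeminit() does. */
static unsigned long
round_down_pow2(unsigned long v)
{
	int i, highbit = 0, bits = 0;

	for (i = 0; i < (int)(8 * sizeof(v)); i++)
		if (v & (1UL << i)) {
			highbit = i;
			bits++;
		}
	if (bits > 1)
		v = 1UL << highbit;
	return (v);
}

int
main(void)
{
	printf("%lu\n", round_down_pow2(5000));	/* 4096 */
	printf("%lu\n", round_down_pow2(4096));	/* 4096, already a power of two */
	return (0);
}
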
+
+void
+malloc_init(data)
+ void *data;
+{
+ struct malloc_type *type = (struct malloc_type *)data;
+
+ mtx_lock(&malloc_mtx);
+ if (type->ks_magic != M_MAGIC)
+ panic("malloc type lacks magic");
+
+ if (cnt.v_page_count == 0)
+ panic("malloc_init not allowed before vm init");
+
+ if (type->ks_next != NULL)
+ return;
+
+ type->ks_next = kmemstatistics;
+ kmemstatistics = type;
+ mtx_init(&type->ks_mtx, type->ks_shortdesc, "Malloc Stats", MTX_DEF);
+ mtx_unlock(&malloc_mtx);
+}
+
+void
+malloc_uninit(data)
+ void *data;
+{
+ struct malloc_type *type = (struct malloc_type *)data;
+ struct malloc_type *t;
+
+ mtx_lock(&malloc_mtx);
+ mtx_lock(&type->ks_mtx);
+ if (type->ks_magic != M_MAGIC)
+ panic("malloc type lacks magic");
+
+ if (cnt.v_page_count == 0)
+ panic("malloc_uninit not allowed before vm init");
+
+ if (type == kmemstatistics)
+ kmemstatistics = type->ks_next;
+ else {
+ for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) {
+ if (t->ks_next == type) {
+ t->ks_next = type->ks_next;
+ break;
+ }
+ }
+ }
+ type->ks_next = NULL;
+ mtx_destroy(&type->ks_mtx);
+ mtx_unlock(&malloc_mtx);
+}
+
+static int
+sysctl_kern_malloc(SYSCTL_HANDLER_ARGS)
+{
+ struct malloc_type *type;
+ int linesize = 128;
+ int curline;
+ int bufsize;
+ int first;
+ int error;
+ char *buf;
+ char *p;
+ int cnt;
+ int len;
+ int i;
+
+ cnt = 0;
+
+ mtx_lock(&malloc_mtx);
+ for (type = kmemstatistics; type != NULL; type = type->ks_next)
+ cnt++;
+
+ mtx_unlock(&malloc_mtx);
+ bufsize = linesize * (cnt + 1);
+ p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
+ mtx_lock(&malloc_mtx);
+
+ len = snprintf(p, linesize,
+ "\n Type InUse MemUse HighUse Requests Size(s)\n");
+ p += len;
+
+ for (type = kmemstatistics; cnt != 0 && type != NULL;
+ type = type->ks_next, cnt--) {
+ if (type->ks_calls == 0)
+ continue;
+
+ curline = linesize - 2; /* Leave room for the \n */
+ len = snprintf(p, curline, "%13s%6lu%6luK%7luK%9llu",
+ type->ks_shortdesc,
+ type->ks_inuse,
+ (type->ks_memuse + 1023) / 1024,
+ (type->ks_maxused + 1023) / 1024,
+ (long long unsigned)type->ks_calls);
+ curline -= len;
+ p += len;
+
+ first = 1;
+ for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1;
+ i++) {
+ if (type->ks_size & (1 << i)) {
+ if (first)
+ len = snprintf(p, curline, " ");
+ else
+ len = snprintf(p, curline, ",");
+ curline -= len;
+ p += len;
+
+ len = snprintf(p, curline,
+ "%s", kmemzones[i].kz_name);
+ curline -= len;
+ p += len;
+
+ first = 0;
+ }
+ }
+
+ len = snprintf(p, 2, "\n");
+ p += len;
+ }
+
+ mtx_unlock(&malloc_mtx);
+ error = SYSCTL_OUT(req, buf, p - buf);
+
+ free(buf, M_TEMP);
+ return (error);
+}
+
+SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats");
+
+#ifdef MALLOC_PROFILE
+
+static int
+sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
+{
+ int linesize = 64;
+ uint64_t count;
+ uint64_t waste;
+ uint64_t mem;
+ int bufsize;
+ int error;
+ char *buf;
+ int rsize;
+ int size;
+ char *p;
+ int len;
+ int i;
+
+ bufsize = linesize * (KMEM_ZSIZE + 1);
+ bufsize += 128; /* For the stats line */
+ bufsize += 128; /* For the banner line */
+ waste = 0;
+ mem = 0;
+
+ p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
+ len = snprintf(p, bufsize,
+ "\n Size Requests Real Size\n");
+ bufsize -= len;
+ p += len;
+
+ for (i = 0; i < KMEM_ZSIZE; i++) {
+ size = i << KMEM_ZSHIFT;
+ rsize = kmemzones[kmemsize[i]].kz_size;
+ count = (long long unsigned)krequests[i];
+
+ len = snprintf(p, bufsize, "%6d%28llu%11d\n",
+ size, (unsigned long long)count, rsize);
+ bufsize -= len;
+ p += len;
+
+ if ((rsize * count) > (size * count))
+ waste += (rsize * count) - (size * count);
+ mem += (rsize * count);
+ }
+
+ len = snprintf(p, bufsize,
+ "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
+ (unsigned long long)mem, (unsigned long long)waste);
+ p += len;
+
+ error = SYSCTL_OUT(req, buf, p - buf);
+
+ free(buf, M_TEMP);
+ return (error);
+}
+
+SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
+#endif /* MALLOC_PROFILE */
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
new file mode 100644
index 0000000..ebcba94
--- /dev/null
+++ b/sys/kern/kern_mib.c
@@ -0,0 +1,336 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $FreeBSD$
+ */
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/jail.h>
+#include <sys/smp.h>
+
+SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
+ "Sysctl internal magic");
+SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0,
+ "High kernel, proc, limits &c");
+SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
+ "Virtual memory");
+SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
+ "File system");
+SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
+ "Network, (see socket.h)");
+SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
+ "Debugging");
+SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0,
+ "Sizeof various things");
+SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
+ "hardware");
+SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
+ "machine dependent");
+SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
+ "user-level");
+SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0,
+ "p1003_1b, (see p1003_1b.h)");
+
+SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0,
+ "Compatibility code");
+SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0,
+ "Security");
+#ifdef REGRESSION
+SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0,
+ "Regression test MIB");
+#endif
+
+SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD,
+ osrelease, 0, "Operating system release");
+
+SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD,
+ 0, BSD, "Operating system revision");
+
+SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD,
+ version, 0, "Kernel version");
+
+SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD,
+ ostype, 0, "Operating system type");
+
+extern int osreldate;
+SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD,
+ &osreldate, 0, "Operating system release date");
+
+SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD,
+ &maxproc, 0, "Maximum number of processes");
+
+SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
+ &maxprocperuid, 0, "Maximum processes allowed per userid");
+
+SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RD,
+ &maxusers, 0, "Hint for kernel tuning");
+
+SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD,
+ 0, ARG_MAX, "Maximum bytes of argument to execve(2)");
+
+SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD,
+	0, _KPOSIX_VERSION, "Version of POSIX with which the system attempts to comply");
+
+SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD,
+ 0, NGROUPS_MAX, "Maximum number of groups a user can belong to");
+
+SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD,
+ 0, 1, "Whether job control is available");
+
+#ifdef _POSIX_SAVED_IDS
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD,
+ 0, 1, "Whether saved set-group/user ID is available");
+#else
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD,
+ 0, 0, "Whether saved set-group/user ID is available");
+#endif
+
+char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
+
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW,
+ kernelname, sizeof kernelname, "Name of kernel file booted");
+
+#ifdef SMP
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD,
+ &mp_ncpus, 0, "Number of active CPUs");
+#else
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD,
+ 0, 1, "Number of active CPUs");
+#endif
+
+SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD,
+ 0, BYTE_ORDER, "System byte order");
+
+SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD,
+ 0, PAGE_SIZE, "System memory page size");
+
+static char machine_arch[] = MACHINE_ARCH;
+SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD,
+ machine_arch, 0, "System architecture");
+
+char hostname[MAXHOSTNAMELEN];
+
+static int
+sysctl_hostname(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ char tmphostname[MAXHOSTNAMELEN];
+ int error;
+
+ pr = req->td->td_ucred->cr_prison;
+ if (pr != NULL) {
+ if (!jail_set_hostname_allowed && req->newptr)
+ return (EPERM);
+ /*
+ * Process is in jail, so make a local copy of jail
+ * hostname to get/set so we don't have to hold the jail
+ * mutex during the sysctl copyin/copyout activities.
+ */
+ mtx_lock(&pr->pr_mtx);
+ bcopy(pr->pr_host, tmphostname, MAXHOSTNAMELEN);
+ mtx_unlock(&pr->pr_mtx);
+
+ error = sysctl_handle_string(oidp, tmphostname,
+ sizeof pr->pr_host, req);
+
+ if (req->newptr != NULL && error == 0) {
+ /*
+ * Copy the locally set hostname to the jail, if
+ * appropriate.
+ */
+ mtx_lock(&pr->pr_mtx);
+ bcopy(tmphostname, pr->pr_host, MAXHOSTNAMELEN);
+ mtx_unlock(&pr->pr_mtx);
+ }
+ } else
+ error = sysctl_handle_string(oidp,
+ hostname, sizeof hostname, req);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
+ CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON,
+ 0, 0, sysctl_hostname, "A", "Hostname");
+
+static int regression_securelevel_nonmonotonic = 0;
+
+#ifdef REGRESSION
+SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
+ &regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
+#endif
+
+int securelevel = -1;
+struct mtx securelevel_mtx;
+
+MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock",
+ MTX_DEF);
+
+static int
+sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ int error, level;
+
+ pr = req->td->td_ucred->cr_prison;
+
+ /*
+ * If the process is in jail, return the maximum of the global and
+ * local levels; otherwise, return the global level.
+ */
+ if (pr != NULL) {
+ mtx_lock(&pr->pr_mtx);
+ level = imax(securelevel, pr->pr_securelevel);
+ mtx_unlock(&pr->pr_mtx);
+ } else
+ level = securelevel;
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ /*
+	 * Permit update only if the new securelevel is not lower than
+	 * the current global level and the jail's local level, if any.
+ */
+ if (pr != NULL) {
+ mtx_lock(&pr->pr_mtx);
+ if (!regression_securelevel_nonmonotonic &&
+ (level < imax(securelevel, pr->pr_securelevel))) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EPERM);
+ }
+ pr->pr_securelevel = level;
+ mtx_unlock(&pr->pr_mtx);
+ } else {
+ mtx_lock(&securelevel_mtx);
+ if (!regression_securelevel_nonmonotonic &&
+ (level < securelevel)) {
+ mtx_unlock(&securelevel_mtx);
+ return (EPERM);
+ }
+ securelevel = level;
+ mtx_unlock(&securelevel_mtx);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
+ CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
+ "I", "Current secure level");
+
+char domainname[MAXHOSTNAMELEN];
+SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW,
+ &domainname, sizeof(domainname), "Name of the current YP/NIS domain");
+
+u_long hostid;
+SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID");
+
+/*
+ * This is really cheating. These actually live in the libc, something
+ * which I'm not quite sure is a good idea anyway, but in order for
+ * getnext and friends to actually work, we define dummies here.
+ */
+SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
+ "", 0, "PATH that finds all the standard utilities");
+SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
+ 0, 0, "Max ibase/obase values in bc(1)");
+SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
+ 0, 0, "Max array size in bc(1)");
+SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
+ 0, 0, "Max scale value in bc(1)");
+SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
+ 0, 0, "Max string length in bc(1)");
+SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
+SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
+ 0, 0, "Max length (bytes) of a text-processing utility's input line");
+SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of repeats of a regexp permitted");
+SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
+ 0, 0,
+ "The version of POSIX 1003.2 with which the system attempts to comply");
+SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
+ 0, 0, "Whether C development supports the C bindings option");
+SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports the C development utilities option");
+SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
+ 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN development utilities");
+SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN runtime utilities");
+SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
+ 0, 0, "Whether system supports creation of locales");
+SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports software development utilities");
+SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
+ 0, 0, "Whether system supports the user portability utilities");
+SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of streams a process may have open at one time");
+SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of types supported for timezone names");
+
+#include <sys/vnode.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
+ 0, sizeof(struct vnode), "sizeof(struct vnode)");
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
+ 0, sizeof(struct proc), "sizeof(struct proc)");
+
+#include <sys/conf.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD,
+ 0, sizeof(struct specinfo), "sizeof(struct specinfo)");
+
+#include <sys/bio.h>
+#include <sys/buf.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
+ 0, sizeof(struct bio), "sizeof(struct bio)");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
+ 0, sizeof(struct buf), "sizeof(struct buf)");
+
+#include <sys/user.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
+ 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c
new file mode 100644
index 0000000..74a0259
--- /dev/null
+++ b/sys/kern/kern_module.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+
+static MALLOC_DEFINE(M_MODULE, "module", "module data structures");
+
+typedef TAILQ_HEAD(, module) modulelist_t;
+struct module {
+ TAILQ_ENTRY(module) link; /* chain together all modules */
+ TAILQ_ENTRY(module) flink; /* all modules in a file */
+ struct linker_file *file; /* file which contains this module */
+ int refs; /* reference count */
+ int id; /* unique id number */
+ char *name; /* module name */
+ modeventhand_t handler; /* event handler */
+ void *arg; /* argument for handler */
+ modspecific_t data; /* module specific data */
+};
+
+#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
+
+static modulelist_t modules;
+struct sx modules_sx;
+static int nextid = 1;
+static void module_shutdown(void *, int);
+
+static int
+modevent_nop(module_t mod, int what, void *arg)
+{
+ return (0);
+}
+
+static void
+module_init(void *arg)
+{
+
+ sx_init(&modules_sx, "module subsystem sx lock");
+ TAILQ_INIT(&modules);
+ EVENTHANDLER_REGISTER(shutdown_post_sync, module_shutdown, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+}
+
+SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0)
+
+static void
+module_shutdown(void *arg1, int arg2)
+{
+ module_t mod;
+
+ MOD_SLOCK;
+ TAILQ_FOREACH(mod, &modules, link)
+ MOD_EVENT(mod, MOD_SHUTDOWN);
+ MOD_SUNLOCK;
+}
+
+void
+module_register_init(const void *arg)
+{
+ const moduledata_t *data = (const moduledata_t *)arg;
+ int error;
+ module_t mod;
+
+ MOD_SLOCK;
+ mod = module_lookupbyname(data->name);
+ if (mod == NULL)
+ panic("module_register_init: module named %s not found\n",
+ data->name);
+ MOD_SUNLOCK;
+ error = MOD_EVENT(mod, MOD_LOAD);
+ if (error) {
+ MOD_EVENT(mod, MOD_UNLOAD);
+ MOD_XLOCK;
+ module_release(mod);
+ MOD_XUNLOCK;
+ printf("module_register_init: MOD_LOAD (%s, %p, %p) error"
+ " %d\n", data->name, (void *)data->evhand, data->priv,
+ error);
+ }
+}
+
+int
+module_register(const moduledata_t *data, linker_file_t container)
+{
+ size_t namelen;
+ module_t newmod;
+
+ MOD_SLOCK;
+ newmod = module_lookupbyname(data->name);
+ if (newmod != NULL) {
+ MOD_SUNLOCK;
+ printf("module_register: module %s already exists!\n",
+ data->name);
+ return (EEXIST);
+ }
+ MOD_SUNLOCK;
+ namelen = strlen(data->name) + 1;
+ newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
+ if (newmod == NULL)
+ return (ENOMEM);
+ MOD_XLOCK;
+ newmod->refs = 1;
+ newmod->id = nextid++;
+ newmod->name = (char *)(newmod + 1);
+ strcpy(newmod->name, data->name);
+ newmod->handler = data->evhand ? data->evhand : modevent_nop;
+ newmod->arg = data->priv;
+ bzero(&newmod->data, sizeof(newmod->data));
+ TAILQ_INSERT_TAIL(&modules, newmod, link);
+
+ if (container)
+ TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
+ newmod->file = container;
+ MOD_XUNLOCK;
+ return (0);
+}
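
A small detail of module_register() above worth calling out: the module structure and its name are carved out of a single allocation, with the string stored directly after the struct (newmod->name = (char *)(newmod + 1)), so one free releases both. A minimal userland sketch of the same layout trick, with an invented struct and ordinary malloc/free:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mod {
	int	 id;
	char	*name;	/* points just past the struct itself */
};

int
main(void)
{
	const char *name = "if_example";
	size_t namelen = strlen(name) + 1;
	struct mod *m;

	/* One allocation holds both the struct and its name string. */
	m = malloc(sizeof(*m) + namelen);
	if (m == NULL)
		return (1);
	m->id = 1;
	m->name = (char *)(m + 1);
	strcpy(m->name, name);

	printf("module %d: %s\n", m->id, m->name);
	free(m);	/* a single free releases struct and name together */
	return (0);
}
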
+
+void
+module_reference(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
+ mod->refs++;
+}
+
+void
+module_release(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ if (mod->refs <= 0)
+ panic("module_release: bad reference count");
+
+ MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
+
+ mod->refs--;
+ if (mod->refs == 0) {
+ TAILQ_REMOVE(&modules, mod, link);
+ if (mod->file)
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ MOD_XUNLOCK;
+ free(mod, M_MODULE);
+ MOD_XLOCK;
+ }
+}
+
+module_t
+module_lookupbyname(const char *name)
+{
+ module_t mod;
+ int err;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link) {
+ err = strcmp(mod->name, name);
+ if (err == 0)
+ return (mod);
+ }
+ return (NULL);
+}
+
+module_t
+module_lookupbyid(int modid)
+{
+ module_t mod;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link)
+ if (mod->id == modid)
+ return(mod);
+ return (NULL);
+}
+
+int
+module_unload(module_t mod)
+{
+
+ return (MOD_EVENT(mod, MOD_UNLOAD));
+}
+
+int
+module_getid(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (mod->id);
+}
+
+module_t
+module_getfnext(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (TAILQ_NEXT(mod, flink));
+}
+
+void
+module_setspecific(module_t mod, modspecific_t *datap)
+{
+
+ MOD_XLOCK_ASSERT;
+ mod->data = *datap;
+}
+
+/*
+ * Syscalls.
+ */
+/*
+ * MPSAFE
+ */
+int
+modnext(struct thread *td, struct modnext_args *uap)
+{
+ module_t mod;
+ int error = 0;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ if (SCARG(uap, modid) == 0) {
+ mod = TAILQ_FIRST(&modules);
+ if (mod)
+ td->td_retval[0] = mod->id;
+ else
+ error = ENOENT;
+ goto done2;
+ }
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (mod == NULL) {
+ error = ENOENT;
+ goto done2;
+ }
+ if (TAILQ_NEXT(mod, link))
+ td->td_retval[0] = TAILQ_NEXT(mod, link)->id;
+ else
+ td->td_retval[0] = 0;
+done2:
+ MOD_SUNLOCK;
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+modfnext(struct thread *td, struct modfnext_args *uap)
+{
+ module_t mod;
+ int error;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (mod == NULL) {
+ error = ENOENT;
+ } else {
+ error = 0;
+ if (TAILQ_NEXT(mod, flink))
+ td->td_retval[0] = TAILQ_NEXT(mod, flink)->id;
+ else
+ td->td_retval[0] = 0;
+ }
+ MOD_SUNLOCK;
+ return (error);
+}
+
+struct module_stat_v1 {
+ int version; /* set to sizeof(struct module_stat) */
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+};
+
+/*
+ * MPSAFE
+ */
+int
+modstat(struct thread *td, struct modstat_args *uap)
+{
+ module_t mod;
+ modspecific_t data;
+ int error = 0;
+ int id, namelen, refs, version;
+ struct module_stat *stat;
+ char *name;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (mod == NULL) {
+ MOD_SUNLOCK;
+ return (ENOENT);
+ }
+ id = mod->id;
+ refs = mod->refs;
+ name = mod->name;
+ data = mod->data;
+ MOD_SUNLOCK;
+ stat = SCARG(uap, stat);
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
+ return (error);
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat))
+ return (EINVAL);
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+ if ((error = copyout(name, &stat->name[0], namelen)) != 0)
+ return (error);
+
+ if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0)
+ return (error);
+ if ((error = copyout(&id, &stat->id, sizeof(int))) != 0)
+ return (error);
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat))
+ if ((error = copyout(&data, &stat->data,
+ sizeof(data))) != 0)
+ return (error);
+ td->td_retval[0] = 0;
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+modfind(struct thread *td, struct modfind_args *uap)
+{
+ int error = 0;
+ char name[MAXMODNAME];
+ module_t mod;
+
+ if ((error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) != 0)
+ return (error);
+
+ MOD_SLOCK;
+ mod = module_lookupbyname(name);
+ if (mod == NULL)
+ error = ENOENT;
+ else
+ td->td_retval[0] = module_getid(mod);
+ MOD_SUNLOCK;
+ return (error);
+}
diff --git a/sys/kern/kern_mtxpool.c b/sys/kern/kern_mtxpool.c
new file mode 100644
index 0000000..3d4aa1c
--- /dev/null
+++ b/sys/kern/kern_mtxpool.c
@@ -0,0 +1,115 @@
+/*-
+ * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. Copyright
+ * terms are as specified in the COPYRIGHT file at the base of the source
+ * tree.
+ *
+ * Mutex pool routines. These routines are designed to be used as short
+ * term leaf mutexes (e.g. the last mutex you might acquire before
+ * calling msleep()). They operate using a shared pool. A mutex is chosen
+ * from the pool based on the supplied pointer (which may or may not be
+ * valid).
+ *
+ * Advantages:
+ * - no structural overhead. Mutexes can be associated with structures
+ * without adding bloat to the structures.
+ * - mutexes can be obtained for invalid pointers, useful when using
+ * mutexes to interlock destructor ops.
+ * - no initialization/destructor overhead
+ * - can be used with msleep.
+ *
+ * Disadvantages:
+ * - should generally only be used as leaf mutexes
+ * - pool/pool dependency ordering cannot be depended on.
+ * - possible L1 cache mastership contention between CPUs
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+#ifndef MTX_POOL_SIZE
+#define MTX_POOL_SIZE 128
+#endif
+#define MTX_POOL_MASK (MTX_POOL_SIZE-1)
+
+static struct mtx mtx_pool_ary[MTX_POOL_SIZE];
+
+int mtx_pool_valid = 0;
+
+/*
+ * Inline version of mtx_pool_find(), used to streamline our main API
+ * function calls.
+ */
+static __inline
+struct mtx *
+_mtx_pool_find(void *ptr)
+{
+ int p;
+
+ p = (int)(uintptr_t)ptr;
+ return(&mtx_pool_ary[(p ^ (p >> 6)) & MTX_POOL_MASK]);
+}
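
The pool slot above is picked by folding the pointer onto itself, (p ^ (p >> 6)) & MTX_POOL_MASK, so addresses that differ only in low-order alignment bits still spread across the MTX_POOL_SIZE mutexes. A userland sketch of the same hash; the arithmetic is kept in uintptr_t here for portability (the kernel narrows through int first), and the sample addresses are arbitrary:

#include <stdio.h>
#include <stdint.h>

#define MTX_POOL_SIZE	128
#define MTX_POOL_MASK	(MTX_POOL_SIZE - 1)

/* Same pointer-to-index mapping as _mtx_pool_find(). */
static unsigned
pool_index(void *ptr)
{
	uintptr_t p = (uintptr_t)ptr;

	return ((unsigned)((p ^ (p >> 6)) & MTX_POOL_MASK));
}

int
main(void)
{
	/* Two nearby addresses land on different pool slots. */
	printf("%u\n", pool_index((void *)(uintptr_t)0xc0a01040));
	printf("%u\n", pool_index((void *)(uintptr_t)0xc0a01080));
	return (0);
}
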
+
+static void
+mtx_pool_setup(void *dummy __unused)
+{
+ int i;
+
+ for (i = 0; i < MTX_POOL_SIZE; ++i)
+ mtx_init(&mtx_pool_ary[i], "pool mutex", NULL, MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ mtx_pool_valid = 1;
+}
+
+/*
+ * Obtain a (shared) mutex from the pool. The returned mutex is a leaf
+ * level mutex, meaning that if you obtain it you cannot obtain any other
+ * mutexes until you release it. You can legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_alloc(void)
+{
+ static int si;
+ return(&mtx_pool_ary[si++ & MTX_POOL_MASK]);
+}
+
+/*
+ * Return the (shared) pool mutex associated with the specified address.
+ * The returned mutex is a leaf level mutex, meaning that if you obtain it
+ * you cannot obtain any other mutexes until you release it. You can
+ * legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_find(void *ptr)
+{
+ return(_mtx_pool_find(ptr));
+}
+
+/*
+ * Combined find/lock operation. Lock the pool mutex associated with
+ * the specified address.
+ */
+void
+mtx_pool_lock(void *ptr)
+{
+ mtx_lock(_mtx_pool_find(ptr));
+}
+
+/*
+ * Combined find/unlock operation. Unlock the pool mutex associated with
+ * the specified address.
+ */
+void
+mtx_pool_unlock(void *ptr)
+{
+ mtx_unlock(_mtx_pool_find(ptr));
+}
+
+SYSINIT(mtxpooli, SI_SUB_MTX_POOL, SI_ORDER_FIRST, mtx_pool_setup, NULL)
+
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
new file mode 100644
index 0000000..08bca8d
--- /dev/null
+++ b/sys/kern/kern_mutex.c
@@ -0,0 +1,986 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Machine independent bits of mutex implementation.
+ */
+
+#include "opt_adaptive_mutexes.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/stdint.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+/*
+ * Internal utility macros.
+ */
+#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED)
+
+#define mtx_owner(m) (mtx_unowned((m)) ? NULL \
+ : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))
+
+/* XXXKSE This test will change. */
+#define thread_running(td) \
+ ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU)
+
+/*
+ * Lock classes for sleep and spin mutexes.
+ */
+struct lock_class lock_class_mtx_sleep = {
+ "sleep mutex",
+ LC_SLEEPLOCK | LC_RECURSABLE
+};
+struct lock_class lock_class_mtx_spin = {
+ "spin mutex",
+ LC_SPINLOCK | LC_RECURSABLE
+};
+
+/*
+ * System-wide mutexes
+ */
+struct mtx sched_lock;
+struct mtx Giant;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static void propagate_priority(struct thread *);
+
+static void
+propagate_priority(struct thread *td)
+{
+ int pri = td->td_priority;
+ struct mtx *m = td->td_blocked;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ for (;;) {
+ struct thread *td1;
+
+ td = mtx_owner(m);
+
+ if (td == NULL) {
+ /*
+ * This really isn't quite right. Really
+ * ought to bump priority of thread that
+ * next acquires the mutex.
+ */
+ MPASS(m->mtx_lock == MTX_CONTESTED);
+ return;
+ }
+
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex"));
+ if (td->td_priority <= pri) /* lower is higher priority */
+ return;
+
+ /*
+ * Bump this thread's priority.
+ */
+ td->td_priority = pri;
+
+ /*
+ * If lock holder is actually running, just bump priority.
+ */
+ if (thread_running(td)) {
+ MPASS(td->td_proc->p_stat == SRUN
+ || td->td_proc->p_stat == SZOMB
+ || td->td_proc->p_stat == SSTOP);
+ return;
+ }
+
+#ifndef SMP
+ /*
+ * For UP, we check to see if td is curthread (this shouldn't
+	 * ever happen, however, as it would mean we are in a deadlock).
+ */
+ KASSERT(td != curthread, ("Deadlock detected"));
+#endif
+
+ /*
+ * If on run queue move to new run queue, and quit.
+ * XXXKSE this gets a lot more complicated under threads
+ * but try anyhow.
+ */
+ if (td->td_proc->p_stat == SRUN) {
+ MPASS(td->td_blocked == NULL);
+ remrunqueue(td);
+ setrunqueue(td);
+ return;
+ }
+
+ /*
+ * If we aren't blocked on a mutex, we should be.
+ */
+ KASSERT(td->td_proc->p_stat == SMTX, (
+ "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat,
+ m->mtx_object.lo_name));
+
+ /*
+ * Pick up the mutex that td is blocked on.
+ */
+ m = td->td_blocked;
+ MPASS(m != NULL);
+
+ /*
+ * Check if the thread needs to be moved up on
+ * the blocked chain
+ */
+ if (td == TAILQ_FIRST(&m->mtx_blocked)) {
+ continue;
+ }
+
+ td1 = TAILQ_PREV(td, threadqueue, td_blkq);
+ if (td1->td_priority <= pri) {
+ continue;
+ }
+
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved up to. Since we know that td1 has
+ * a lower priority than td, we know that at least one
+ * thread in the chain has a lower priority and that
+ * td1 will thus not be NULL after the loop.
+ */
+ TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq);
+ TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) {
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (td1->td_priority > pri)
+ break;
+ }
+
+ MPASS(td1 != NULL);
+ TAILQ_INSERT_BEFORE(td1, td, td_blkq);
+ CTR4(KTR_LOCK,
+ "propagate_priority: p %p moved before %p on [%p] %s",
+ td, td1, m, m->mtx_object.lo_name);
+ }
+}
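
propagate_priority() above lends the blocked thread's priority to each lock holder along the owner chain, stopping once a holder is already at least as urgent. A toy userland sketch of that core idea, with the run-queue and blocked-queue bookkeeping omitted and all struct and variable names invented for illustration:

#include <stdio.h>

struct mtx;
struct thread {
	int		 priority;	/* lower value = higher priority */
	struct mtx	*blocked_on;	/* mutex this thread sleeps on, or NULL */
};
struct mtx {
	struct thread	*owner;
};

/*
 * Walk the owner chain and lend td's priority to every holder whose
 * priority is worse, as propagate_priority() does.
 */
static void
propagate(struct thread *td)
{
	int pri = td->priority;
	struct mtx *m = td->blocked_on;

	while (m != NULL && m->owner != NULL) {
		if (m->owner->priority <= pri)
			return;		/* holder already high enough */
		m->owner->priority = pri;
		m = m->owner->blocked_on;
	}
}

int
main(void)
{
	struct mtx m1, m2;
	struct thread low = { 80, NULL }, med = { 50, NULL }, high = { 10, NULL };

	m1.owner = &low;	/* a low-priority thread holds m1 ... */
	med.blocked_on = &m1;	/* ... a medium-priority thread waits on it ... */
	m2.owner = &med;
	high.blocked_on = &m2;	/* ... and a high-priority thread waits on m2 */

	propagate(&high);
	printf("low=%d med=%d\n", low.priority, med.priority);	/* both become 10 */
	return (0);
}
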
+
+#ifdef MUTEX_PROFILING
+SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
+SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
+static int mutex_prof_enable = 0;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
+ &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");
+
+struct mutex_prof {
+ const char *name;
+ const char *file;
+ int line;
+#define MPROF_MAX 0
+#define MPROF_TOT 1
+#define MPROF_CNT 2
+#define MPROF_AVG 3
+ uintmax_t counter[4];
+ struct mutex_prof *next;
+};
+
+/*
+ * mprof_buf is a static pool of profiling records to avoid possible
+ * reentrance of the memory allocation functions.
+ *
+ * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
+ */
+#define NUM_MPROF_BUFFERS 1000
+static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
+static int first_free_mprof_buf;
+#define MPROF_HASH_SIZE 1009
+static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];
+
+static int mutex_prof_acquisitions;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
+ &mutex_prof_acquisitions, 0, "Number of mutex acquisitions recorded");
+static int mutex_prof_records;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
+ &mutex_prof_records, 0, "Number of profiling records");
+static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
+ &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
+static int mutex_prof_rejected;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
+ &mutex_prof_rejected, 0, "Number of rejected profiling records");
+static int mutex_prof_hashsize = MPROF_HASH_SIZE;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
+ &mutex_prof_hashsize, 0, "Hash size");
+static int mutex_prof_collisions = 0;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
+ &mutex_prof_collisions, 0, "Number of hash collisions");
+
+/*
+ * mprof_mtx protects the profiling buffers and the hash.
+ */
+static struct mtx mprof_mtx;
+MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
+
+static u_int64_t
+nanoseconds(void)
+{
+ struct timespec tv;
+
+ nanotime(&tv);
+ return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
+}
+
+static int
+dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ int error, i;
+
+ if (first_free_mprof_buf == 0)
+ return SYSCTL_OUT(req, "No locking recorded",
+ sizeof("No locking recorded"));
+
+ sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND);
+ sbuf_printf(sb, "%12s %12s %12s %12s %s\n",
+ "max", "total", "count", "average", "name");
+ mtx_lock_spin(&mprof_mtx);
+ for (i = 0; i < first_free_mprof_buf; ++i)
+ sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n",
+ mprof_buf[i].counter[MPROF_MAX] / 1000,
+ mprof_buf[i].counter[MPROF_TOT] / 1000,
+ mprof_buf[i].counter[MPROF_CNT],
+ mprof_buf[i].counter[MPROF_AVG] / 1000,
+ mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
+ mtx_unlock_spin(&mprof_mtx);
+ sbuf_finish(sb);
+ error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+ sbuf_delete(sb);
+ return (error);
+}
+SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
+#endif
+
+/*
+ * Function versions of the inlined __mtx_* macros. These are used by
+ * modules and can also be called from assembly language if needed.
+ */
+void
+_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ _get_sleep_lock(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+#ifdef MUTEX_PROFILING
+ /* don't reset the timer when/if recursing */
+ if (m->acqtime == 0) {
+ m->file = file;
+ m->line = line;
+ m->acqtime = mutex_prof_enable ? nanoseconds() : 0;
+ ++mutex_prof_acquisitions;
+ }
+#endif
+}
+
+void
+_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ mtx_assert(m, MA_OWNED);
+#ifdef MUTEX_PROFILING
+ if (m->acqtime != 0) {
+ static const char *unknown = "(unknown)";
+ struct mutex_prof *mpp;
+ u_int64_t acqtime, now;
+ const char *p, *q;
+ volatile u_int hash;
+
+ now = nanoseconds();
+ acqtime = m->acqtime;
+ m->acqtime = 0;
+ if (now <= acqtime)
+ goto out;
+ for (p = file; strncmp(p, "../", 3) == 0; p += 3)
+ /* nothing */ ;
+ if (p == NULL || *p == '\0')
+ p = unknown;
+ for (hash = line, q = p; *q != '\0'; ++q)
+ hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
+ mtx_lock_spin(&mprof_mtx);
+ for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
+ if (mpp->line == line && strcmp(mpp->file, p) == 0)
+ break;
+ if (mpp == NULL) {
+ /* Just exit if we cannot get a trace buffer */
+ if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
+ ++mutex_prof_rejected;
+ goto unlock;
+ }
+ mpp = &mprof_buf[first_free_mprof_buf++];
+ mpp->name = mtx_name(m);
+ mpp->file = p;
+ mpp->line = line;
+ mpp->next = mprof_hash[hash];
+ if (mprof_hash[hash] != NULL)
+ ++mutex_prof_collisions;
+ mprof_hash[hash] = mpp;
+ ++mutex_prof_records;
+ }
+ /*
+ * Record if the mutex has been held longer now than ever
+ * before
+ */
+ if ((now - acqtime) > mpp->counter[MPROF_MAX])
+ mpp->counter[MPROF_MAX] = now - acqtime;
+ mpp->counter[MPROF_TOT] += now - acqtime;
+ mpp->counter[MPROF_CNT] += 1;
+ mpp->counter[MPROF_AVG] =
+ mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT];
+unlock:
+ mtx_unlock_spin(&mprof_mtx);
+ }
+out:
+#endif
+ WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ _rel_sleep_lock(m, curthread, opts, file, line);
+}
+
+void
+_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+#if defined(SMP) || LOCK_DEBUG > 0
+ _get_spin_lock(m, curthread, opts, file, line);
+#else
+ critical_enter();
+#endif
+ LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+void
+_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ mtx_assert(m, MA_OWNED);
+ WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+#if defined(SMP) || LOCK_DEBUG > 0
+ _rel_spin_lock(m);
+#else
+ critical_exit();
+#endif
+}
+
+/*
+ * The important part of mtx_trylock{,_flags}()
+ * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that
+ * if we're called, it's because we know we don't already own this lock.
+ */
+int
+_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
+{
+ int rval;
+
+ MPASS(curthread != NULL);
+
+ rval = _obtain_lock(m, curthread);
+
+ LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
+ if (rval) {
+ /*
+ * We do not handle recursion in _mtx_trylock; see the
+ * note at the top of the routine.
+ */
+ KASSERT(!mtx_recursed(m),
+ ("mtx_trylock() called on a recursed mutex"));
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ }
+
+ return (rval);
+}
+
+/*
+ * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
+ *
+ * We call this if the lock is either contested (i.e. we need to go to
+ * sleep waiting for it), or if we need to recurse on it.
+ */
+void
+_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
+{
+ struct thread *td = curthread;
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ struct thread *owner;
+#endif
+
+ if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) {
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
+ return;
+ }
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR4(KTR_LOCK,
+ "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
+ m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);
+
+ while (!_obtain_lock(m, td)) {
+ uintptr_t v;
+ struct thread *td1;
+
+ mtx_lock_spin(&sched_lock);
+ /*
+ * Check if the lock has been released while spinning for
+ * the sched_lock.
+ */
+ if ((v = m->mtx_lock) == MTX_UNOWNED) {
+ mtx_unlock_spin(&sched_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+
+ /*
+ * The mutex was marked contested on release. This means that
+ * there are threads blocked on it.
+ */
+ if (v == MTX_CONTESTED) {
+ td1 = TAILQ_FIRST(&m->mtx_blocked);
+ MPASS(td1 != NULL);
+ m->mtx_lock = (uintptr_t)td | MTX_CONTESTED;
+
+ if (td1->td_priority < td->td_priority)
+ td->td_priority = td1->td_priority;
+ mtx_unlock_spin(&sched_lock);
+ return;
+ }
+
+ /*
+ * If the mutex isn't already contested and a failure occurs
+ * setting the contested bit, the mutex was either released
+ * or the state of the MTX_RECURSED bit changed.
+ */
+ if ((v & MTX_CONTESTED) == 0 &&
+ !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
+ (void *)(v | MTX_CONTESTED))) {
+ mtx_unlock_spin(&sched_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ /*
+ * If the current owner of the lock is executing on another
+ * CPU, spin instead of blocking.
+ */
+ owner = (struct thread *)(v & MTX_FLAGMASK);
+ if (m != &Giant && thread_running(owner)) {
+ mtx_unlock_spin(&sched_lock);
+ while (mtx_owner(m) == owner && thread_running(owner)) {
+#ifdef __i386__
+ ia32_pause();
+#endif
+ }
+ continue;
+ }
+#endif /* SMP && ADAPTIVE_MUTEXES */
+
+ /*
+ * We definitely must sleep for this lock.
+ */
+ mtx_assert(m, MA_NOTOWNED);
+
+#ifdef notyet
+ /*
+ * If we're borrowing an interrupted thread's VM context, we
+ * must clean up before going to sleep.
+ */
+ if (td->td_ithd != NULL) {
+ struct ithd *it = td->td_ithd;
+
+ if (it->it_interrupted) {
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_lock_sleep: %p interrupted %p",
+ it, it->it_interrupted);
+ intr_thd_fixup(it);
+ }
+ }
+#endif
+
+ /*
+ * Put us on the list of threads blocked on this mutex.
+ */
+ if (TAILQ_EMPTY(&m->mtx_blocked)) {
+ td1 = mtx_owner(m);
+ LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested);
+ TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
+ } else {
+ TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq)
+ if (td1->td_priority > td->td_priority)
+ break;
+ if (td1)
+ TAILQ_INSERT_BEFORE(td1, td, td_blkq);
+ else
+ TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
+ }
+
+ /*
+ * Save who we're blocked on.
+ */
+ td->td_blocked = m;
+ td->td_mtxname = m->mtx_object.lo_name;
+ td->td_proc->p_stat = SMTX;
+ propagate_priority(td);
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR3(KTR_LOCK,
+ "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m,
+ m->mtx_object.lo_name);
+
+ td->td_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR3(KTR_LOCK,
+ "_mtx_lock_sleep: p %p free from blocked on [%p] %s",
+ td, m, m->mtx_object.lo_name);
+
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ return;
+}
+
+/*
+ * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
+ *
+ * This is only called if we need to actually spin for the lock. Recursion
+ * is handled inline.
+ */
+void
+_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line)
+{
+ int i = 0;
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
+
+ for (;;) {
+ if (_obtain_lock(m, curthread))
+ break;
+
+ /* Give interrupts a chance while we spin. */
+ critical_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000) {
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+ if (i < 60000000)
+ DELAY(1);
+#ifdef DDB
+ else if (!db_active)
+#else
+ else
+#endif
+ panic("spin lock %s held by %p for > 5 seconds",
+ m->mtx_object.lo_name, (void *)m->mtx_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ }
+ critical_enter();
+ }
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
+
+ return;
+}
+
+/*
+ * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
+ *
+ * We are only called here if the lock is recursed or contested (i.e. we
+ * need to wake up a blocked thread).
+ */
+void
+_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
+{
+ struct thread *td, *td1;
+ struct mtx *m1;
+ int pri;
+
+ td = curthread;
+
+ if (mtx_recursed(m)) {
+ if (--(m->mtx_recurse) == 0)
+ atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
+ return;
+ }
+
+ mtx_lock_spin(&sched_lock);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
+
+ td1 = TAILQ_FIRST(&m->mtx_blocked);
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ if (td1 == NULL) {
+ _release_lock_quick(m);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
+ mtx_unlock_spin(&sched_lock);
+ return;
+ }
+#endif
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+
+ TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq);
+
+ if (TAILQ_EMPTY(&m->mtx_blocked)) {
+ LIST_REMOVE(m, mtx_contested);
+ _release_lock_quick(m);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
+ } else
+ atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
+
+ pri = PRI_MAX;
+ LIST_FOREACH(m1, &td->td_contested, mtx_contested) {
+ int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+
+ if (pri > td->td_base_pri)
+ pri = td->td_base_pri;
+ td->td_priority = pri;
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p",
+ m, td1);
+
+ td1->td_blocked = NULL;
+ td1->td_proc->p_stat = SRUN;
+ setrunqueue(td1);
+
+ if (td->td_critnest == 1 && td1->td_priority < pri) {
+#ifdef notyet
+ if (td->td_ithd != NULL) {
+ struct ithd *it = td->td_ithd;
+
+ if (it->it_interrupted) {
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_unlock_sleep: %p interrupted %p",
+ it, it->it_interrupted);
+ intr_thd_fixup(it);
+ }
+ }
+#endif
+ setrunqueue(td);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_unlock_sleep: %p switching out lock=%p", m,
+ (void *)m->mtx_lock);
+
+ td->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
+ m, (void *)m->mtx_lock);
+ }
+
+ mtx_unlock_spin(&sched_lock);
+
+ return;
+}
+
+/*
+ * All the unlocking of MTX_SPIN locks is done inline.
+ * See the _rel_spin_lock() macro for the details.
+ */
+
+/*
+ * The backing function for the INVARIANTS-enabled mtx_assert()
+ */
+#ifdef INVARIANT_SUPPORT
+void
+_mtx_assert(struct mtx *m, int what, const char *file, int line)
+{
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case MA_OWNED:
+ case MA_OWNED | MA_RECURSED:
+ case MA_OWNED | MA_NOTRECURSED:
+ if (!mtx_owned(m))
+ panic("mutex %s not owned at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ if (mtx_recursed(m)) {
+ if ((what & MA_NOTRECURSED) != 0)
+ panic("mutex %s recursed at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ } else if ((what & MA_RECURSED) != 0) {
+ panic("mutex %s unrecursed at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ }
+ break;
+ case MA_NOTOWNED:
+ if (mtx_owned(m))
+ panic("mutex %s owned at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ break;
+ default:
+ panic("unknown mtx_assert at %s:%d", file, line);
+ }
+}
+#endif
+
+/*
+ * The MUTEX_DEBUG-enabled mtx_validate()
+ *
+ * Most of these checks have been moved off into the LO_INITIALIZED flag
+ * maintained by the witness code.
+ */
+#ifdef MUTEX_DEBUG
+
+void mtx_validate(struct mtx *);
+
+void
+mtx_validate(struct mtx *m)
+{
+
+/*
+ * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
+ * we can re-enable the kernacc() checks.
+ */
+#ifndef __alpha__
+ /*
+ * Can't call kernacc() from early init386(), especially when
+ * initializing Giant mutex, because some stuff in kernacc()
+ * requires Giant itself.
+ */
+ if (!cold)
+ if (!kernacc((caddr_t)m, sizeof(m),
+ VM_PROT_READ | VM_PROT_WRITE))
+ panic("Can't read and write to mutex %p", m);
+#endif
+}
+#endif
+
+/*
+ * General init routine used by the MTX_SYSINIT() macro.
+ */
+void
+mtx_sysinit(void *arg)
+{
+ struct mtx_args *margs = arg;
+
+ mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts);
+}
+
+/*
+ * Mutex initialization routine; initialize lock `m' of type contained in
+ * `opts' with options contained in `opts' and name `name.' The optional
+ * lock type `type' is used as a general lock category name for use with
+ * witness.
+ */
+void
+mtx_init(struct mtx *m, const char *name, const char *type, int opts)
+{
+ struct lock_object *lock;
+
+ MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
+ MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0);
+
+#ifdef MUTEX_DEBUG
+ /* Diagnostic and error correction */
+ mtx_validate(m);
+#endif
+
+ lock = &m->mtx_object;
+ KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
+ ("mutex %s %p already initialized", name, m));
+ bzero(m, sizeof(*m));
+ if (opts & MTX_SPIN)
+ lock->lo_class = &lock_class_mtx_spin;
+ else
+ lock->lo_class = &lock_class_mtx_sleep;
+ lock->lo_name = name;
+ lock->lo_type = type != NULL ? type : name;
+ if (opts & MTX_QUIET)
+ lock->lo_flags = LO_QUIET;
+ if (opts & MTX_RECURSE)
+ lock->lo_flags |= LO_RECURSABLE;
+ if (opts & MTX_SLEEPABLE)
+ lock->lo_flags |= LO_SLEEPABLE;
+ if ((opts & MTX_NOWITNESS) == 0)
+ lock->lo_flags |= LO_WITNESS;
+ if (opts & MTX_DUPOK)
+ lock->lo_flags |= LO_DUPOK;
+
+ m->mtx_lock = MTX_UNOWNED;
+ TAILQ_INIT(&m->mtx_blocked);
+
+ LOCK_LOG_INIT(lock, opts);
+
+ WITNESS_INIT(lock);
+}
+
+/*
+ * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be
+ * passed in as a flag here because if the corresponding mtx_init() was
+ * called with MTX_QUIET set, then it will already be set in the mutex's
+ * flags.
+ */
+void
+mtx_destroy(struct mtx *m)
+{
+
+ LOCK_LOG_DESTROY(&m->mtx_object, 0);
+
+ if (!mtx_owned(m))
+ MPASS(mtx_unowned(m));
+ else {
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+
+ /* Tell witness this isn't locked to make it happy. */
+ WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
+ __LINE__);
+ }
+
+ WITNESS_DESTROY(&m->mtx_object);
+}
+
+/*
+ * Initialize the mutex code and system mutexes. This is called from the MD
+ * startup code prior to mi_startup(). The per-CPU data space needs to be
+ * set up before this is called.
+ */
+void
+mutex_init(void)
+{
+
+ /* Setup thread0 so that mutexes work. */
+ LIST_INIT(&thread0.td_contested);
+
+ /*
+ * Initialize mutexes.
+ */
+ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_lock(&Giant);
+}
+
+/*
+ * Encapsulated Giant mutex routines. These routines provide encapsulation
+ * control for the Giant mutex, allowing sysctls to be used to turn on and
+ * off Giant around certain subsystems. The default values for the sysctls
+ * are set to what developers believe is stable and working in regard to
+ * the Giant pushdown. Developers should not turn off Giant via these
+ * sysctls unless they know what they are doing.
+ *
+ * Callers of mtx_lock_giant() are expected to pass the return value to an
+ * accompanying mtx_unlock_giant() later on. If multiple subsystems are
+ * affected by a Giant wrap, all related sysctl variables must be zero for
+ * the subsystem call to operate without Giant (as determined by the caller).
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation");
+
+static int kern_giant_all = 0;
+SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, "");
+
+int kern_giant_proc = 1; /* Giant around PROC locks */
+int kern_giant_file = 1; /* Giant around struct file & filedesc */
+int kern_giant_ucred = 1; /* Giant around ucred */
+SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, "");
+SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, "");
+SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, "");
+
+int
+mtx_lock_giant(int sysctlvar)
+{
+ if (sysctlvar || kern_giant_all) {
+ mtx_lock(&Giant);
+ return(1);
+ }
+ return(0);
+}
+
+void
+mtx_unlock_giant(int s)
+{
+ if (s)
+ mtx_unlock(&Giant);
+}
+
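
The Giant wrapper pair above is meant to be used bracket-style, with the value returned by mtx_lock_giant() handed back to the matching mtx_unlock_giant(). A minimal caller sketch, assuming a subsystem guarded by the kern_giant_proc sysctl declared above (the surrounding code is illustrative, not part of this file):

	int s;

	s = mtx_lock_giant(kern_giant_proc);	/* takes Giant only if the sysctl or kern.giant.all is set */
	/* ... code that may still rely on Giant around proc locks ... */
	mtx_unlock_giant(s);			/* drops Giant only if it was taken above */
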
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
new file mode 100644
index 0000000..cd2db73
--- /dev/null
+++ b/sys/kern/kern_ntptime.c
@@ -0,0 +1,935 @@
+/***********************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993-2001 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and *
+ * its documentation for any purpose and without fee is hereby *
+ * granted, provided that the above copyright notice appears in all *
+ * copies and that both the copyright notice and this permission *
+ * notice appear in supporting documentation, and that the name *
+ * University of Delaware not be used in advertising or publicity *
+ * pertaining to distribution of the software without specific, *
+ * written prior permission. The University of Delaware makes no *
+ * representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied *
+ * warranty. *
+ * *
+ **********************************************************************/
+
+/*
+ * Adapted from the original sources for FreeBSD and timecounters by:
+ * Poul-Henning Kamp <phk@FreeBSD.org>.
+ *
+ * The 32-bit version of the "LP" macros seems a bit past its "sell by"
+ * date, so I have retained only the 64-bit version and included it directly
+ * in this file.
+ *
+ * Only minor changes done to interface with the timecounters over in
+ * sys/kern/kern_clock.c. Some of the comments below may be (even more)
+ * confusing and/or plain wrong in that context.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <sys/timetc.h>
+#include <sys/timepps.h>
+#include <sys/sysctl.h>
+
+/*
+ * Single-precision macros for 64-bit machines
+ */
+typedef long long l_fp;
+#define L_ADD(v, u) ((v) += (u))
+#define L_SUB(v, u) ((v) -= (u))
+#define L_ADDHI(v, a) ((v) += (long long)(a) << 32)
+#define L_NEG(v) ((v) = -(v))
+#define L_RSHIFT(v, n) \
+ do { \
+ if ((v) < 0) \
+ (v) = -(-(v) >> (n)); \
+ else \
+ (v) = (v) >> (n); \
+ } while (0)
+#define L_MPY(v, a) ((v) *= (a))
+#define L_CLR(v) ((v) = 0)
+#define L_ISNEG(v) ((v) < 0)
+#define L_LINT(v, a) ((v) = (long long)(a) << 32)
+#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
+
+/*
+ * Generic NTP kernel interface
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by other routines in this module to adjust the
+ * phase and frequency of the clock discipline loop which controls the
+ * system clock.
+ *
+ * When the kernel time is reckoned directly in nanoseconds (NTP_NANO
+ * defined), the time at each tick interrupt is derived directly from
+ * the kernel time variable. When the kernel time is reckoned in
+ * microseconds, (NTP_NANO undefined), the time is derived from the
+ * kernel time variable together with a variable representing the
+ * leftover nanoseconds at the last tick interrupt. In either case, the
+ * current nanosecond time is reckoned from these values plus an
+ * interpolated value derived by the clock routines in another
+ * architecture-specific module. The interpolation can use either a
+ * dedicated counter or a processor cycle counter (PCC) implemented in
+ * some architectures.
+ *
+ * Note that all routines must run at priority splclock or higher.
+ */
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The nanosecond clock discipline uses two variable types, time
+ * variables and frequency variables. Both types are represented as 64-
+ * bit fixed-point quantities with the decimal point between two 32-bit
+ * halves. On a 32-bit machine, each half is represented as a single
+ * word and mathematical operations are done using multiple-precision
+ * arithmetic. On a 64-bit machine, ordinary computer arithmetic is
+ * used.
+ *
+ * A time variable is a signed 64-bit fixed-point number in ns and
+ * fraction. It represents the remaining time offset to be amortized
+ * over succeeding tick interrupts. The maximum time offset is about
+ * 0.5 s and the resolution is about 2.3e-10 ns.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s| ns |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * A frequency variable is a signed 64-bit fixed-point number in ns/s
+ * and fraction. It represents the ns and fraction to be added to the
+ * kernel time variable at each second. The maximum frequency offset is
+ * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s s s s s s s s s s s| ns/s |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock.
+ */
+#define SHIFT_PLL 4 /* PLL loop gain (shift) */
+#define SHIFT_FLL 2 /* FLL loop gain (shift) */
+
+static int time_state = TIME_OK; /* clock state */
+static int time_status = STA_UNSYNC; /* clock status bits */
+static long time_tai; /* TAI offset (s) */
+static long time_monitor; /* last time offset scaled (ns) */
+static long time_constant; /* poll interval (shift) (s) */
+static long time_precision = 1; /* clock precision (ns) */
+static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
+static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
+static long time_reftime; /* time at last adjustment (s) */
+static long time_tick; /* nanoseconds per tick (ns) */
+static l_fp time_offset; /* time offset (ns) */
+static l_fp time_freq; /* frequency offset (ns/s) */
+static l_fp time_adj; /* tick adjust (ns/s) */
+
+static int64_t time_adjtime; /* correction from adjtime(2) (usec) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available and connected via a modem control lead. They establish
+ * the engineering parameters of the clock discipline loop when
+ * controlled by the PPS signal.
+ */
+#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
+#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
+#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
+#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
+#define PPS_VALID 120 /* PPS signal watchdog max (s) */
+#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
+#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
+
+static struct timespec pps_tf[3]; /* phase median filter */
+static l_fp pps_freq; /* scaled frequency offset (ns/s) */
+static long pps_fcount; /* frequency accumulator */
+static long pps_jitter; /* nominal jitter (ns) */
+static long pps_stabil; /* nominal stability (scaled ns/s) */
+static long pps_lastsec; /* time at last calibration (s) */
+static int pps_valid; /* signal watchdog counter */
+static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
+static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
+static int pps_intcnt; /* wander counter */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+#endif /* PPS_SYNC */
+/*
+ * End of phase/frequency-lock loop (PLL/FLL) definitions
+ */
+
+static void ntp_init(void);
+static void hardupdate(long offset);
+
+/*
+ * ntp_gettime() - NTP user application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note
+ * that the TAI offset is returned in the ntvtimeval.tai structure
+ * member.
+ */
+static int
+ntp_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct ntptimeval ntv; /* temporary structure */
+ struct timespec atv; /* nanosecond time */
+
+ nanotime(&atv);
+ ntv.time.tv_sec = atv.tv_sec;
+ ntv.time.tv_nsec = atv.tv_nsec;
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.tai = time_tai;
+ ntv.time_state = time_state;
+
+ /*
+ * Status word error decode. If any of these conditions occur,
+ * an error is returned, instead of the status word. Most
+ * applications will care only about the fact the system clock
+ * may not be trusted, not about the details.
+ *
+ * Hardware or software error
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+
+ /*
+ * PPS signal lost when either time or frequency synchronization
+ * requested
+ */
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+
+ /*
+ * PPS jitter exceeded when time synchronization requested
+ */
+ (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) ||
+
+ /*
+ * PPS wander exceeded or calibration error when frequency
+ * synchronization requested
+ */
+ (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)))
+ ntv.time_state = TIME_ERROR;
+ return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
+SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", "");
+
+#ifdef PPS_SYNC
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, "");
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, "");
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, "");
+
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
+#endif
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note
+ * that the timex.constant structure member has a dual purpose to set
+ * the time constant and to set the TAI offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_adjtime_args {
+ struct timex *tp;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
+{
+ struct timex ntv; /* temporary structure */
+ long freq; /* frequency (ns/s) */
+ int modes; /* mode bits from structure */
+ int s; /* caller priority */
+ int error;
+
+ error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
+ if (error)
+ return(error);
+
+ /*
+ * Update selected clock variables - only the superuser can
+ * change anything. Note that there is no error checking here on
+ * the assumption the superuser should know what it is doing.
+ * Note that either the time constant or TAI offset are loaded
+ * from the ntv.constant member, depending on the mode bits. If
+ * the STA_PLL bit in the status word is cleared, the state and
+ * status words are reset to the initial values at boot.
+ */
+ mtx_lock(&Giant);
+ modes = ntv.modes;
+ if (modes)
+ error = suser(td);
+ if (error)
+ goto done2;
+ s = splclock();
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+#ifdef PPS_SYNC
+ pps_shift = PPS_FAVG;
+#endif /* PPS_SYNC */
+ }
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST) {
+ if (ntv.constant < 0)
+ time_constant = 0;
+ else if (ntv.constant > MAXTC)
+ time_constant = MAXTC;
+ else
+ time_constant = ntv.constant;
+ }
+ if (modes & MOD_TAI) {
+ if (ntv.constant > 0) /* XXX zero & negative numbers ? */
+ time_tai = ntv.constant;
+ }
+#ifdef PPS_SYNC
+ if (modes & MOD_PPSMAX) {
+ if (ntv.shift < PPS_FAVG)
+ pps_shiftmax = PPS_FAVG;
+ else if (ntv.shift > PPS_FAVGMAX)
+ pps_shiftmax = PPS_FAVGMAX;
+ else
+ pps_shiftmax = ntv.shift;
+ }
+#endif /* PPS_SYNC */
+ if (modes & MOD_NANO)
+ time_status |= STA_NANO;
+ if (modes & MOD_MICRO)
+ time_status &= ~STA_NANO;
+ if (modes & MOD_CLKB)
+ time_status |= STA_CLK;
+ if (modes & MOD_CLKA)
+ time_status &= ~STA_CLK;
+ if (modes & MOD_OFFSET) {
+ if (time_status & STA_NANO)
+ hardupdate(ntv.offset);
+ else
+ hardupdate(ntv.offset * 1000);
+ }
+ if (modes & MOD_FREQUENCY) {
+ freq = (ntv.freq * 1000LL) >> 16;
+ if (freq > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (freq < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+ else
+ L_LINT(time_freq, freq);
+#ifdef PPS_SYNC
+ pps_freq = time_freq;
+#endif /* PPS_SYNC */
+ }
+
+ /*
+ * Retrieve all clock variables. Note that the TAI offset is
+ * returned only by ntp_gettime();
+ */
+ if (time_status & STA_NANO)
+ ntv.offset = L_GINT(time_offset);
+ else
+ ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
+ ntv.freq = L_GINT((time_freq / 1000LL) << 16);
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ if (time_status & STA_NANO)
+ ntv.precision = time_precision;
+ else
+ ntv.precision = time_precision / 1000;
+ ntv.tolerance = MAXFREQ * SCALE_PPM;
+#ifdef PPS_SYNC
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
+ if (time_status & STA_NANO)
+ ntv.jitter = pps_jitter;
+ else
+ ntv.jitter = pps_jitter / 1000;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+#endif /* PPS_SYNC */
+ splx(s);
+
+ error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
+ if (error)
+ goto done2;
+
+ /*
+ * Status word error decode. See comments in
+ * ntp_gettime() routine.
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+ (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) ||
+ (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR))) {
+ td->td_retval[0] = TIME_ERROR;
+ } else {
+ td->td_retval[0] = time_state;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * second_overflow() - called after ntp_tick_adjust()
+ *
+ * This routine is ordinarily called immediately following the above
+ * routine ntp_tick_adjust(). While these two routines are normally
+ * combined, they are separated here only for the purposes of
+ * simulation.
+ */
+void
+ntp_update_second(int64_t *adjustment, time_t *newsec)
+{
+ int tickrate;
+ l_fp ftemp; /* 32/64-bit temporary */
+
+ /*
+ * On rollover of the second both the nanosecond and microsecond
+ * clocks are updated and the state machine cranked as
+ * necessary. The phase adjustment to be used for the next
+ * second is calculated and the maximum error is increased by
+ * the tolerance.
+ */
+ time_maxerror += MAXFREQ / 1000;
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The nano_time() routine or
+ * external clock driver will insure that reported time
+ * is always monotonic.
+ */
+ switch (time_state) {
+
+ /*
+ * No warning.
+ */
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ /*
+ * Insert second 23:59:60 following second
+ * 23:59:59.
+ */
+ case TIME_INS:
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if ((*newsec) % 86400 == 0) {
+ (*newsec)--;
+ time_state = TIME_OOP;
+ }
+ break;
+
+ /*
+ * Delete second 23:59:59.
+ */
+ case TIME_DEL:
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if (((*newsec) + 1) % 86400 == 0) {
+ (*newsec)++;
+ time_tai--;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ /*
+ * Insert second in progress.
+ */
+ case TIME_OOP:
+ time_tai++;
+ time_state = TIME_WAIT;
+ break;
+
+ /*
+ * Wait for status bits to clear.
+ */
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the total time adjustment for the next second
+ * in ns. The offset is reduced by a factor depending on
+ * whether the PPS signal is operating. Note that the
+ * value is in effect scaled by the clock frequency,
+ * since the adjustment is added at each tick interrupt.
+ */
+ ftemp = time_offset;
+#ifdef PPS_SYNC
+ /* XXX even if PPS signal dies we should finish adjustment ? */
+ if (time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)
+ L_RSHIFT(ftemp, pps_shift);
+ else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#endif /* PPS_SYNC */
+ time_adj = ftemp;
+ L_SUB(time_offset, ftemp);
+ L_ADD(time_adj, time_freq);
+
+ /*
+ * Apply any correction from adjtime(2). If more than one second
+ * off, we slew at a rate of 5ms/s (5000 PPM); otherwise at 500us/s (500 PPM),
+ * until the final < 500 usecs are slewed out in the last second.
+ */
+ if (time_adjtime != 0) {
+ if (time_adjtime > 1000000)
+ tickrate = 5000;
+ else if (time_adjtime < -1000000)
+ tickrate = -5000;
+ else if (time_adjtime > 500)
+ tickrate = 500;
+ else if (time_adjtime < -500)
+ tickrate = -500;
+ else if (time_adjtime != 0)
+ tickrate = time_adjtime;
+ else
+ tickrate = 0; /* GCC sucks! */
+ time_adjtime -= tickrate;
+ L_LINT(ftemp, tickrate * 1000);
+ L_ADD(time_adj, ftemp);
+ }
+ *adjustment = time_adj;
+
+#ifdef PPS_SYNC
+ if (pps_valid > 0)
+ pps_valid--;
+ else
+ time_status &= ~STA_PPSSIGNAL;
+#endif /* PPS_SYNC */
+}
+
+/*
+ * ntp_init() - initialize variables and structures
+ *
+ * This routine must be called after the kernel variables hz and tick
+ * are set or changed and before the next tick interrupt. In this
+ * particular implementation, these values are assumed set elsewhere in
+ * the kernel. The design allows the clock frequency and tick interval
+ * to be changed while the system is running. So, this routine should
+ * probably be integrated with the code that does that.
+ */
+static void
+ntp_init()
+{
+
+ /*
+ * The following variable must be initialized any time the
+ * kernel variable hz is changed.
+ */
+ time_tick = NANOSECOND / hz;
+
+ /*
+ * The following variables are initialized only at startup. Only
+ * those structures not cleared by the compiler need to be
+ * initialized, and these only in the simulator. In the actual
+ * kernel, any nonzero values here will quickly evaporate.
+ */
+ L_CLR(time_offset);
+ L_CLR(time_freq);
+#ifdef PPS_SYNC
+ pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
+ pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
+ pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
+ pps_fcount = 0;
+ L_CLR(pps_freq);
+#endif /* PPS_SYNC */
+}
+
+SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, ntp_init, NULL)
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different than the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 256 s, operation should be in phase-lock mode,
+ * where the loop is disciplined to phase. For update intervals greater
+ * than 1024 s, operation should be in frequency-lock mode, where the
+ * loop is disciplined to frequency. Between 256 s and 1024 s, the mode
+ * is selected by the STA_MODE status bit.
+ */
+static void
+hardupdate(offset)
+ long offset; /* clock offset (ns) */
+{
+ long mtemp;
+ l_fp ftemp;
+
+ /*
+ * Select how the phase is to be controlled and from which
+ * source. If the PPS signal is present and enabled to
+ * discipline the time, the PPS offset is used; otherwise, the
+ * argument offset is used.
+ */
+ if (!(time_status & STA_PLL))
+ return;
+ if (!(time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)) {
+ if (offset > MAXPHASE)
+ time_monitor = MAXPHASE;
+ else if (offset < -MAXPHASE)
+ time_monitor = -MAXPHASE;
+ else
+ time_monitor = offset;
+ L_LINT(time_offset, time_monitor);
+ }
+
+ /*
+ * Select how the frequency is to be controlled and in which
+ * mode (PLL or FLL). If the PPS signal is present and enabled
+ * to discipline the frequency, the PPS frequency is used;
+ * otherwise, the argument offset is used to compute it.
+ */
+ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
+ time_reftime = time_second;
+ return;
+ }
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time_second;
+ mtemp = time_second - time_reftime;
+ L_LINT(ftemp, time_monitor);
+ L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
+ L_MPY(ftemp, mtemp);
+ L_ADD(time_freq, ftemp);
+ time_status &= ~STA_MODE;
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
+ MAXSEC)) {
+ L_LINT(ftemp, (time_monitor << 4) / mtemp);
+ L_RSHIFT(ftemp, SHIFT_FLL + 4);
+ L_ADD(time_freq, ftemp);
+ time_status |= STA_MODE;
+ }
+ time_reftime = time_second;
+ if (L_GINT(time_freq) > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (L_GINT(time_freq) < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+}
+
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. There are two independent
+ * first-order feedback loops, one for the phase, the other for the
+ * frequency. The phase loop measures and grooms the PPS phase offset
+ * and leaves it in a handy spot for the seconds overflow routine. The
+ * frequency loop averages successive PPS phase differences and
+ * calculates the PPS frequency offset, which is also processed by the
+ * seconds overflow routine. The code requires the caller to capture the
+ * time and architecture-dependent hardware counter values in
+ * nanoseconds at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, except for the actual time and frequency variables, which
+ * are determined by this routine and updated atomically.
+ */
+void
+hardpps(tsp, nsec)
+ struct timespec *tsp; /* time at PPS */
+ long nsec; /* hardware counter at PPS */
+{
+ long u_sec, u_nsec, v_nsec; /* temps */
+ l_fp ftemp;
+
+ /*
+ * The signal is first processed by a range gate and frequency
+ * discriminator. The range gate rejects noise spikes outside
+ * the range +-500 us. The frequency discriminator rejects input
+ * signals with apparent frequency outside the range 1 +-500
+ * PPM. If two hits occur in the same second, we ignore the
+ * later hit; if not and a hit occurs outside the range gate,
+ * keep the later hit for later comparison, but do not process
+ * it.
+ */
+ time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
+ time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = PPS_VALID;
+ u_sec = tsp->tv_sec;
+ u_nsec = tsp->tv_nsec;
+ if (u_nsec >= (NANOSECOND >> 1)) {
+ u_nsec -= NANOSECOND;
+ u_sec++;
+ }
+ v_nsec = u_nsec - pps_tf[0].tv_nsec;
+ if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
+ MAXFREQ)
+ return;
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0].tv_sec = u_sec;
+ pps_tf[0].tv_nsec = u_nsec;
+
+ /*
+ * Compute the difference between the current and previous
+ * counter values. If the difference exceeds 0.5 s, assume it
+ * has wrapped around, so correct 1.0 s. If the result exceeds
+ * the tick interval, the sample point has crossed a tick
+ * boundary during the last second, so correct the tick. Very
+ * intricate.
+ */
+ u_nsec = nsec;
+ if (u_nsec > (NANOSECOND >> 1))
+ u_nsec -= NANOSECOND;
+ else if (u_nsec < -(NANOSECOND >> 1))
+ u_nsec += NANOSECOND;
+ pps_fcount += u_nsec;
+ if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
+ return;
+ time_status &= ~STA_PPSJITTER;
+
+ /*
+ * A three-stage median filter is used to help denoise the PPS
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
+ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
+ if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
+ } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
+ }
+ } else {
+ if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
+ } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
+ }
+ }
+
+ /*
+ * Nominal jitter is due to PPS signal noise and interrupt
+ * latency. If it exceeds the popcorn threshold, the sample is
+ * discarded; otherwise, if so enabled, the time offset is
+ * updated. We can tolerate a modest loss of data here without
+ * much degrading time accuracy.
+ */
+ if (u_nsec > (pps_jitter << PPS_POPCORN)) {
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ time_monitor = -v_nsec;
+ L_LINT(time_offset, time_monitor);
+ }
+ pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
+ u_sec = pps_tf[0].tv_sec - pps_lastsec;
+ if (u_sec < (1 << pps_shift))
+ return;
+
+ /*
+ * At the end of the calibration interval the difference between
+ * the first and last counter values becomes the scaled
+ * frequency. It will later be divided by the length of the
+ * interval to determine the frequency update. If the frequency
+ * exceeds a sanity threshold, or if the actual calibration
+ * interval is not equal to the expected length, the data are
+ * discarded. We can tolerate a modest loss of data here without
+ * much degrading frequency accuracy.
+ */
+ pps_calcnt++;
+ v_nsec = -pps_fcount;
+ pps_lastsec = pps_tf[0].tv_sec;
+ pps_fcount = 0;
+ u_nsec = MAXFREQ << pps_shift;
+ if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
+ pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ return;
+ }
+
+ /*
+ * Here the raw frequency offset and wander (stability) are
+ * calculated. If the wander is less than the wander threshold
+ * for four consecutive averaging intervals, the interval is
+ * doubled; if it is greater than the threshold for four
+ * consecutive intervals, the interval is halved. The scaled
+ * frequency offset is converted to frequency offset. The
+ * stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring.
+ */
+ L_LINT(ftemp, v_nsec);
+ L_RSHIFT(ftemp, pps_shift);
+ L_SUB(ftemp, pps_freq);
+ u_nsec = L_GINT(ftemp);
+ if (u_nsec > PPS_MAXWANDER) {
+ L_LINT(ftemp, PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else if (u_nsec < -PPS_MAXWANDER) {
+ L_LINT(ftemp, -PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else {
+ pps_intcnt++;
+ }
+ if (pps_intcnt >= 4) {
+ pps_intcnt = 4;
+ if (pps_shift < pps_shiftmax) {
+ pps_shift++;
+ pps_intcnt = 0;
+ }
+ } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
+ pps_intcnt = -4;
+ if (pps_shift > PPS_FAVG) {
+ pps_shift--;
+ pps_intcnt = 0;
+ }
+ }
+ if (u_nsec < 0)
+ u_nsec = -u_nsec;
+ pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
+
+ /*
+ * The PPS frequency is recalculated and clamped to the maximum
+ * MAXFREQ. If enabled, the system clock frequency is updated as
+ * well.
+ */
+ L_ADD(pps_freq, ftemp);
+ u_nsec = L_GINT(pps_freq);
+ if (u_nsec > MAXFREQ)
+ L_LINT(pps_freq, MAXFREQ);
+ else if (u_nsec < -MAXFREQ)
+ L_LINT(pps_freq, -MAXFREQ);
+ if (time_status & STA_PPSFREQ)
+ time_freq = pps_freq;
+}
+#endif /* PPS_SYNC */
+
+#ifndef _SYS_SYSPROTO_H_
+struct adjtime_args {
+ struct timeval *delta;
+ struct timeval *olddelta;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+adjtime(struct thread *td, struct adjtime_args *uap)
+{
+ struct timeval atv;
+ int error;
+
+ if ((error = suser(td)))
+ return (error);
+
+ mtx_lock(&Giant);
+ if (uap->olddelta) {
+ atv.tv_sec = time_adjtime / 1000000;
+ atv.tv_usec = time_adjtime % 1000000;
+ if (atv.tv_usec < 0) {
+ atv.tv_usec += 1000000;
+ atv.tv_sec--;
+ }
+ error = copyout(&atv, uap->olddelta, sizeof(atv));
+ if (error)
+ goto done2;
+ }
+ if (uap->delta) {
+ error = copyin(uap->delta, &atv, sizeof(atv));
+ if (error)
+ goto done2;
+ time_adjtime = (int64_t)atv.tv_sec * 1000000 + atv.tv_usec;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
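
The l_fp macros defined near the top of the file above keep integer nanoseconds in the upper 32 bits and a binary fraction in the lower 32 bits, so L_LINT() and L_GINT() round-trip an integer value and a right shift scales it down. A small standalone sketch of that arithmetic, written as ordinary user-level C rather than kernel code (the example values are illustrative only):

	#include <stdio.h>

	typedef long long l_fp;
	#define L_LINT(v, a)	((v) = (long long)(a) << 32)
	#define L_GINT(v)	((v) < 0 ? -(-(v) >> 32) : (v) >> 32)

	int
	main(void)
	{
		l_fp off;

		L_LINT(off, 250000);		/* a 250 us offset, expressed in ns */
		off >>= 4;			/* same scaling as L_RSHIFT(off, SHIFT_PLL) for off >= 0 */
		printf("%lld ns\n", (long long)L_GINT(off));	/* prints 15625 */
		return (0);
	}
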
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
new file mode 100644
index 0000000..11f3d0c
--- /dev/null
+++ b/sys/kern/kern_physio.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static void
+physwakeup(struct buf *bp)
+{
+ wakeup((caddr_t) bp);
+}
+
+int
+physio(dev_t dev, struct uio *uio, int ioflag)
+{
+ int i;
+ int error;
+ int spl;
+ caddr_t sa;
+ u_int iolen;
+ struct buf *bp;
+
+ /* Keep the process UPAGES from being swapped. XXX: why ? */
+ PHOLD(curproc);
+
+ bp = getpbuf(NULL);
+ sa = bp->b_data;
+ error = bp->b_error = 0;
+
+ /* XXX: sanity check */
+ if(dev->si_iosize_max < PAGE_SIZE) {
+ printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n",
+ devtoname(dev), dev->si_iosize_max);
+ dev->si_iosize_max = DFLTPHYS;
+ }
+
+ for (i = 0; i < uio->uio_iovcnt; i++) {
+ while (uio->uio_iov[i].iov_len) {
+ bp->b_flags = B_PHYS;
+ if (uio->uio_rw == UIO_READ)
+ bp->b_iocmd = BIO_READ;
+ else
+ bp->b_iocmd = BIO_WRITE;
+ bp->b_dev = dev;
+ bp->b_iodone = physwakeup;
+ bp->b_data = uio->uio_iov[i].iov_base;
+ bp->b_bcount = uio->uio_iov[i].iov_len;
+ bp->b_offset = uio->uio_offset;
+ bp->b_saveaddr = sa;
+
+ /* Don't exceed the driver's iosize limit */
+ if (bp->b_bcount > dev->si_iosize_max)
+ bp->b_bcount = dev->si_iosize_max;
+
+ /*
+ * Make sure the pbuf can map the request
+ * XXX: The pbuf has kvasize = MAXPHYS so a request
+ * XXX: larger than MAXPHYS - PAGE_SIZE must be
+ * XXX: page aligned or it will be fragmented.
+ */
+ iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK;
+ if ((bp->b_bcount + iolen) > bp->b_kvasize) {
+ bp->b_bcount = bp->b_kvasize;
+ if (iolen != 0)
+ bp->b_bcount -= PAGE_SIZE;
+ }
+ bp->b_bufsize = bp->b_bcount;
+
+ bp->b_blkno = btodb(bp->b_offset);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ if (!useracc(bp->b_data, bp->b_bufsize,
+ bp->b_iocmd == BIO_READ ?
+ VM_PROT_WRITE : VM_PROT_READ)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ vmapbuf(bp);
+ }
+
+ DEV_STRATEGY(bp, 0);
+ spl = splbio();
+ while ((bp->b_flags & B_DONE) == 0)
+ tsleep((caddr_t)bp, PRIBIO, "physstr", 0);
+ splx(spl);
+
+ if (uio->uio_segflg == UIO_USERSPACE)
+ vunmapbuf(bp);
+ iolen = bp->b_bcount - bp->b_resid;
+ if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR))
+ goto doerror; /* EOF */
+ uio->uio_iov[i].iov_len -= iolen;
+ uio->uio_iov[i].iov_base += iolen;
+ uio->uio_resid -= iolen;
+ uio->uio_offset += iolen;
+ if( bp->b_ioflags & BIO_ERROR) {
+ error = bp->b_error;
+ goto doerror;
+ }
+ }
+ }
+doerror:
+ relpbuf(bp, NULL);
+ PRELE(curproc);
+ return (error);
+}
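
The XXX comment about the pbuf mapping window above amounts to a simple clamp: if the request plus its in-page offset does not fit in the pbuf's kvasize bytes, the count is cut back to kvasize, less one page when the data is not page aligned. A standalone restatement of that arithmetic (the function name and types are illustrative, not part of the file above):

	#include <stddef.h>

	/* Mirrors the b_bcount adjustment in physio() above. */
	static size_t
	clamp_to_pbuf(size_t bcount, size_t kvasize, size_t page_offset,
	    size_t page_size)
	{
		if (bcount + page_offset > kvasize) {
			bcount = kvasize;
			if (page_offset != 0)
				bcount -= page_size;
		}
		return (bcount);
	}
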
diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c
new file mode 100644
index 0000000..a197bc0
--- /dev/null
+++ b/sys/kern/kern_poll.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 2001-2002 Luigi Rizzo
+ *
+ * Supported by: the Xorp Project (www.xorp.org)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h> /* needed by net/if.h */
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* for IFF_* flags */
+#include <net/netisr.h> /* for NETISR_POLL */
+
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/kthread.h>
+
+#ifdef SMP
+#ifndef COMPILING_LINT
+#error DEVICE_POLLING is not compatible with SMP
+#endif
+#endif
+
+static void netisr_poll(void); /* the two netisr handlers */
+void netisr_pollmore(void);
+
+void init_device_poll(void); /* init routine */
+void hardclock_device_poll(void); /* hook from hardclock */
+void ether_poll(int); /* polling while in trap */
+
+/*
+ * Polling support for [network] device drivers.
+ *
+ * Drivers which support this feature try to register with the
+ * polling code.
+ *
+ * If registration is successful, the driver must disable interrupts,
+ * and further I/O is performed through the handler, which is invoked
+ * (at least once per clock tick) with 3 arguments: the "arg" passed at
+ * register time (a struct ifnet pointer), a command, and a "count" limit.
+ *
+ * The command can be one of the following:
+ * POLL_ONLY: quick move of "count" packets from input/output queues.
+ * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
+ * other more expensive operations. This command is issued periodically
+ * but less frequently than POLL_ONLY.
+ * POLL_DEREGISTER: deregister and return to interrupt mode.
+ *
+ * The first two commands are only issued if the interface is marked as
+ * 'IFF_UP and IFF_RUNNING', the last one only if IFF_RUNNING is set.
+ *
+ * The count limit specifies how much work the handler can do during the
+ * call -- typically this is the number of packets to be received, or
+ * transmitted, etc. (drivers are free to interpret this number, as long
+ * as the max time spent in the function grows roughly linearly with the
+ * count).
+ *
+ * Deregistration can be requested by the driver itself (typically in the
+ * *_stop() routine), or by the polling code, which does it by invoking the
+ * handler with the POLL_DEREGISTER command.
+ *
+ * Polling can be globally enabled or disabled with the sysctl variable
+ * kern.polling.enable (default is 0, disabled)
+ *
+ * A second variable controls the sharing of CPU between polling/kernel
+ * network processing, and other activities (typically userlevel tasks):
+ * kern.polling.user_frac (between 0 and 100, default 50) sets the share
+ * of CPU allocated to user tasks. CPU is allocated proportionally to the
+ * shares, by dynamically adjusting the "count" (poll_burst).
+ *
+ * Other parameters should be left at their default values.
+ * The following constraints hold:
+ *
+ * 1 <= poll_each_burst <= poll_burst <= poll_burst_max
+ * 0 <= poll_in_trap <= poll_each_burst
+ * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
+ */
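+
+/*
+ * Illustrative sketch (not part of this file): a hypothetical driver "xx"
+ * could implement the handler described above roughly as follows.  The
+ * names xx_softc, xx_rxeof(), xx_txeof(), xx_check_status() and
+ * xx_intr_enable() are placeholders, not existing interfaces.
+ *
+ *	static void
+ *	xx_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
+ *	{
+ *		struct xx_softc *sc = ifp->if_softc;
+ *
+ *		if (cmd == POLL_DEREGISTER) {
+ *			xx_intr_enable(sc);
+ *			return;
+ *		}
+ *		xx_rxeof(sc, count);
+ *		xx_txeof(sc);
+ *		if (cmd == POLL_AND_CHECK_STATUS)
+ *			xx_check_status(sc);
+ *	}
+ */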
+
+#define MIN_POLL_BURST_MAX 10
+#define MAX_POLL_BURST_MAX 1000
+
+SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
+ "Device polling parameters");
+
+static u_int32_t poll_burst = 5;
+SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RW,
+ &poll_burst, 0, "Current polling burst size");
+
+static u_int32_t poll_each_burst = 5;
+SYSCTL_UINT(_kern_polling, OID_AUTO, each_burst, CTLFLAG_RW,
+ &poll_each_burst, 0, "Max size of each burst");
+
+static u_int32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */
+SYSCTL_UINT(_kern_polling, OID_AUTO, burst_max, CTLFLAG_RW,
+ &poll_burst_max, 0, "Max Polling burst size");
+
+static u_int32_t poll_in_idle_loop=1; /* do we poll in idle loop ? */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW,
+ &poll_in_idle_loop, 0, "Enable device polling in idle loop");
+
+u_int32_t poll_in_trap; /* used in trap.c */
+SYSCTL_UINT(_kern_polling, OID_AUTO, poll_in_trap, CTLFLAG_RW,
+ &poll_in_trap, 0, "Poll burst size during a trap");
+
+static u_int32_t user_frac = 50;
+SYSCTL_UINT(_kern_polling, OID_AUTO, user_frac, CTLFLAG_RW,
+ &user_frac, 0, "Desired user fraction of cpu time");
+
+static u_int32_t reg_frac = 20 ;
+SYSCTL_UINT(_kern_polling, OID_AUTO, reg_frac, CTLFLAG_RW,
+ &reg_frac, 0, "Every this many cycles poll register");
+
+static u_int32_t short_ticks;
+SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RW,
+ &short_ticks, 0, "Hardclock ticks shorter than they should be");
+
+static u_int32_t lost_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RW,
+ &lost_polls, 0, "How many times we would have lost a poll tick");
+
+static u_int32_t pending_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RW,
+ &pending_polls, 0, "Do we need to poll again");
+
+static int residual_burst = 0;
+SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RW,
+ &residual_burst, 0, "# of residual cycles in burst");
+
+static u_int32_t poll_handlers; /* next free entry in pr[]. */
+SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD,
+ &poll_handlers, 0, "Number of registered poll handlers");
+
+static int polling = 0; /* global polling enable */
+SYSCTL_UINT(_kern_polling, OID_AUTO, enable, CTLFLAG_RW,
+ &polling, 0, "Polling enabled");
+
+static u_int32_t phase;
+SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RW,
+ &phase, 0, "Polling phase");
+
+static u_int32_t suspect;
+SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RW,
+ &suspect, 0, "suspect event");
+
+static u_int32_t stalled;
+SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RW,
+ &stalled, 0, "potential stalls");
+
+static u_int32_t idlepoll_sleeping; /* idlepoll is sleeping */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD,
+ &idlepoll_sleeping, 0, "idlepoll is sleeping");
+
+
+#define POLL_LIST_LEN 128
+struct pollrec {
+ poll_handler_t *handler;
+ struct ifnet *ifp;
+};
+
+static struct pollrec pr[POLL_LIST_LEN];
+
+/*
+ * Register the relevant netisr. Called from kern_clock.c.
+ */
+void
+init_device_poll(void)
+{
+ register_netisr(NETISR_POLL, netisr_poll);
+}
+
+/*
+ * Hook from hardclock. Tries to schedule a netisr, but keeps track
+ * of lost ticks due to the previous handler taking too long.
+ * Normally, this should not happen, because the polling handler should
+ * run for a short time. However, in some cases (e.g. when there are
+ * changes in link status etc.) the drivers take a very long time
+ * (even in the order of milliseconds) to reset and reconfigure the
+ * device, causing apparent lost polls.
+ *
+ * The first part of the code is just for debugging purposes, and tries
+ * to count how often hardclock ticks are shorter than they should be,
+ * meaning either stray interrupts or delayed events.
+ */
+void
+hardclock_device_poll(void)
+{
+ static struct timeval prev_t, t;
+ int delta;
+
+ if (poll_handlers == 0)
+ return;
+
+ microuptime(&t);
+ delta = (t.tv_usec - prev_t.tv_usec) +
+ (t.tv_sec - prev_t.tv_sec)*1000000;
+ if (delta * hz < 500000)
+ short_ticks++;
+ else
+ prev_t = t;
+
+ if (pending_polls > 100) {
+ /*
+ * Too much, assume it has stalled (not always true
+ * see comment above).
+ */
+ stalled++;
+ pending_polls = 0;
+ phase = 0;
+ }
+
+ if (phase <= 2) {
+ if (phase != 0)
+ suspect++;
+ phase = 1;
+ schednetisr(NETISR_POLL);
+ phase = 2;
+ }
+ if (pending_polls++ > 0)
+ lost_polls++;
+}
+
+/*
+ * ether_poll is called from the idle loop or from the trap handler.
+ */
+void
+ether_poll(int count)
+{
+ int i;
+
+ mtx_lock(&Giant);
+
+ if (count > poll_each_burst)
+ count = poll_each_burst;
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].handler && (IFF_UP|IFF_RUNNING) ==
+ (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) )
+ pr[i].handler(pr[i].ifp, 0, count); /* quick check */
+ mtx_unlock(&Giant);
+}
+
+/*
+ * netisr_pollmore is called after other netisr's, possibly scheduling
+ * another NETISR_POLL call, or adapting the burst size for the next cycle.
+ *
+ * It is very bad to fetch large bursts of packets from a single card at once,
+ * because the burst could take a long time to be completely processed, or
+ * could saturate the intermediate queue (ipintrq or similar) leading to
+ * losses or unfairness. To reduce the problem, and also to account better for
+ * time spent in network-related processing, we split the burst in smaller
+ * chunks of fixed size, giving control to the other netisr's between chunks.
+ * This helps in improving the fairness, reducing livelock (because we
+ * emulate more closely the "process to completion" that we have with
+ * fastforwarding) and accounting for the work performed in low level
+ * handling and forwarding.
+ */
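+
+/*
+ * Worked example with the defaults above (numbers illustrative only): once
+ * poll_burst has grown to its maximum of 150 and poll_each_burst is 5,
+ * draining one tick's burst takes up to 30 netisr_poll() passes, each
+ * handing at most 5 packets per interface to the handlers and letting the
+ * other netisrs run in between.  netisr_pollmore() then compares the time
+ * spent (kern_load, as a percentage of a tick) against 100 - user_frac and
+ * grows or shrinks poll_burst by one for the next tick.
+ */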
+
+static struct timeval poll_start_t;
+
+void
+netisr_pollmore()
+{
+ struct timeval t;
+ int kern_load;
+ /* XXX run at splhigh() or equivalent */
+
+ phase = 5;
+ if (residual_burst > 0) {
+ schednetisr(NETISR_POLL);
+ /* will run immediately on return, followed by netisrs */
+ return ;
+ }
+ /* here we can account time spent in netisr's in this tick */
+ microuptime(&t);
+ kern_load = (t.tv_usec - poll_start_t.tv_usec) +
+ (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */
+ kern_load = (kern_load * hz) / 10000; /* 0..100 */
+ if (kern_load > (100 - user_frac)) { /* try decrease ticks */
+ if (poll_burst > 1)
+ poll_burst--;
+ } else {
+ if (poll_burst < poll_burst_max)
+ poll_burst++;
+ }
+
+ pending_polls--;
+ if (pending_polls == 0) /* we are done */
+ phase = 0;
+ else {
+ /*
+ * Last cycle was long and caused us to miss one or more
+ * hardclock ticks. Restart processing again, but slightly
+		 * reduce the burst size to prevent this from happening again.
+ */
+ poll_burst -= (poll_burst / 8);
+ if (poll_burst < 1)
+ poll_burst = 1;
+ schednetisr(NETISR_POLL);
+ phase = 6;
+ }
+}
+
+/*
+ * netisr_poll is scheduled by schednetisr when appropriate, typically once
+ * per tick. It is called at splnet() so first thing to do is to upgrade to
+ * splimp(), and call all registered handlers.
+ */
+static void
+netisr_poll(void)
+{
+ static int reg_frac_count;
+ int i, cycles;
+ enum poll_cmd arg = POLL_ONLY;
+ mtx_lock(&Giant);
+
+ phase = 3;
+ if (residual_burst == 0) { /* first call in this tick */
+ microuptime(&poll_start_t);
+ /*
+		 * Check that parameters are consistent with runtime
+		 * variables. Some of these tests could be done at sysctl
+		 * time, but the savings would be very limited because we
+		 * still have to check against reg_frac_count and
+		 * poll_each_burst. So, instead of writing separate sysctl
+		 * handlers, we do it all here.
+ */
+
+ if (reg_frac > hz)
+ reg_frac = hz;
+ else if (reg_frac < 1)
+ reg_frac = 1;
+ if (reg_frac_count > reg_frac)
+ reg_frac_count = reg_frac - 1;
+ if (reg_frac_count-- == 0) {
+ arg = POLL_AND_CHECK_STATUS;
+ reg_frac_count = reg_frac - 1;
+ }
+ if (poll_burst_max < MIN_POLL_BURST_MAX)
+ poll_burst_max = MIN_POLL_BURST_MAX;
+ else if (poll_burst_max > MAX_POLL_BURST_MAX)
+ poll_burst_max = MAX_POLL_BURST_MAX;
+
+ if (poll_each_burst < 1)
+ poll_each_burst = 1;
+ else if (poll_each_burst > poll_burst_max)
+ poll_each_burst = poll_burst_max;
+
+ residual_burst = poll_burst;
+ }
+ cycles = (residual_burst < poll_each_burst) ?
+ residual_burst : poll_each_burst;
+ residual_burst -= cycles;
+
+ if (polling) {
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].handler && (IFF_UP|IFF_RUNNING) ==
+ (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) )
+ pr[i].handler(pr[i].ifp, arg, cycles);
+ } else { /* unregister */
+ for (i = 0 ; i < poll_handlers ; i++) {
+ if (pr[i].handler &&
+ pr[i].ifp->if_flags & IFF_RUNNING) {
+ pr[i].ifp->if_ipending &= ~IFF_POLLING;
+ pr[i].handler(pr[i].ifp, POLL_DEREGISTER, 1);
+ }
+ pr[i].handler=NULL;
+ }
+ residual_burst = 0;
+ poll_handlers = 0;
+ }
+ /* on -stable, schednetisr(NETISR_POLLMORE); */
+ phase = 4;
+ mtx_unlock(&Giant);
+}
+
+/*
+ * Try to register a routine for polling. Returns 1 if successful
+ * (and polling should be enabled), 0 otherwise.
+ * A device is not supposed to register itself multiple times.
+ *
+ * This is called from within the *_intr() functions, so we do not need
+ * further locking.
+ */
+int
+ether_poll_register(poll_handler_t *h, struct ifnet *ifp)
+{
+ int s;
+
+ if (polling == 0) /* polling disabled, cannot register */
+ return 0;
+ if (h == NULL || ifp == NULL) /* bad arguments */
+ return 0;
+ if ( !(ifp->if_flags & IFF_UP) ) /* must be up */
+ return 0;
+ if (ifp->if_ipending & IFF_POLLING) /* already polling */
+ return 0;
+
+ s = splhigh();
+ if (poll_handlers >= POLL_LIST_LEN) {
+ /*
+ * List full, cannot register more entries.
+ * This should never happen; if it does, it is probably a
+ * broken driver trying to register multiple times. Checking
+ * this at runtime is expensive, and won't solve the problem
+		 * anyway, so just report a few times and then give up.
+ */
+ static int verbose = 10 ;
+ splx(s);
+ if (verbose >0) {
+ printf("poll handlers list full, "
+ "maybe a broken driver ?\n");
+ verbose--;
+ }
+ return 0; /* no polling for you */
+ }
+
+ pr[poll_handlers].handler = h;
+ pr[poll_handlers].ifp = ifp;
+ poll_handlers++;
+ ifp->if_ipending |= IFF_POLLING;
+ splx(s);
+ if (idlepoll_sleeping)
+ wakeup(&idlepoll_sleeping);
+ return 1; /* polling enabled in next call */
+}
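+
+/*
+ * Typical use, sketched with placeholder names (xx_intr, xx_poll,
+ * xx_intr_disable and the softc layout are hypothetical): a driver's
+ * interrupt routine tries to switch to polling mode on the fly.
+ *
+ *	static void
+ *	xx_intr(void *arg)
+ *	{
+ *		struct xx_softc *sc = arg;
+ *		struct ifnet *ifp = &sc->arpcom.ac_if;
+ *
+ *		if (ether_poll_register(xx_poll, ifp)) {
+ *			xx_intr_disable(sc);
+ *			return;
+ *		}
+ *		... normal interrupt handling ...
+ *	}
+ *
+ * Deregistration normally happens from the driver's *_stop() routine via
+ * ether_poll_deregister() (below), or is forced by netisr_poll() issuing
+ * POLL_DEREGISTER when polling is globally disabled.
+ */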
+
+/*
+ * Remove interface from the polling list. Normally called by *_stop().
+ * It is not an error to call it with IFF_POLLING clear; such calls are
+ * rare enough that paying for one extra function call here is preferable
+ * to adding the test to every driver.
+ */
+int
+ether_poll_deregister(struct ifnet *ifp)
+{
+ int i;
+
+ mtx_lock(&Giant);
+ if ( !ifp || !(ifp->if_ipending & IFF_POLLING) ) {
+ mtx_unlock(&Giant);
+ return 0;
+ }
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].ifp == ifp) /* found it */
+ break;
+ ifp->if_ipending &= ~IFF_POLLING; /* found or not... */
+ if (i == poll_handlers) {
+ mtx_unlock(&Giant);
+ printf("ether_poll_deregister: ifp not found!!!\n");
+ return 0;
+ }
+ poll_handlers--;
+ if (i < poll_handlers) { /* Last entry replaces this one. */
+ pr[i].handler = pr[poll_handlers].handler;
+ pr[i].ifp = pr[poll_handlers].ifp;
+ }
+ mtx_unlock(&Giant);
+ return 1;
+}
+
+static void
+poll_idle(void)
+{
+ struct thread *td = curthread;
+ struct rtprio rtp;
+ int pri;
+
+ rtp.prio = RTP_PRIO_MAX; /* lowest priority */
+ rtp.type = RTP_PRIO_IDLE;
+ mtx_lock_spin(&sched_lock);
+ rtp_to_pri(&rtp, td->td_ksegrp);
+ pri = td->td_priority;
+ mtx_unlock_spin(&sched_lock);
+
+ for (;;) {
+ if (poll_in_idle_loop && poll_handlers > 0) {
+ idlepoll_sleeping = 0;
+ mtx_lock(&Giant);
+ ether_poll(poll_each_burst);
+ mtx_unlock(&Giant);
+ mtx_assert(&Giant, MA_NOTOWNED);
+ mtx_lock_spin(&sched_lock);
+ setrunqueue(td);
+ td->td_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ } else {
+ idlepoll_sleeping = 1;
+ tsleep(&idlepoll_sleeping, pri, "pollid", hz * 3);
+ }
+ }
+}
+
+static struct proc *idlepoll;
+static struct kproc_desc idlepoll_kp = {
+ "idlepoll",
+ poll_idle,
+ &idlepoll
+};
+SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp)
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
new file mode 100644
index 0000000..a5378d9
--- /dev/null
+++ b/sys/kern/kern_proc.c
@@ -0,0 +1,1072 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/filedesc.h>
+#include <sys/tty.h>
+#include <sys/signalvar.h>
+#include <sys/sx.h>
+#include <sys/user.h>
+#include <sys/jail.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <machine/critical.h>
+
+MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
+MALLOC_DEFINE(M_SESSION, "session", "session header");
+static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
+MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
+
+static struct proc *dopfind(register pid_t);
+
+static void doenterpgrp(struct proc *, struct pgrp *);
+
+static void pgdelete(struct pgrp *);
+
+static void orphanpg(struct pgrp *pg);
+
+/*
+ * Other process lists
+ */
+struct pidhashhead *pidhashtbl;
+u_long pidhash;
+struct pgrphashhead *pgrphashtbl;
+u_long pgrphash;
+struct proclist allproc;
+struct proclist zombproc;
+struct sx allproc_lock;
+struct sx proctree_lock;
+struct mtx pargs_ref_lock;
+uma_zone_t proc_zone;
+uma_zone_t ithread_zone;
+
+CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
+
+/*
+ * Initialize global process hashing structures.
+ */
+void
+procinit()
+{
+
+ sx_init(&allproc_lock, "allproc");
+ sx_init(&proctree_lock, "proctree");
+ mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF);
+ LIST_INIT(&allproc);
+ LIST_INIT(&zombproc);
+ pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
+ pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
+ proc_zone = uma_zcreate("PROC", sizeof (struct proc), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uihashinit();
+}
+
+/*
+ * Note that we do not link to the proc's ucred here.
+ * The thread is linked as if running, but with no KSE assigned.
+ */
+static void
+thread_link(struct thread *td, struct ksegrp *kg)
+{
+ struct proc *p = kg->kg_proc;
+
+ td->td_proc = p;
+ td->td_ksegrp = kg;
+ td->td_last_kse = &p->p_kse;
+
+ TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
+ TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
+ td->td_critnest = 0;
+ td->td_kse = NULL;
+ cpu_thread_link(td);
+}
+
+/*
+ * KSE is linked onto the idle queue.
+ */
+static void
+kse_link(struct kse *ke, struct ksegrp *kg)
+{
+ struct proc *p = kg->kg_proc;
+
+ TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
+ kg->kg_kses++;
+ TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist);
+ ke->ke_proc = p;
+ ke->ke_ksegrp = kg;
+ ke->ke_thread = NULL;
+ ke->ke_oncpu = NOCPU;
+}
+
+static void
+ksegrp_link(struct ksegrp *kg, struct proc *p)
+{
+
+ TAILQ_INIT(&kg->kg_threads);
+ TAILQ_INIT(&kg->kg_runq); /* links with td_runq */
+ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */
+ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */
+ TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */
+ kg->kg_proc = p;
+/* the following counters are in the -zero- section and may not need clearing */
+ kg->kg_runnable = 0;
+ kg->kg_kses = 0;
+ kg->kg_runq_kses = 0; /* XXXKSE change name */
+/* link it in now that it's consistent */
+ TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
+}
+
+/*
+ * For a newly created process,
+ * link up the structure and its initial threads etc.
+ */
+void
+proc_linkup(struct proc *p, struct ksegrp *kg,
+ struct kse *ke, struct thread *td)
+{
+
+ TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+
+ ksegrp_link(kg, p);
+ kse_link(ke, kg);
+ thread_link(td, kg);
+ /* link them together for 1:1 */
+ td->td_kse = ke;
+ ke->ke_thread = td;
+}
+
+/* temporary version is ultra simple while we are in 1:1 mode */
+struct thread *
+thread_get(struct proc *p)
+{
+ struct thread *td = &p->p_xxthread;
+
+ return (td);
+}
+
+
+/*********************
+* STUB KSE syscalls
+*********************/
+
+/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */
+int
+thread_wakeup(struct thread *td, struct thread_wakeup_args *uap)
+{
+
+ return(ENOSYS);
+}
+
+int
+kse_exit(struct thread *td, struct kse_exit_args *uap)
+{
+
+ return(ENOSYS);
+}
+
+int
+kse_yield(struct thread *td, struct kse_yield_args *uap)
+{
+
+ return(ENOSYS);
+}
+
+int
+kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
+{
+
+ return(ENOSYS);
+}
+
+
+int
+kse_new(struct thread *td, struct kse_new_args *uap)
+/* struct kse_new_args {
+ struct kse_mailbox *mbx;
+ int new_grp_flag;
+}; */
+{
+
+ return (ENOSYS);
+}
+
+/*
+ * Is p an inferior of the current process?
+ */
+int
+inferior(p)
+ register struct proc *p;
+{
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (; p != curproc; p = p->p_pptr)
+ if (p->p_pid == 0)
+ return (0);
+ return (1);
+}
+
+/*
+ * Locate a process by number
+ */
+struct proc *
+pfind(pid)
+ register pid_t pid;
+{
+ register struct proc *p;
+
+ sx_slock(&allproc_lock);
+ p = dopfind(pid);
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+static struct proc *
+dopfind(pid)
+ register pid_t pid;
+{
+ register struct proc *p;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+
+ LIST_FOREACH(p, PIDHASH(pid), p_hash)
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ break;
+ }
+ return (p);
+}
+
+/*
+ * Locate a process group by number.
+ * The caller must hold proctree_lock.
+ */
+struct pgrp *
+pgfind(pgid)
+ register pid_t pgid;
+{
+ register struct pgrp *pgrp;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+
+ LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
+ if (pgrp->pg_id == pgid) {
+ PGRP_LOCK(pgrp);
+ return (pgrp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Create a new process group.
+ * pgid must be equal to the pid of p.
+ * Begin a new session if required.
+ */
+int
+enterpgrp(p, pgid, pgrp, sess)
+ register struct proc *p;
+ pid_t pgid;
+ struct pgrp *pgrp;
+ struct session *sess;
+{
+ struct pgrp *pgrp2;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+
+ KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
+ KASSERT(p->p_pid == pgid,
+ ("enterpgrp: new pgrp and pid != pgid"));
+
+ pgrp2 = pgfind(pgid);
+
+ KASSERT(pgrp2 == NULL,
+ ("enterpgrp: pgrp with pgid exists"));
+ KASSERT(!SESS_LEADER(p),
+ ("enterpgrp: session leader attempted setpgrp"));
+
+ mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+
+ if (sess != NULL) {
+ /*
+ * new session
+ */
+ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
+ PROC_LOCK(p);
+ p->p_flag &= ~P_CONTROLT;
+ PROC_UNLOCK(p);
+ PGRP_LOCK(pgrp);
+ sess->s_leader = p;
+ sess->s_sid = p->p_pid;
+ sess->s_count = 1;
+ sess->s_ttyvp = NULL;
+ sess->s_ttyp = NULL;
+ bcopy(p->p_session->s_login, sess->s_login,
+ sizeof(sess->s_login));
+ pgrp->pg_session = sess;
+ KASSERT(p == curproc,
+ ("enterpgrp: mksession and p != curproc"));
+ } else {
+ pgrp->pg_session = p->p_session;
+ SESS_LOCK(pgrp->pg_session);
+ pgrp->pg_session->s_count++;
+ SESS_UNLOCK(pgrp->pg_session);
+ PGRP_LOCK(pgrp);
+ }
+ pgrp->pg_id = pgid;
+ LIST_INIT(&pgrp->pg_members);
+
+ /*
+ * As we have an exclusive lock of proctree_lock,
+ * this should not deadlock.
+ */
+ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
+ pgrp->pg_jobc = 0;
+ SLIST_INIT(&pgrp->pg_sigiolst);
+ PGRP_UNLOCK(pgrp);
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to an existing process group
+ */
+int
+enterthispgrp(p, pgrp)
+ register struct proc *p;
+ struct pgrp *pgrp;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+ KASSERT(pgrp->pg_session == p->p_session,
+ ("%s: pgrp's session %p, p->p_session %p.\n",
+ __func__,
+ pgrp->pg_session,
+ p->p_session));
+ KASSERT(pgrp != p->p_pgrp,
+ ("%s: p belongs to pgrp.", __func__));
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to a process group
+ */
+static void
+doenterpgrp(p, pgrp)
+ struct proc *p;
+ struct pgrp *pgrp;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+
+ savepgrp = p->p_pgrp;
+
+ /*
+ * Adjust eligibility of affected pgrps to participate in job control.
+ * Increment eligibility counts before decrementing, otherwise we
+ * could reach 0 spuriously during the first call.
+ */
+ fixjobc(p, pgrp, 1);
+ fixjobc(p, p->p_pgrp, 0);
+
+ PGRP_LOCK(pgrp);
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = pgrp;
+ PROC_UNLOCK(p);
+ LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
+ PGRP_UNLOCK(savepgrp);
+ PGRP_UNLOCK(pgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+}
+
+/*
+ * remove process from process group
+ */
+int
+leavepgrp(p)
+ register struct proc *p;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ savepgrp = p->p_pgrp;
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = NULL;
+ PROC_UNLOCK(p);
+ PGRP_UNLOCK(savepgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+ return (0);
+}
+
+/*
+ * delete a process group
+ */
+static void
+pgdelete(pgrp)
+ register struct pgrp *pgrp;
+{
+ struct session *savesess;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pgid.
+ */
+ funsetownlst(&pgrp->pg_sigiolst);
+
+ PGRP_LOCK(pgrp);
+ if (pgrp->pg_session->s_ttyp != NULL &&
+ pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
+ pgrp->pg_session->s_ttyp->t_pgrp = NULL;
+ LIST_REMOVE(pgrp, pg_hash);
+ savesess = pgrp->pg_session;
+ SESS_LOCK(savesess);
+ savesess->s_count--;
+ SESS_UNLOCK(savesess);
+ PGRP_UNLOCK(pgrp);
+ if (savesess->s_count == 0) {
+ mtx_destroy(&savesess->s_mtx);
+ FREE(pgrp->pg_session, M_SESSION);
+ }
+ mtx_destroy(&pgrp->pg_mtx);
+ FREE(pgrp, M_PGRP);
+}
+
+/*
+ * Adjust pgrp jobc counters when specified process changes process group.
+ * We count the number of processes in each process group that "qualify"
+ * the group for terminal job control (those with a parent in a different
+ * process group of the same session). If that count reaches zero, the
+ * process group becomes orphaned. Check both the specified process'
+ * process group and that of its children.
+ * entering == 0 => p is leaving specified group.
+ * entering == 1 => p is entering specified group.
+ */
+void
+fixjobc(p, pgrp, entering)
+ register struct proc *p;
+ register struct pgrp *pgrp;
+ int entering;
+{
+ register struct pgrp *hispgrp;
+ register struct session *mysession;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Check p's parent to see whether p qualifies its own process
+ * group; if so, adjust count for p's process group.
+ */
+ mysession = pgrp->pg_session;
+ if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession) {
+ PGRP_LOCK(pgrp);
+ if (entering)
+ pgrp->pg_jobc++;
+ else {
+ --pgrp->pg_jobc;
+ if (pgrp->pg_jobc == 0)
+ orphanpg(pgrp);
+ }
+ PGRP_UNLOCK(pgrp);
+ }
+
+ /*
+ * Check this process' children to see whether they qualify
+ * their process groups; if so, adjust counts for children's
+ * process groups.
+ */
+ LIST_FOREACH(p, &p->p_children, p_sibling) {
+ if ((hispgrp = p->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession &&
+ p->p_stat != SZOMB) {
+ PGRP_LOCK(hispgrp);
+ if (entering)
+ hispgrp->pg_jobc++;
+ else {
+ --hispgrp->pg_jobc;
+ if (hispgrp->pg_jobc == 0)
+ orphanpg(hispgrp);
+ }
+ PGRP_UNLOCK(hispgrp);
+ }
+ }
+}
+
+/*
+ * A process group has become orphaned;
+ * if there are any stopped processes in the group,
+ * hang up all processes in that group.
+ */
+static void
+orphanpg(pg)
+ struct pgrp *pg;
+{
+ register struct proc *p;
+
+ PGRP_LOCK_ASSERT(pg, MA_OWNED);
+
+ mtx_lock_spin(&sched_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ if (p->p_stat == SSTOP) {
+ mtx_unlock_spin(&sched_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ psignal(p, SIGHUP);
+ psignal(p, SIGCONT);
+ PROC_UNLOCK(p);
+ }
+ return;
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(pgrpdump, pgrpdump)
+{
+ register struct pgrp *pgrp;
+ register struct proc *p;
+ register int i;
+
+ for (i = 0; i <= pgrphash; i++) {
+ if (!LIST_EMPTY(&pgrphashtbl[i])) {
+ printf("\tindx %d\n", i);
+ LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
+ printf(
+ "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
+ (void *)pgrp, (long)pgrp->pg_id,
+ (void *)pgrp->pg_session,
+ pgrp->pg_session->s_count,
+ (void *)LIST_FIRST(&pgrp->pg_members));
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ printf("\t\tpid %ld addr %p pgrp %p\n",
+ (long)p->p_pid, (void *)p,
+ (void *)p->p_pgrp);
+ }
+ }
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Fill in a kinfo_proc structure for the specified process.
+ * Must be called with the target process locked.
+ */
+void
+fill_kinfo_proc(p, kp)
+ struct proc *p;
+ struct kinfo_proc *kp;
+{
+ struct thread *td;
+ struct tty *tp;
+ struct session *sp;
+ struct timeval tv;
+
+ bzero(kp, sizeof(*kp));
+
+ kp->ki_structsize = sizeof(*kp);
+ kp->ki_paddr = p;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */
+ kp->ki_args = p->p_args;
+ kp->ki_textvp = p->p_textvp;
+#ifdef KTRACE
+ kp->ki_tracep = p->p_tracep;
+ mtx_lock(&ktrace_mtx);
+ kp->ki_traceflag = p->p_traceflag;
+ mtx_unlock(&ktrace_mtx);
+#endif
+ kp->ki_fd = p->p_fd;
+ kp->ki_vmspace = p->p_vmspace;
+ if (p->p_ucred) {
+ kp->ki_uid = p->p_ucred->cr_uid;
+ kp->ki_ruid = p->p_ucred->cr_ruid;
+ kp->ki_svuid = p->p_ucred->cr_svuid;
+ /* XXX bde doesn't like KI_NGROUPS */
+ kp->ki_ngroups = min(p->p_ucred->cr_ngroups, KI_NGROUPS);
+ bcopy(p->p_ucred->cr_groups, kp->ki_groups,
+ kp->ki_ngroups * sizeof(gid_t));
+ kp->ki_rgid = p->p_ucred->cr_rgid;
+ kp->ki_svgid = p->p_ucred->cr_svgid;
+ }
+ if (p->p_procsig) {
+ kp->ki_sigignore = p->p_procsig->ps_sigignore;
+ kp->ki_sigcatch = p->p_procsig->ps_sigcatch;
+ }
+ mtx_lock_spin(&sched_lock);
+ if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) {
+ struct vmspace *vm = p->p_vmspace;
+
+ kp->ki_size = vm->vm_map.size;
+ kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
+ if (p->p_sflag & PS_INMEM)
+ kp->ki_rssize += UAREA_PAGES;
+ FOREACH_THREAD_IN_PROC(p, td) /* XXXKSE: thread swapout check */
+ kp->ki_rssize += KSTACK_PAGES;
+ kp->ki_swrss = vm->vm_swrss;
+ kp->ki_tsize = vm->vm_tsize;
+ kp->ki_dsize = vm->vm_dsize;
+ kp->ki_ssize = vm->vm_ssize;
+ }
+ if ((p->p_sflag & PS_INMEM) && p->p_stats) {
+ kp->ki_start = p->p_stats->p_start;
+ kp->ki_rusage = p->p_stats->p_ru;
+ kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec +
+ p->p_stats->p_cru.ru_stime.tv_sec;
+ kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec +
+ p->p_stats->p_cru.ru_stime.tv_usec;
+ }
+ td = FIRST_THREAD_IN_PROC(p);
+ if (td->td_wmesg != NULL)
+ strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1);
+ if (p->p_stat == SMTX) {
+ kp->ki_kiflag |= KI_MTXBLOCK;
+ strncpy(kp->ki_mtxname, td->td_mtxname,
+ sizeof(kp->ki_mtxname) - 1);
+ }
+ kp->ki_stat = p->p_stat;
+ kp->ki_sflag = p->p_sflag;
+ kp->ki_swtime = p->p_swtime;
+ kp->ki_pid = p->p_pid;
+ /* vvv XXXKSE */
+ bintime2timeval(&p->p_runtime, &tv);
+ kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
+ kp->ki_pctcpu = p->p_kse.ke_pctcpu;
+ kp->ki_estcpu = td->td_ksegrp->kg_estcpu;
+ kp->ki_slptime = td->td_ksegrp->kg_slptime;
+ kp->ki_wchan = td->td_wchan;
+ kp->ki_pri.pri_level = td->td_priority;
+ kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri;
+ kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class;
+ kp->ki_pri.pri_native = td->td_base_pri;
+ kp->ki_nice = td->td_ksegrp->kg_nice;
+ kp->ki_rqindex = p->p_kse.ke_rqindex;
+ kp->ki_oncpu = p->p_kse.ke_oncpu;
+ kp->ki_lastcpu = td->td_lastcpu;
+ kp->ki_tdflags = td->td_flags;
+ kp->ki_pcb = td->td_pcb;
+ kp->ki_kstack = (void *)td->td_kstack;
+ /* ^^^ XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+ sp = NULL;
+ tp = NULL;
+ if (p->p_pgrp) {
+ kp->ki_pgid = p->p_pgrp->pg_id;
+ kp->ki_jobc = p->p_pgrp->pg_jobc;
+ sp = p->p_pgrp->pg_session;
+
+ if (sp != NULL) {
+ kp->ki_sid = sp->s_sid;
+ SESS_LOCK(sp);
+ strncpy(kp->ki_login, sp->s_login,
+ sizeof(kp->ki_login) - 1);
+ if (sp->s_ttyvp)
+ kp->ki_kiflag |= KI_CTTY;
+ if (SESS_LEADER(p))
+ kp->ki_kiflag |= KI_SLEADER;
+ tp = sp->s_ttyp;
+ SESS_UNLOCK(sp);
+ }
+ }
+ if ((p->p_flag & P_CONTROLT) && tp != NULL) {
+ kp->ki_tdev = dev2udev(tp->t_dev);
+ kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ if (tp->t_session)
+ kp->ki_tsid = tp->t_session->s_sid;
+ } else
+ kp->ki_tdev = NOUDEV;
+ if (p->p_comm[0] != '\0') {
+ strncpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm) - 1);
+ strncpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm) - 1);
+ }
+ kp->ki_siglist = p->p_siglist;
+ kp->ki_sigmask = p->p_sigmask;
+ kp->ki_xstat = p->p_xstat;
+ kp->ki_acflag = p->p_acflag;
+ kp->ki_flag = p->p_flag;
+ /* If jailed(p->p_ucred), emulate the old P_JAILED flag. */
+ if (jailed(p->p_ucred))
+ kp->ki_flag |= P_JAILED;
+ kp->ki_lock = p->p_lock;
+ if (p->p_pptr)
+ kp->ki_ppid = p->p_pptr->p_pid;
+}
+
+/*
+ * Locate a zombie process by number
+ */
+struct proc *
+zpfind(pid_t pid)
+{
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &zombproc, p_list)
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ break;
+ }
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+
+/*
+ * Must be called with the process locked and will return with it unlocked.
+ */
+static int
+sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb)
+{
+ struct kinfo_proc kinfo_proc;
+ int error;
+ struct proc *np;
+ pid_t pid = p->p_pid;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ fill_kinfo_proc(p, &kinfo_proc);
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc));
+ if (error)
+ return (error);
+ if (doingzomb)
+ np = zpfind(pid);
+ else {
+ if (pid == 0)
+ return (0);
+ np = pfind(pid);
+ }
+ if (np == NULL)
+ return EAGAIN;
+ if (np != p) {
+ PROC_UNLOCK(np);
+ return EAGAIN;
+ }
+ PROC_UNLOCK(np);
+ return (0);
+}
+
+static int
+sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int*) arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int doingzomb;
+ int error = 0;
+
+ if (oidp->oid_number == KERN_PROC_PID) {
+ if (namelen != 1)
+ return (EINVAL);
+ p = pfind((pid_t)name[0]);
+ if (!p)
+ return (0);
+ if (p_cansee(curthread, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ error = sysctl_out_proc(p, req, 0);
+ return (error);
+ }
+ if (oidp->oid_number == KERN_PROC_ALL && !namelen)
+ ;
+ else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1)
+ ;
+ else
+ return (EINVAL);
+
+ if (!req->oldptr) {
+ /* overestimate by 5 procs */
+ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
+ if (error)
+ return (error);
+ }
+ sx_slock(&allproc_lock);
+ for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
+ if (!doingzomb)
+ p = LIST_FIRST(&allproc);
+ else
+ p = LIST_FIRST(&zombproc);
+ for (; p != 0; p = LIST_NEXT(p, p_list)) {
+ PROC_LOCK(p);
+ /*
+ * Show a user only appropriate processes.
+ */
+ if (p_cansee(curthread, p)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * Skip embryonic processes.
+ */
+ if (p->p_stat == SIDL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * TODO - make more efficient (see notes below).
+ * do by session.
+ */
+ switch (oidp->oid_number) {
+
+ case KERN_PROC_PGRP:
+ /* could do this by traversing pgrp */
+ if (p->p_pgrp == NULL ||
+ p->p_pgrp->pg_id != (pid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_TTY:
+ if ((p->p_flag & P_CONTROLT) == 0 ||
+ p->p_session == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ SESS_LOCK(p->p_session);
+ if (p->p_session->s_ttyp == NULL ||
+ dev2udev(p->p_session->s_ttyp->t_dev) !=
+ (udev_t)name[0]) {
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ continue;
+ }
+ SESS_UNLOCK(p->p_session);
+ break;
+
+ case KERN_PROC_UID:
+ if (p->p_ucred == NULL ||
+ p->p_ucred->cr_uid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_RUID:
+ if (p->p_ucred == NULL ||
+ p->p_ucred->cr_ruid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+ }
+
+ error = sysctl_out_proc(p, req, doingzomb);
+ if (error) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ return (0);
+}
+
+struct pargs *
+pargs_alloc(int len)
+{
+ struct pargs *pa;
+
+ MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS,
+ M_WAITOK);
+ pa->ar_ref = 1;
+ pa->ar_length = len;
+ return (pa);
+}
+
+void
+pargs_free(struct pargs *pa)
+{
+
+ FREE(pa, M_PARGS);
+}
+
+void
+pargs_hold(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ PARGS_LOCK(pa);
+ pa->ar_ref++;
+ PARGS_UNLOCK(pa);
+}
+
+void
+pargs_drop(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ PARGS_LOCK(pa);
+ if (--pa->ar_ref == 0) {
+ PARGS_UNLOCK(pa);
+ pargs_free(pa);
+ } else
+ PARGS_UNLOCK(pa);
+}
+
+/*
+ * This sysctl allows a process to retrieve the argument list or process
+ * title for another process without groping around in the address space
+ * of the other process.  It also allows a process to set its own "process
+ * title" to a string of its own choice.
+ */
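+/*
+ * Userland sketch (illustrative only, not kernel code): reading the
+ * argument vector of process "pid"; the strings come back NUL-separated.
+ *
+ *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid };
+ *	char buf[4096];
+ *	size_t len = sizeof(buf);
+ *
+ *	if (sysctl(mib, 4, buf, &len, NULL, 0) == 0)
+ *		... buf[0..len) holds arg0\0arg1\0... ...
+ *
+ * Passing a new value instead (newp/newlen) lets a process replace its own
+ * argument area, which is what setproctitle(3) relies on.
+ */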
+static int
+sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int*) arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ struct pargs *pa;
+ int error = 0;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ p = pfind((pid_t)name[0]);
+ if (!p)
+ return (0);
+
+ if ((!ps_argsopen) && p_cansee(curthread, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_UNLOCK(p);
+
+ if (req->newptr && curproc != p)
+ return (EPERM);
+
+ PROC_LOCK(p);
+ pa = p->p_args;
+ pargs_hold(pa);
+ PROC_UNLOCK(p);
+ if (req->oldptr && pa != NULL) {
+ error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
+ }
+ pargs_drop(pa);
+ if (req->newptr == NULL)
+ return (error);
+
+ PROC_LOCK(p);
+ pa = p->p_args;
+ p->p_args = NULL;
+ PROC_UNLOCK(p);
+ pargs_drop(pa);
+
+ if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
+ return (error);
+
+ pa = pargs_alloc(req->newlen);
+ error = SYSCTL_IN(req, pa->ar_args, req->newlen);
+ if (!error) {
+ PROC_LOCK(p);
+ p->p_args = pa;
+ PROC_UNLOCK(p);
+ } else
+ pargs_free(pa);
+ return (error);
+}
+
+SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
+
+SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
+ 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY,
+ sysctl_kern_proc_args, "Process argument list");
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
new file mode 100644
index 0000000..a3e4bea
--- /dev/null
+++ b/sys/kern/kern_prot.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ * Copyright (c) 2000-2001 Robert N. M. Watson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/pioctl.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+SYSCTL_DECL(_security);
+SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0,
+ "BSD security policy");
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getpid(struct thread *td, struct getpid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int s;
+
+ s = mtx_lock_giant(kern_giant_proc);
+ td->td_retval[0] = p->p_pid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ PROC_LOCK(p);
+ td->td_retval[1] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+#endif
+ mtx_unlock_giant(s);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getppid(struct thread *td, struct getppid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int s;
+
+ s = mtx_lock_giant(kern_giant_proc);
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+ mtx_unlock_giant(s);
+ return (0);
+}
+
+/*
+ * Get process group ID; note that POSIX getpgrp takes no parameter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+getpgrp(struct thread *td, struct getpgrp_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int s;
+
+ s = mtx_lock_giant(kern_giant_proc);
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ mtx_unlock_giant(s);
+ return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+ pid_t pid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+getpgid(struct thread *td, struct getpgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct proc *pt;
+ int error;
+
+ mtx_lock(&Giant);
+ error = 0;
+ if (uap->pid == 0) {
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ } else if ((pt = pfind(uap->pid)) == NULL)
+ error = ESRCH;
+ else {
+ error = p_cansee(td, pt);
+ if (error == 0)
+ td->td_retval[0] = pt->p_pgrp->pg_id;
+ PROC_UNLOCK(pt);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+ pid_t pid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+getsid(struct thread *td, struct getsid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct proc *pt;
+ int error;
+
+ mtx_lock(&Giant);
+ error = 0;
+ if (uap->pid == 0) {
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_session->s_sid;
+ PROC_UNLOCK(p);
+ } else if ((pt = pfind(uap->pid)) == NULL)
+ error = ESRCH;
+ else {
+ error = p_cansee(td, pt);
+ if (error == 0)
+ td->td_retval[0] = pt->p_session->s_sid;
+ PROC_UNLOCK(pt);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getuid(struct thread *td, struct getuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_ruid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ td->td_retval[1] = td->td_ucred->cr_uid;
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+geteuid(struct thread *td, struct geteuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_uid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getgid(struct thread *td, struct getgid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_rgid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ td->td_retval[1] = td->td_ucred->cr_groups[0];
+#endif
+ return (0);
+}
+
+/*
+ * Get effective group ID. The "egid" is groups[0], and could be obtained
+ * via getgroups. This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getegid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getegid(struct thread *td, struct getegid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_groups[0];
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+getgroups(struct thread *td, register struct getgroups_args *uap)
+{
+ struct ucred *cred;
+ u_int ngrp;
+ int error;
+
+ cred = td->td_ucred;
+ if ((ngrp = uap->gidsetsize) == 0) {
+ td->td_retval[0] = cred->cr_ngroups;
+ return (0);
+ }
+ if (ngrp < cred->cr_ngroups)
+ return (EINVAL);
+ ngrp = cred->cr_ngroups;
+ error = copyout((caddr_t)cred->cr_groups, (caddr_t)uap->gidset,
+ ngrp * sizeof(gid_t));
+ if (error == 0)
+ td->td_retval[0] = ngrp;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setsid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setsid(register struct thread *td, struct setsid_args *uap)
+{
+ struct pgrp *pgrp;
+ int error;
+ struct proc *p = td->td_proc;
+ struct pgrp *newpgrp;
+ struct session *newsess;
+
+ error = 0;
+ pgrp = NULL;
+
+ MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+ MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+
+ if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
+ if (pgrp != NULL)
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ } else {
+ (void)enterpgrp(p, p->p_pid, newpgrp, newsess);
+ td->td_retval[0] = p->p_pid;
+ newpgrp = NULL;
+ newsess = NULL;
+ }
+
+ sx_xunlock(&proctree_lock);
+
+ if (newpgrp != NULL)
+ FREE(newpgrp, M_PGRP);
+ if (newsess != NULL)
+ FREE(newsess, M_SESSION);
+
+ return (error);
+}
+
+/*
+ * set process group (setpgid/old setpgrp)
+ *
+ * caller does setpgid(targpid, targpgid)
+ *
+ * pid must be caller or child of caller (ESRCH)
+ * if a child
+ * pid must be in same session (EPERM)
+ * pid can't have done an exec (EACCES)
+ * if pgid != pid
+ * there must exist some pid in same session having pgid (EPERM)
+ * pid must not be session leader (EPERM)
+ */
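+/*
+ * For illustration, the classic job-control idiom maps onto the branches
+ * below: a shell calls setpgid(pid, pid) in the parent and setpgid(0, 0)
+ * in the child (both, to close the race), hitting the "pgid == pid,
+ * create a new group" case; moving a later child into that existing group
+ * uses setpgid(pid, pgid) and takes the enterthispgrp() path instead.
+ */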
+#ifndef _SYS_SYSPROTO_H_
+struct setpgid_args {
+ int pid; /* target process id */
+ int pgid; /* target pgrp id */
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setpgid(struct thread *td, register struct setpgid_args *uap)
+{
+ struct proc *curp = td->td_proc;
+ register struct proc *targp; /* target process */
+ register struct pgrp *pgrp; /* target pgrp */
+ int error;
+ struct pgrp *newpgrp;
+
+ if (uap->pgid < 0)
+ return (EINVAL);
+
+ error = 0;
+
+ MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+ if (uap->pid != 0 && uap->pid != curp->p_pid) {
+ if ((targp = pfind(uap->pid)) == NULL) {
+ if (targp)
+ PROC_UNLOCK(targp);
+ error = ESRCH;
+ goto done;
+ }
+ if (!inferior(targp)) {
+ PROC_UNLOCK(targp);
+ error = ESRCH;
+ goto done;
+ }
+ if ((error = p_cansee(curthread, targp))) {
+ PROC_UNLOCK(targp);
+ goto done;
+ }
+ if (targp->p_pgrp == NULL ||
+ targp->p_session != curp->p_session) {
+ PROC_UNLOCK(targp);
+ error = EPERM;
+ goto done;
+ }
+ if (targp->p_flag & P_EXEC) {
+ PROC_UNLOCK(targp);
+ error = EACCES;
+ goto done;
+ }
+ PROC_UNLOCK(targp);
+ } else
+ targp = curp;
+ if (SESS_LEADER(targp)) {
+ error = EPERM;
+ goto done;
+ }
+ if (uap->pgid == 0)
+ uap->pgid = targp->p_pid;
+ if (uap->pgid == targp->p_pid) {
+ if (targp->p_pgid == uap->pgid)
+ goto done;
+ error = enterpgrp(targp, uap->pgid, newpgrp, NULL);
+ if (error == 0)
+ newpgrp = NULL;
+ } else {
+ if ((pgrp = pgfind(uap->pgid)) == NULL ||
+ pgrp->pg_session != curp->p_session) {
+ if (pgrp != NULL)
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ goto done;
+ }
+ if (pgrp == targp->p_pgrp) {
+ PGRP_UNLOCK(pgrp);
+ goto done;
+ }
+ PGRP_UNLOCK(pgrp);
+ error = enterthispgrp(targp, pgrp);
+ }
+done:
+ sx_xunlock(&proctree_lock);
+ KASSERT((error == 0) || (newpgrp != NULL),
+ ("setpgid failed and newpgrp is NULL"));
+ if (newpgrp != NULL)
+ FREE(newpgrp, M_PGRP);
+ return (error);
+}
+
+/*
+ * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
+ * compatible. It says that setting the uid/gid to euid/egid is a special
+ * case of "appropriate privilege". Once the rules are expanded out, this
+ * basically means that setuid(nnn) sets all three id's, in all permitted
+ * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
+ * does not set the saved id - this is dangerous for traditional BSD
+ * programs. For this reason, we *really* do not want to set
+ * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
+ */
+#define POSIX_APPENDIX_B_4_2_2
+
+#ifndef _SYS_SYSPROTO_H_
+struct setuid_args {
+ uid_t uid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setuid(struct thread *td, struct setuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t uid;
+ struct uidinfo *uip;
+ int error;
+
+ mtx_lock(&Giant);
+ uid = uap->uid;
+ newcred = crget();
+ uip = uifind(uid);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setuid(geteuid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setuid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * Notes on the logic. We do things in three steps.
+ * 1: We determine if the euid is going to change, and do EPERM
+ * right away. We unconditionally change the euid later if this
+ * test is satisfied, simplifying that part of the logic.
+ * 2: We determine if the real and/or saved uids are going to
+ * change. Determined by compile options.
+ * 3: Change euid last. (after tests in #2 for "appropriate privs")
+ */
+ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */
+#ifdef _POSIX_SAVED_IDS
+ uid != oldcred->cr_svuid && /* allow setuid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
+#endif
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ uifree(uip);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ /*
+ * Copy credentials so other references do not see our changes.
+ */
+ crcopy(newcred, oldcred);
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or uid == euid)
+ * If so, we are changing the real uid and/or saved uid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
+ uid == oldcred->cr_uid ||
+#endif
+ suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */
+#endif
+ {
+ /*
+ * Set the real uid and transfer proc count to new user.
+ */
+ if (uid != oldcred->cr_ruid) {
+ change_ruid(newcred, uip);
+ setsugid(p);
+ }
+ /*
+ * Set saved uid
+ *
+ * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
+ * the security of seteuid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (uid != oldcred->cr_svuid) {
+ change_svuid(newcred, uid);
+ setsugid(p);
+ }
+ }
+
+ /*
+ * In all permitted cases, we are changing the euid.
+ */
+ if (uid != oldcred->cr_uid) {
+ change_euid(newcred, uip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(uip);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct seteuid_args {
+ uid_t euid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+seteuid(struct thread *td, struct seteuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid;
+ struct uidinfo *euip;
+ int error;
+
+ euid = uap->euid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ euip = uifind(euid);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
+ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ /*
+ * Everything's okay, do it. Copy credentials so other references do
+ * not see our changes.
+ */
+ crcopy(newcred, oldcred);
+ if (oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgid_args {
+ gid_t gid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setgid(struct thread *td, struct setgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t gid;
+ int error;
+
+ gid = uap->gid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setgid(getegid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setgid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * For notes on the logic here, see setuid() above.
+ */
+ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */
+#ifdef _POSIX_SAVED_IDS
+ gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+#endif
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ crcopy(newcred, oldcred);
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or gid == egid)
+ * If so, we are changing the real uid and saved gid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
+ gid == oldcred->cr_groups[0] ||
+#endif
+ suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */
+#endif
+ {
+ /*
+ * Set real gid
+ */
+ if (oldcred->cr_rgid != gid) {
+ change_rgid(newcred, gid);
+ setsugid(p);
+ }
+ /*
+ * Set saved gid
+ *
+ * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
+ * the security of setegid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (oldcred->cr_svgid != gid) {
+ change_svgid(newcred, gid);
+ setsugid(p);
+ }
+ }
+ /*
+	 * In all permitted cases, we are changing the egid.
+ * Copy credentials so other references do not see our changes.
+ */
+ if (oldcred->cr_groups[0] != gid) {
+ change_egid(newcred, gid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setegid_args {
+ gid_t egid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setegid(struct thread *td, struct setegid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid;
+ int error;
+
+ egid = uap->egid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
+ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ crcopy(newcred, oldcred);
+ if (oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setgroups(struct thread *td, struct setgroups_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *tempcred, *oldcred;
+ u_int ngrp;
+ int error;
+
+ ngrp = uap->gidsetsize;
+ if (ngrp > NGROUPS)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ tempcred = crget();
+ error = copyin((caddr_t)uap->gidset, (caddr_t)tempcred->cr_groups,
+ ngrp * sizeof(gid_t));
+ if (error != 0) {
+ crfree(tempcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ error = suser_cred(oldcred, PRISON_ROOT);
+ if (error) {
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ crfree(tempcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ /*
+ * XXX A little bit lazy here. We could test if anything has
+ * changed before crcopy() and setting P_SUGID.
+ */
+ crcopy(newcred, oldcred);
+ if (ngrp < 1) {
+ /*
+ * setgroups(0, NULL) is a legitimate way of clearing the
+ * groups vector on non-BSD systems (which generally do not
+ * have the egid in the groups[0]). We risk security holes
+ * when running non-BSD software if we do not do the same.
+ */
+ newcred->cr_ngroups = 1;
+ } else {
+ bcopy(tempcred->cr_groups, newcred->cr_groups,
+ ngrp * sizeof(gid_t));
+ newcred->cr_ngroups = ngrp;
+ }
+ setsugid(p);
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(tempcred);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setreuid_args {
+ uid_t ruid;
+ uid_t euid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setreuid(register struct thread *td, struct setreuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
+ euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ crcopy(newcred, oldcred);
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
+ newcred->cr_svuid != newcred->cr_uid) {
+ change_svuid(newcred, newcred->cr_uid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setregid_args {
+ gid_t rgid;
+ gid_t egid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setregid(register struct thread *td, struct setregid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+ egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ crcopy(newcred, oldcred);
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
+ newcred->cr_svgid != newcred->cr_groups[0]) {
+ change_svgid(newcred, newcred->cr_groups[0]);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+/*
+ * setresuid(ruid, euid, suid) is like setreuid except control over the
+ * saved uid is explicit.
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct setresuid_args {
+ uid_t ruid;
+ uid_t euid;
+ uid_t suid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setresuid(register struct thread *td, struct setresuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid, suid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ suid = uap->suid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid &&
+ ruid != oldcred->cr_uid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
+ euid != oldcred->cr_svuid &&
+ euid != oldcred->cr_uid) ||
+ (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
+ suid != oldcred->cr_svuid &&
+ suid != oldcred->cr_uid)) &&
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ crcopy(newcred, oldcred);
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
+ change_svuid(newcred, suid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+/*
+ * setresgid(rgid, egid, sgid) is like setregid except control over the
+ * saved gid is explicit.
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct setresgid_args {
+ gid_t rgid;
+ gid_t egid;
+ gid_t sgid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setresgid(register struct thread *td, struct setresgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid, sgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ sgid = uap->sgid;
+ mtx_lock(&Giant);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid &&
+ rgid != oldcred->cr_groups[0]) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
+ egid != oldcred->cr_svgid &&
+ egid != oldcred->cr_groups[0]) ||
+ (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
+ sgid != oldcred->cr_svgid &&
+ sgid != oldcred->cr_groups[0])) &&
+ (error = suser_cred(oldcred, PRISON_ROOT)) != 0) {
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ mtx_unlock(&Giant);
+ return (error);
+ }
+
+ crcopy(newcred, oldcred);
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
+ change_svgid(newcred, sgid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresuid_args {
+ uid_t *ruid;
+ uid_t *euid;
+ uid_t *suid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getresuid(register struct thread *td, struct getresuid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->ruid)
+ error1 = copyout((caddr_t)&cred->cr_ruid,
+ (caddr_t)uap->ruid, sizeof(cred->cr_ruid));
+ if (uap->euid)
+ error2 = copyout((caddr_t)&cred->cr_uid,
+ (caddr_t)uap->euid, sizeof(cred->cr_uid));
+ if (uap->suid)
+ error3 = copyout((caddr_t)&cred->cr_svuid,
+ (caddr_t)uap->suid, sizeof(cred->cr_svuid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresgid_args {
+ gid_t *rgid;
+ gid_t *egid;
+ gid_t *sgid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getresgid(register struct thread *td, struct getresgid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->rgid)
+ error1 = copyout((caddr_t)&cred->cr_rgid,
+ (caddr_t)uap->rgid, sizeof(cred->cr_rgid));
+ if (uap->egid)
+ error2 = copyout((caddr_t)&cred->cr_groups[0],
+ (caddr_t)uap->egid, sizeof(cred->cr_groups[0]));
+ if (uap->sgid)
+ error3 = copyout((caddr_t)&cred->cr_svgid,
+ (caddr_t)uap->sgid, sizeof(cred->cr_svgid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct issetugid_args {
+ int dummy;
+};
+#endif
+/*
+ * NOT MPSAFE?
+ */
+/* ARGSUSED */
+int
+issetugid(register struct thread *td, struct issetugid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ /*
+	 * Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time;
+ * we use P_SUGID because we consider changing the owners as
+ * "tainting" as well.
+ * This is significant for procs that start as root and "become"
+ * a user without an exec - programs cannot know *everything*
+ * that libc *might* have put in their data segment.
+ */
+ PROC_LOCK(p);
+ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
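+/*
+ * Regression-testing hook: set or clear the P_SUGID flag on the current
+ * process.  Only available in kernels built with the REGRESSION option;
+ * otherwise it returns ENOSYS.
+ */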
+/*
+ * MPSAFE
+ */
+int
+__setugid(struct thread *td, struct __setugid_args *uap)
+{
+#ifdef REGRESSION
+ struct proc *p;
+
+ p = td->td_proc;
+ switch (uap->flag) {
+ case 0:
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ p->p_flag &= ~P_SUGID;
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (0);
+ case 1:
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ p->p_flag |= P_SUGID;
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+#else /* !REGRESSION */
+
+ return (ENOSYS);
+#endif /* REGRESSION */
+}
+
+/*
+ * Check if gid is a member of the group set.
+ *
+ * MPSAFE (cred must be held)
+ */
+int
+groupmember(gid_t gid, struct ucred *cred)
+{
+ register gid_t *gp;
+ gid_t *egp;
+
+ egp = &(cred->cr_groups[cred->cr_ngroups]);
+ for (gp = cred->cr_groups; gp < egp; gp++)
+ if (*gp == gid)
+ return (1);
+ return (0);
+}
+
+/*
+ * `suser_enabled' (which can be set by the security.suser_enabled
+ * sysctl) determines whether the system 'super-user' policy is in effect.
+ * If it is nonzero, an effective uid of 0 connotes special privilege,
+ * overriding many mandatory and discretionary protections. If it is zero,
+ * uid 0 is offered no special privilege in the kernel security policy.
+ * Setting it to zero may seriously impact the functionality of many
+ * existing userland programs, and should not be done without careful
+ * consideration of the consequences.
+ */
+int suser_enabled = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
+ &suser_enabled, 0, "processes with uid 0 have privilege");
+TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
+
+/*
+ * Test whether the specified credentials imply "super-user" privilege.
+ * Return 0 or EPERM. The flag argument is currently used only to
+ * specify jail interaction.
+ */
+int
+suser_cred(struct ucred *cred, int flag)
+{
+
+ if (!suser_enabled)
+ return (EPERM);
+ if (cred->cr_uid != 0)
+ return (EPERM);
+ if (jailed(cred) && !(flag & PRISON_ROOT))
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * Shortcut to hide contents of struct thread and struct proc from the
+ * caller, promoting binary compatibility.
+ */
+int
+suser(struct thread *td)
+{
+
+ return (suser_cred(td->td_ucred, 0));
+}
+
+/*
+ * Test the active securelevel against a given level. securelevel_gt()
+ * implements (securelevel > level). securelevel_ge() implements
+ * (securelevel >= level). Note that the logic is inverted -- these
+ * functions return EPERM on "success" and 0 on "failure".
+ *
+ * MPSAFE
+ */
+int
+securelevel_gt(struct ucred *cr, int level)
+{
+ int active_securelevel;
+
+ active_securelevel = securelevel;
+ KASSERT(cr != NULL, ("securelevel_gt: null cr"));
+ if (cr->cr_prison != NULL) {
+ mtx_lock(&cr->cr_prison->pr_mtx);
+ active_securelevel = imax(cr->cr_prison->pr_securelevel,
+ active_securelevel);
+ mtx_unlock(&cr->cr_prison->pr_mtx);
+ }
+ return (active_securelevel > level ? EPERM : 0);
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+ int active_securelevel;
+
+ active_securelevel = securelevel;
+ KASSERT(cr != NULL, ("securelevel_ge: null cr"));
+ if (cr->cr_prison != NULL) {
+ mtx_lock(&cr->cr_prison->pr_mtx);
+ active_securelevel = imax(cr->cr_prison->pr_securelevel,
+ active_securelevel);
+ mtx_unlock(&cr->cr_prison->pr_mtx);
+ }
+ return (active_securelevel >= level ? EPERM : 0);
+}
+
+/*
+ * 'see_other_uids' determines whether or not visibility of processes
+ * and sockets with credentials holding different real uids is possible
+ * using a variety of system MIBs.
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int see_other_uids = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
+ &see_other_uids, 0,
+ "Unprivileged processes may see subjects/objects with different real uid");
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2, according to the
+ * 'see_other_uids' policy.
+ * Returns: 0 for permitted, ESRCH otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+static int
+cr_seeotheruids(struct ucred *u1, struct ucred *u2)
+{
+
+ if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
+ if (suser_cred(u1, PRISON_ROOT) != 0)
+ return (ESRCH);
+ }
+ return (0);
+}
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+int
+cr_cansee(struct ucred *u1, struct ucred *u2)
+{
+ int error;
+
+ if ((error = prison_check(u1, u2)))
+ return (error);
+ if ((error = cr_seeotheruids(u1, u2)))
+ return (error);
+ return (0);
+}
+
+/*-
+ * Determine if td "can see" the subject specified by p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect p->p_ucred must be held. td really
+ * should be curthread.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansee(struct thread *td, struct proc *p)
+{
+
+ /* Wrap cr_cansee() for all functionality. */
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ return (cr_cansee(td->td_ucred, p->p_ucred));
+}
+
+/*-
+ * Determine whether cred may deliver the specified signal to proc.
+ * Returns: 0 for permitted, an errno value otherwise.
+ * Locks: A lock must be held for proc.
+ * References: cred and proc must be valid for the lifetime of the call.
+ */
+int
+cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
+{
+ int error;
+
+ PROC_LOCK_ASSERT(proc, MA_OWNED);
+ /*
+ * Jail semantics limit the scope of signalling to proc in the
+ * same jail as cred, if cred is in jail.
+ */
+ error = prison_check(cred, proc->p_ucred);
+ if (error)
+ return (error);
+ error = cr_seeotheruids(cred, proc->p_ucred);
+ if (error)
+ return (error);
+
+ /*
+ * UNIX signal semantics depend on the status of the P_SUGID
+ * bit on the target process. If the bit is set, then additional
+ * restrictions are placed on the set of available signals.
+ */
+ if (proc->p_flag & P_SUGID) {
+ switch (signum) {
+ case 0:
+ case SIGKILL:
+ case SIGINT:
+ case SIGTERM:
+ case SIGSTOP:
+ case SIGTTIN:
+ case SIGTTOU:
+ case SIGTSTP:
+ case SIGHUP:
+ case SIGUSR1:
+ case SIGUSR2:
+ /*
+ * Generally, permit job and terminal control
+ * signals.
+ */
+ break;
+ default:
+ /* Not permitted without privilege. */
+ error = suser_cred(cred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+ }
+
+ /*
+ * Generally, the target credential's ruid or svuid must match the
+ * subject credential's ruid or euid.
+ */
+ if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
+ cred->cr_ruid != proc->p_ucred->cr_svuid &&
+ cred->cr_uid != proc->p_ucred->cr_ruid &&
+ cred->cr_uid != proc->p_ucred->cr_svuid) {
+ /* Not permitted without privilege. */
+ error = suser_cred(cred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+
+/*-
+ * Determine whether td may deliver the specified signal to p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must be
+ * held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansignal(struct thread *td, struct proc *p, int signum)
+{
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+
+ /*
+ * UNIX signalling semantics require that processes in the same
+ * session always be able to deliver SIGCONT to one another,
+ * overriding the remaining protections.
+ */
+ /* XXX: This will require an additional lock of some sort. */
+ if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
+ return (0);
+
+ return (cr_cansignal(td->td_ucred, p, signum));
+}
+
+/*-
+ * Determine whether td may reschedule p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansched(struct thread *td, struct proc *p)
+{
+ int error;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if (td->td_ucred->cr_ruid == p->p_ucred->cr_ruid)
+ return (0);
+ if (td->td_ucred->cr_uid == p->p_ucred->cr_ruid)
+ return (0);
+ if (suser_cred(td->td_ucred, PRISON_ROOT) == 0)
+ return (0);
+
+#ifdef CAPABILITIES
+ if (!cap_check(NULL, td, CAP_SYS_NICE, PRISON_ROOT))
+ return (0);
+#endif
+
+ return (EPERM);
+}
+
+/*
+ * The 'unprivileged_proc_debug' flag may be used to disable a variety of
+ * unprivileged inter-process debugging services, including some procfs
+ * functionality, ptrace(), and ktrace(). In the past, inter-process
+ * debugging has been involved in a variety of security problems, and sites
+ * not requiring the service might choose to disable it when hardening
+ * systems.
+ *
+ * XXX: Should modifying and reading this variable require locking?
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int unprivileged_proc_debug = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
+ &unprivileged_proc_debug, 0,
+ "Unprivileged processes may use process debugging facilities");
+
+/*-
+ * Determine whether td may debug p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_candebug(struct thread *td, struct proc *p)
+{
+ int credentialchanged, error, grpsubset, i, uidsubset;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (!unprivileged_proc_debug) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+
+ /*
+ * Is p's group set a subset of td's effective group set? This
+ * includes p's egid, group access list, rgid, and svgid.
+ */
+ grpsubset = 1;
+ for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
+ if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
+ grpsubset = 0;
+ break;
+ }
+ }
+ grpsubset = grpsubset &&
+ groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
+ groupmember(p->p_ucred->cr_svgid, td->td_ucred);
+
+ /*
+ * Are the uids present in p's credential equal to td's
+ * effective uid? This includes p's euid, svuid, and ruid.
+ */
+ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
+
+ /*
+ * Has the credential of the process changed since the last exec()?
+ */
+ credentialchanged = (p->p_flag & P_SUGID);
+
+ /*
+ * If p's gids aren't a subset, or the uids aren't a subset,
+ * or the credential has changed, require appropriate privilege
+ * for td to debug p. For POSIX.1e capabilities, this will
+ * require CAP_SYS_PTRACE.
+ */
+ if (!grpsubset || !uidsubset || credentialchanged) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ /* Can't trace init when securelevel > 0. */
+ if (p == initproc) {
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Can't trace a process that's currently exec'ing.
+ * XXX: Note, this is not a security policy decision, it's a
+ * basic correctness/functionality decision. Therefore, this check
+	 * should be moved to the callers of p_candebug().
+ */
+ if ((p->p_flag & P_INEXEC) != 0)
+ return (EAGAIN);
+
+ return (0);
+}
+
+/*-
+ * Determine whether the subject represented by cred can "see" a socket.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ */
+int
+cr_canseesocket(struct ucred *cred, struct socket *so)
+{
+ int error;
+
+ error = prison_check(cred, so->so_cred);
+ if (error)
+ return (ENOENT);
+ if (cr_seeotheruids(cred, so->so_cred))
+ return (ENOENT);
+#ifdef MAC
+ /* XXX: error = mac_cred_check_seesocket() here. */
+#endif
+
+ return (0);
+}
+
+/*
+ * Allocate a zeroed cred structure.
+ */
+struct ucred *
+crget(void)
+{
+ register struct ucred *cr;
+
+ MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
+ cr->cr_ref = 1;
+ cr->cr_mtxp = mtx_pool_find(cr);
+ return (cr);
+}
+
+/*
+ * Claim another reference to a ucred structure.
+ */
+struct ucred *
+crhold(struct ucred *cr)
+{
+
+ mtx_lock(cr->cr_mtxp);
+ cr->cr_ref++;
+ mtx_unlock(cr->cr_mtxp);
+ return (cr);
+}
+
+/*
+ * Free a cred structure.
+ * Throws away space when ref count gets to 0.
+ */
+void
+crfree(struct ucred *cr)
+{
+ struct mtx *mtxp = cr->cr_mtxp;
+
+	 * Do we have "appropriate privileges" (are we root or gid == egid)?
+	 * If so, we are changing the real gid and saved gid.
+ if (--cr->cr_ref == 0) {
+ /*
+ * Some callers of crget(), such as nfs_statfs(),
+ * allocate a temporary credential, but don't
+ * allocate a uidinfo structure.
+ */
+ mtx_unlock(mtxp);
+ mtx_lock(&Giant);
+ if (cr->cr_uidinfo != NULL)
+ uifree(cr->cr_uidinfo);
+ if (cr->cr_ruidinfo != NULL)
+ uifree(cr->cr_ruidinfo);
+ /*
+ * Free a prison, if any.
+ */
+ if (jailed(cr))
+ prison_free(cr->cr_prison);
+ FREE((caddr_t)cr, M_CRED);
+ mtx_unlock(&Giant);
+ } else {
+ mtx_unlock(mtxp);
+ }
+}
+
+/*
+ * Check to see if this ucred is shared.
+ */
+int
+	 * In all permitted cases, we are changing the egid.
+{
+ int shared;
+
+ mtx_lock(cr->cr_mtxp);
+ shared = (cr->cr_ref > 1);
+ mtx_unlock(cr->cr_mtxp);
+ return (shared);
+}
+
+/*
+ * Copy a ucred's contents from a template. Does not block.
+ */
+void
+crcopy(struct ucred *dest, struct ucred *src)
+{
+
+ KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
+ bcopy(&src->cr_startcopy, &dest->cr_startcopy,
+ (unsigned)((caddr_t)&src->cr_endcopy -
+ (caddr_t)&src->cr_startcopy));
+ uihold(dest->cr_uidinfo);
+ uihold(dest->cr_ruidinfo);
+ if (jailed(dest))
+ prison_hold(dest->cr_prison);
+}
+
+/*
+ * Duplicate a cred structure, returning a new copy with one reference held.
+ */
+struct ucred *
+crdup(struct ucred *cr)
+{
+ struct ucred *newcr;
+
+ newcr = crget();
+ crcopy(newcr, cr);
+ return (newcr);
+}
+
+/*
+ * Fill in a struct xucred based on a struct ucred.
+ */
+void
+cru2x(struct ucred *cr, struct xucred *xcr)
+{
+
+ bzero(xcr, sizeof(*xcr));
+ xcr->cr_version = XUCRED_VERSION;
+ xcr->cr_uid = cr->cr_uid;
+ xcr->cr_ngroups = cr->cr_ngroups;
+ bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups));
+}
+
+/*
+ * Small routine to swap a thread's current ucred for the correct one
+ * taken from the process.
+ */
+void
+cred_update_thread(struct thread *td)
+{
+ struct proc *p;
+ struct ucred *cred;
+
+ p = td->td_proc;
+ cred = td->td_ucred;
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ td->td_ucred = crhold(p->p_ucred);
+ PROC_UNLOCK(p);
+ if (cred != NULL)
+ crfree(cred);
+ mtx_unlock(&Giant);
+}
+
+/*
+ * Get login name, if available.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getlogin_args {
+ char *namebuf;
+ u_int namelen;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getlogin(struct thread *td, struct getlogin_args *uap)
+{
+ int error;
+ char login[MAXLOGNAME];
+ struct proc *p = td->td_proc;
+
+ if (uap->namelen > MAXLOGNAME)
+ uap->namelen = MAXLOGNAME;
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ bcopy(p->p_session->s_login, login, uap->namelen);
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ error = copyout((caddr_t) login, (caddr_t) uap->namebuf, uap->namelen);
+ return(error);
+}
+
+/*
+ * Set login name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setlogin_args {
+ char *namebuf;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setlogin(struct thread *td, struct setlogin_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int error;
+ char logintmp[MAXLOGNAME];
+
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp,
+ sizeof(logintmp), (size_t *)0);
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+ else if (!error) {
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ (void) memcpy(p->p_session->s_login, logintmp,
+ sizeof(logintmp));
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ }
+ return (error);
+}
+
+void
+setsugid(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_flag |= P_SUGID;
+ if (!(p->p_pfsflags & PF_ISUGID))
+ p->p_stops = 0;
+}
+
+/*-
+ * Change a process's effective uid.
+ * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_euid(struct ucred *newcred, struct uidinfo *euip)
+{
+
+ newcred->cr_uid = euip->ui_uid;
+ uihold(euip);
+ uifree(newcred->cr_uidinfo);
+ newcred->cr_uidinfo = euip;
+}
+
+/*-
+ * Change a process's effective gid.
+ * Side effects: newcred->cr_gid will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_egid(struct ucred *newcred, gid_t egid)
+{
+
+ newcred->cr_groups[0] = egid;
+}
+
+/*-
+ * Change a process's real uid.
+ * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
+ * will be updated, and the old and new cr_ruidinfo proc
+ * counts will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_ruid(struct ucred *newcred, struct uidinfo *ruip)
+{
+
+ (void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
+ newcred->cr_ruid = ruip->ui_uid;
+ uihold(ruip);
+ uifree(newcred->cr_ruidinfo);
+ newcred->cr_ruidinfo = ruip;
+ (void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
+}
+
+/*-
+ * Change a process's real gid.
+ * Side effects: newcred->cr_rgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_rgid(struct ucred *newcred, gid_t rgid)
+{
+
+ newcred->cr_rgid = rgid;
+}
+
+/*-
+ * Change a process's saved uid.
+ * Side effects: newcred->cr_svuid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svuid(struct ucred *newcred, uid_t svuid)
+{
+
+ newcred->cr_svuid = svuid;
+}
+
+/*-
+ * Change a process's saved gid.
+ * Side effects: newcred->cr_svgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svgid(struct ucred *newcred, gid_t svgid)
+{
+
+ newcred->cr_svgid = svgid;
+}
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
new file mode 100644
index 0000000..d467c1a
--- /dev/null
+++ b/sys/kern/kern_resource.c
@@ -0,0 +1,1020 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sx.h>
+#include <sys/time.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static int donice(struct thread *td, struct proc *chgp, int n);
+
+static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
+#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+static struct mtx uihashtbl_mtx;
+static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
+static u_long uihash; /* size of hash table - 1 */
+
+static struct uidinfo *uilookup(uid_t uid);
+
+/*
+ * Resource controls and accounting.
+ */
+
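+/*
+ * Return the lowest nice value (i.e. the highest scheduling priority)
+ * among the processes selected by 'which' and 'who'.
+ */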
+#ifndef _SYS_SYSPROTO_H_
+struct getpriority_args {
+ int which;
+ int who;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+getpriority(td, uap)
+ struct thread *td;
+ register struct getpriority_args *uap;
+{
+ register struct proc *p;
+ register int low = PRIO_MAX + 1;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ switch (uap->which) {
+ case PRIO_PROCESS:
+ if (uap->who == 0)
+ low = td->td_ksegrp->kg_nice;
+ else {
+ p = pfind(uap->who);
+ if (p == NULL)
+ break;
+ if (p_cansee(td, p) == 0)
+ low = p->p_ksegrp.kg_nice /* XXXKSE */ ;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case PRIO_PGRP: {
+ register struct pgrp *pg;
+
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = td->td_proc->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (!p_cansee(td, p) && p->p_ksegrp.kg_nice /* XXXKSE */ < low)
+ low = p->p_ksegrp.kg_nice /* XXXKSE */ ;
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+ }
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (!p_cansee(td, p) &&
+ p->p_ucred->cr_uid == uap->who &&
+ p->p_ksegrp.kg_nice /* XXXKSE */ < low)
+ low = p->p_ksegrp.kg_nice /* XXXKSE */ ;
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (low == PRIO_MAX + 1 && error == 0)
+ error = ESRCH;
+ td->td_retval[0] = low;
+ mtx_unlock(&Giant);
+ return (error);
+}
+
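+/*
+ * Set the nice value of every process selected by 'which' and 'who',
+ * subject to the checks performed in donice().
+ */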
+#ifndef _SYS_SYSPROTO_H_
+struct setpriority_args {
+ int which;
+ int who;
+ int prio;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setpriority(td, uap)
+ struct thread *td;
+ register struct setpriority_args *uap;
+{
+ struct proc *curp = td->td_proc;
+ register struct proc *p;
+ int found = 0, error = 0;
+
+ mtx_lock(&Giant);
+
+ switch (uap->which) {
+ case PRIO_PROCESS:
+ if (uap->who == 0) {
+ PROC_LOCK(curp);
+ error = donice(td, curp, uap->prio);
+ PROC_UNLOCK(curp);
+ } else {
+ p = pfind(uap->who);
+ if (p == 0)
+ break;
+ if (p_cansee(td, p) == 0)
+ error = donice(td, p, uap->prio);
+ PROC_UNLOCK(p);
+ }
+ found++;
+ break;
+
+ case PRIO_PGRP: {
+ register struct pgrp *pg;
+
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = curp->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (!p_cansee(td, p)) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+ }
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_ucred->cr_uid == uap->who &&
+ !p_cansee(td, p)) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (found == 0 && error == 0)
+ error = ESRCH;
+ mtx_unlock(&Giant);
+ return (error);
+}
+
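+/*
+ * Change the nice value of process chgp to n, clamping n to the range
+ * [PRIO_MIN, PRIO_MAX].  Raising priority (lowering the nice value below
+ * its current setting) requires superuser privilege; p_cansched() governs
+ * access otherwise.
+ */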
+static int
+donice(td, chgp, n)
+ struct thread *td;
+ register struct proc *chgp;
+ register int n;
+{
+ int error;
+
+ PROC_LOCK_ASSERT(chgp, MA_OWNED);
+ if ((error = p_cansched(td, chgp)))
+ return (error);
+ if (n > PRIO_MAX)
+ n = PRIO_MAX;
+ if (n < PRIO_MIN)
+ n = PRIO_MIN;
+ if (n < chgp->p_ksegrp.kg_nice /* XXXKSE */ && suser(td))
+ return (EACCES);
+ chgp->p_ksegrp.kg_nice /* XXXKSE */ = n;
+ (void)resetpriority(&chgp->p_ksegrp); /* XXXKSE */
+ return (0);
+}
+
+/* rtprio system call */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_args {
+ int function;
+ pid_t pid;
+ struct rtprio *rtp;
+};
+#endif
+
+/*
+ * Set or look up realtime priority.
+ */
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+rtprio(td, uap)
+ struct thread *td;
+ register struct rtprio_args *uap;
+{
+ struct proc *curp = td->td_proc;
+ register struct proc *p;
+ struct rtprio rtp;
+ int error, cierror = 0;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+
+ if (uap->pid == 0) {
+ p = curp;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ }
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ mtx_lock_spin(&sched_lock);
+ pri_to_rtp(&p->p_ksegrp /* XXXKSE */ , &rtp);
+ mtx_unlock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+ /* disallow setting rtprio in most cases if not superuser */
+ if (suser(td) != 0) {
+ /* can't set someone else's */
+ if (uap->pid) {
+ error = EPERM;
+ break;
+ }
+ /* can't set realtime priority */
+/*
+ * Realtime priority has to be restricted for reasons which should be
+ * obvious. However, for idle priority, there is a potential for
+ * system deadlock if an idleprio process gains a lock on a resource
+ * that other processes need (and the idleprio process can't run
+ * due to a CPU-bound normal process). Fix me! XXX
+ */
+#if 0
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+#endif
+ if (rtp.type != RTP_PRIO_NORMAL) {
+ error = EPERM;
+ break;
+ }
+ }
+ mtx_lock_spin(&sched_lock);
+ error = rtp_to_pri(&rtp, &p->p_ksegrp);
+ mtx_unlock_spin(&sched_lock);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
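+/*
+ * Translate an rtprio structure into a ksegrp's user priority and
+ * priority class.  Returns EINVAL for an out-of-range priority or an
+ * unknown priority type.
+ */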
+int
+rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
+{
+
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ switch (RTP_PRIO_BASE(rtp->type)) {
+ case RTP_PRIO_REALTIME:
+ kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
+ break;
+ case RTP_PRIO_NORMAL:
+ kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
+ break;
+ case RTP_PRIO_IDLE:
+ kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
+ break;
+ default:
+ return (EINVAL);
+ }
+ kg->kg_pri_class = rtp->type;
+ if (curthread->td_ksegrp == kg) {
+ curthread->td_base_pri = kg->kg_user_pri;
+ curthread->td_priority = kg->kg_user_pri; /* XXX dubious */
+ }
+ return (0);
+}
+
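+/*
+ * Translate a ksegrp's priority class and user priority back into an
+ * rtprio structure.
+ */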
+void
+pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
+{
+
+ switch (PRI_BASE(kg->kg_pri_class)) {
+ case PRI_REALTIME:
+ rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
+ break;
+ case PRI_TIMESHARE:
+ rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
+ break;
+ case PRI_IDLE:
+ rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
+ break;
+ default:
+ break;
+ }
+ rtp->type = kg->kg_pri_class;
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osetrlimit(td, uap)
+ struct thread *td;
+ register struct osetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit lim;
+ int error;
+
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit))))
+ return (error);
+ lim.rlim_cur = olim.rlim_cur;
+ lim.rlim_max = olim.rlim_max;
+ mtx_lock(&Giant);
+ error = dosetrlimit(td, uap->which, &lim);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ogetrlimit(td, uap)
+ struct thread *td;
+ register struct ogetrlimit_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct orlimit olim;
+ int error;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur;
+ if (olim.rlim_cur == -1)
+ olim.rlim_cur = 0x7fffffff;
+ olim.rlim_max = p->p_rlimit[uap->which].rlim_max;
+ if (olim.rlim_max == -1)
+ olim.rlim_max = 0x7fffffff;
+ error = copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim));
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifndef _SYS_SYSPROTO_H_
+struct __setrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setrlimit(td, uap)
+ struct thread *td;
+ register struct __setrlimit_args *uap;
+{
+ struct rlimit alim;
+ int error;
+
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit))))
+ return (error);
+ mtx_lock(&Giant);
+ error = dosetrlimit(td, uap->which, &alim);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+dosetrlimit(td, which, limp)
+ struct thread *td;
+ u_int which;
+ struct rlimit *limp;
+{
+ struct proc *p = td->td_proc;
+ register struct rlimit *alimp;
+ int error;
+
+ GIANT_REQUIRED;
+
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+ alimp = &p->p_rlimit[which];
+
+ /*
+ * Preserve historical bugs by treating negative limits as unsigned.
+ */
+ if (limp->rlim_cur < 0)
+ limp->rlim_cur = RLIM_INFINITY;
+ if (limp->rlim_max < 0)
+ limp->rlim_max = RLIM_INFINITY;
+
+ if (limp->rlim_cur > alimp->rlim_max ||
+ limp->rlim_max > alimp->rlim_max)
+ if ((error = suser_cred(td->td_ucred, PRISON_ROOT)))
+ return (error);
+ if (limp->rlim_cur > limp->rlim_max)
+ limp->rlim_cur = limp->rlim_max;
+ if (p->p_limit->p_refcnt > 1 &&
+ (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
+ p->p_limit->p_refcnt--;
+ p->p_limit = limcopy(p->p_limit);
+ alimp = &p->p_rlimit[which];
+ }
+
+ switch (which) {
+
+ case RLIMIT_CPU:
+ if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000)
+ p->p_limit->p_cpulimit = RLIM_INFINITY;
+ else
+ p->p_limit->p_cpulimit =
+ (rlim_t)1000000 * limp->rlim_cur;
+ break;
+ case RLIMIT_DATA:
+ if (limp->rlim_cur > maxdsiz)
+ limp->rlim_cur = maxdsiz;
+ if (limp->rlim_max > maxdsiz)
+ limp->rlim_max = maxdsiz;
+ break;
+
+ case RLIMIT_STACK:
+ if (limp->rlim_cur > maxssiz)
+ limp->rlim_cur = maxssiz;
+ if (limp->rlim_max > maxssiz)
+ limp->rlim_max = maxssiz;
+ /*
+ * Stack is allocated to the max at exec time with only
+		 * "rlim_cur" bytes accessible.  If the stack limit is going
+		 * up, make more of the stack accessible; if it is going
+		 * down, make the excess inaccessible.
+ */
+ if (limp->rlim_cur != alimp->rlim_cur) {
+ vm_offset_t addr;
+ vm_size_t size;
+ vm_prot_t prot;
+
+ if (limp->rlim_cur > alimp->rlim_cur) {
+ prot = VM_PROT_ALL;
+ size = limp->rlim_cur - alimp->rlim_cur;
+ addr = USRSTACK - limp->rlim_cur;
+ } else {
+ prot = VM_PROT_NONE;
+ size = alimp->rlim_cur - limp->rlim_cur;
+ addr = USRSTACK - alimp->rlim_cur;
+ }
+ addr = trunc_page(addr);
+ size = round_page(size);
+ (void) vm_map_protect(&p->p_vmspace->vm_map,
+ addr, addr+size, prot, FALSE);
+ }
+ break;
+
+ case RLIMIT_NOFILE:
+ if (limp->rlim_cur > maxfilesperproc)
+ limp->rlim_cur = maxfilesperproc;
+ if (limp->rlim_max > maxfilesperproc)
+ limp->rlim_max = maxfilesperproc;
+ break;
+
+ case RLIMIT_NPROC:
+ if (limp->rlim_cur > maxprocperuid)
+ limp->rlim_cur = maxprocperuid;
+ if (limp->rlim_max > maxprocperuid)
+ limp->rlim_max = maxprocperuid;
+ if (limp->rlim_cur < 1)
+ limp->rlim_cur = 1;
+ if (limp->rlim_max < 1)
+ limp->rlim_max = 1;
+ break;
+ }
+ *alimp = *limp;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getrlimit(td, uap)
+ struct thread *td;
+ register struct __getrlimit_args *uap;
+{
+ int error;
+ struct proc *p = td->td_proc;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp,
+ sizeof (struct rlimit));
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+/*
+ * Transform the running time and tick information in proc p into user,
+ * system, and interrupt time usage.
+ */
+void
+calcru(p, up, sp, ip)
+ struct proc *p;
+ struct timeval *up;
+ struct timeval *sp;
+ struct timeval *ip;
+{
+ /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
+ u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
+ u_int64_t uut = 0, sut = 0, iut = 0;
+ int s;
+ struct timeval tv;
+ struct bintime bt;
+ struct kse *ke;
+ struct ksegrp *kg;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+	/* XXX: why spl-protect?  Worst case is an off-by-one report. */
+
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+		/* we could accumulate per ksegrp and per process here */
+ FOREACH_KSE_IN_GROUP(kg, ke) {
+ s = splstatclock();
+ ut = ke->ke_uticks;
+ st = ke->ke_sticks;
+ it = ke->ke_iticks;
+ splx(s);
+
+ tt = ut + st + it;
+ if (tt == 0) {
+ st = 1;
+ tt = 1;
+ }
+
+ if (ke == curthread->td_kse) {
+ /*
+ * Adjust for the current time slice. This is actually fairly
+ * important since the error here is on the order of a time
+ * quantum, which is much greater than the sampling error.
+ * XXXKSE use a different test due to threads on other
+ * processors also being 'current'.
+ */
+
+ binuptime(&bt);
+ bintime_sub(&bt, PCPU_PTR(switchtime));
+ bintime_add(&bt, &p->p_runtime);
+ } else {
+ bt = p->p_runtime;
+ }
+ bintime2timeval(&bt, &tv);
+ tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+ ptu = ke->ke_uu + ke->ke_su + ke->ke_iu;
+ if (tu < ptu || (int64_t)tu < 0) {
+ /* XXX no %qd in kernel. Truncate. */
+ printf("calcru: negative time of %ld usec for pid %d (%s)\n",
+ (long)tu, p->p_pid, p->p_comm);
+ tu = ptu;
+ }
+
+ /* Subdivide tu. */
+ uu = (tu * ut) / tt;
+ su = (tu * st) / tt;
+ iu = tu - uu - su;
+
+ /* Enforce monotonicity. */
+ if (uu < ke->ke_uu || su < ke->ke_su || iu < ke->ke_iu) {
+ if (uu < ke->ke_uu)
+ uu = ke->ke_uu;
+ else if (uu + ke->ke_su + ke->ke_iu > tu)
+ uu = tu - ke->ke_su - ke->ke_iu;
+ if (st == 0)
+ su = ke->ke_su;
+ else {
+ su = ((tu - uu) * st) / (st + it);
+ if (su < ke->ke_su)
+ su = ke->ke_su;
+ else if (uu + su + ke->ke_iu > tu)
+ su = tu - uu - ke->ke_iu;
+ }
+ KASSERT(uu + su + ke->ke_iu <= tu,
+ ("calcru: monotonisation botch 1"));
+ iu = tu - uu - su;
+ KASSERT(iu >= ke->ke_iu,
+ ("calcru: monotonisation botch 2"));
+ }
+ ke->ke_uu = uu;
+ ke->ke_su = su;
+ ke->ke_iu = iu;
+ uut += uu;
+ sut += su;
+ iut += iu;
+
+ } /* end kse loop */
+ } /* end kseg loop */
+ up->tv_sec = uut / 1000000;
+ up->tv_usec = uut % 1000000;
+ sp->tv_sec = sut / 1000000;
+ sp->tv_usec = sut % 1000000;
+ if (ip != NULL) {
+ ip->tv_sec = iut / 1000000;
+ ip->tv_usec = iut % 1000000;
+ }
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getrusage_args {
+ int who;
+ struct rusage *rusage;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getrusage(td, uap)
+ register struct thread *td;
+ register struct getrusage_args *uap;
+{
+ struct proc *p = td->td_proc;
+ register struct rusage *rup;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ switch (uap->who) {
+ case RUSAGE_SELF:
+ rup = &p->p_stats->p_ru;
+ mtx_lock_spin(&sched_lock);
+ calcru(p, &rup->ru_utime, &rup->ru_stime, NULL);
+ mtx_unlock_spin(&sched_lock);
+ break;
+
+ case RUSAGE_CHILDREN:
+ rup = &p->p_stats->p_cru;
+ break;
+
+ default:
+ rup = NULL;
+ error = EINVAL;
+ break;
+ }
+ mtx_unlock(&Giant);
+ if (error == 0) {
+ error = copyout((caddr_t)rup, (caddr_t)uap->rusage,
+ sizeof (struct rusage));
+ }
+ return(error);
+}
+
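+/*
+ * Accumulate the resource usage in ru2 into ru: the time values are
+ * added, the maximum RSS is taken, and the remaining fields are summed
+ * pairwise.
+ */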
+void
+ruadd(ru, ru2)
+ register struct rusage *ru, *ru2;
+{
+ register long *ip, *ip2;
+ register int i;
+
+ timevaladd(&ru->ru_utime, &ru2->ru_utime);
+ timevaladd(&ru->ru_stime, &ru2->ru_stime);
+ if (ru->ru_maxrss < ru2->ru_maxrss)
+ ru->ru_maxrss = ru2->ru_maxrss;
+ ip = &ru->ru_first; ip2 = &ru2->ru_first;
+ for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
+ *ip++ += *ip2++;
+}
+
+/*
+ * Make a copy of the plimit structure.
+ * We share these structures copy-on-write after fork,
+ * and copy when a limit is changed.
+ */
+struct plimit *
+limcopy(lim)
+ struct plimit *lim;
+{
+ register struct plimit *copy;
+
+ MALLOC(copy, struct plimit *, sizeof(struct plimit),
+ M_SUBPROC, M_WAITOK);
+ bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit));
+ copy->p_lflags = 0;
+ copy->p_refcnt = 1;
+ return (copy);
+}
+
+/*
+ * Find the uidinfo structure for a uid. This structure is used to
+ * track the total resource consumption (process count, socket buffer
+ * size, etc.) for the uid and impose limits.
+ */
+void
+uihashinit()
+{
+
+ uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
+ mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
+}
+
+/*
+ * Look up a uidinfo struct for the given uid.
+ * uihashtbl_mtx must be locked.
+ */
+static struct uidinfo *
+uilookup(uid)
+ uid_t uid;
+{
+ struct uihashhead *uipp;
+ struct uidinfo *uip;
+
+ mtx_assert(&uihashtbl_mtx, MA_OWNED);
+ uipp = UIHASH(uid);
+ LIST_FOREACH(uip, uipp, ui_hash)
+ if (uip->ui_uid == uid)
+ break;
+
+ return (uip);
+}
+
+/*
+ * Find or allocate a struct uidinfo for a particular uid.
+ * Increase refcount on the uidinfo struct returned.
+ * uifree() should be called on the returned struct uidinfo to release it.
+ */
+struct uidinfo *
+uifind(uid)
+ uid_t uid;
+{
+ struct uidinfo *uip;
+
+ mtx_lock(&uihashtbl_mtx);
+ uip = uilookup(uid);
+ if (uip == NULL) {
+ struct uidinfo *old_uip;
+
+ mtx_unlock(&uihashtbl_mtx);
+ uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
+ mtx_lock(&uihashtbl_mtx);
+ /*
+ * There's a chance someone created our uidinfo while we
+ * were in malloc and not holding the lock, so we have to
+		 * make sure we don't insert a duplicate uidinfo.
+ */
+ if ((old_uip = uilookup(uid)) != NULL) {
+ /* someone else beat us to it */
+ free(uip, M_UIDINFO);
+ uip = old_uip;
+ } else {
+ uip->ui_mtxp = mtx_pool_alloc();
+ uip->ui_uid = uid;
+ LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
+ }
+ }
+ uihold(uip);
+ mtx_unlock(&uihashtbl_mtx);
+ return (uip);
+}
+
+/*
+ * Place another refcount on a uidinfo struct.
+ */
+void
+uihold(uip)
+ struct uidinfo *uip;
+{
+
+ UIDINFO_LOCK(uip);
+ uip->ui_ref++;
+ UIDINFO_UNLOCK(uip);
+}
+
+/*-
+ * Since uidinfo structs have a long lifetime, we use an
+ * opportunistic refcounting scheme to avoid locking the lookup hash
+ * for each release.
+ *
+ * If the refcount hits 0, we need to free the structure,
+ * which means we need to lock the hash.
+ * Optimal case:
+ * After locking the struct and lowering the refcount, if we find
+ * that we don't need to free, simply unlock and return.
+ * Suboptimal case:
+ * If refcount lowering results in need to free, bump the count
+ *	back up, release the lock, and acquire the locks in the proper
+ * order to try again.
+ */
+void
+uifree(uip)
+ struct uidinfo *uip;
+{
+
+ /* Prepare for optimal case. */
+ UIDINFO_LOCK(uip);
+
+ if (--uip->ui_ref != 0) {
+ UIDINFO_UNLOCK(uip);
+ return;
+ }
+
+ /* Prepare for suboptimal case. */
+ uip->ui_ref++;
+ UIDINFO_UNLOCK(uip);
+ mtx_lock(&uihashtbl_mtx);
+ UIDINFO_LOCK(uip);
+
+ /*
+ * We must subtract one from the count again because we backed out
+ * our initial subtraction before dropping the lock.
+	 * Since another thread may have added a reference after we dropped
+	 * the initial lock, we have to test for zero again.
+ */
+ if (--uip->ui_ref == 0) {
+ LIST_REMOVE(uip, ui_hash);
+ mtx_unlock(&uihashtbl_mtx);
+ if (uip->ui_sbsize != 0)
+ /* XXX no %qd in kernel. Truncate. */
+ printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
+ uip->ui_uid, (long)uip->ui_sbsize);
+ if (uip->ui_proccnt != 0)
+ printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
+ uip->ui_uid, uip->ui_proccnt);
+ UIDINFO_UNLOCK(uip);
+ FREE(uip, M_UIDINFO);
+ return;
+ }
+
+ mtx_unlock(&uihashtbl_mtx);
+ UIDINFO_UNLOCK(uip);
+}
+
+/*
+ * Change the count associated with the number of processes
+ * a given user is using.  When 'max' is 0, don't enforce a limit.
+ */
+int
+chgproccnt(uip, diff, max)
+ struct uidinfo *uip;
+ int diff;
+ int max;
+{
+
+ UIDINFO_LOCK(uip);
+ /* don't allow them to exceed max, but allow subtraction */
+ if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
+ UIDINFO_UNLOCK(uip);
+ return (0);
+ }
+ uip->ui_proccnt += diff;
+ if (uip->ui_proccnt < 0)
+ printf("negative proccnt for uid = %d\n", uip->ui_uid);
+ UIDINFO_UNLOCK(uip);
+ return (1);
+}
+
+/*
+ * Change the total socket buffer size a user has used.
+ */
+int
+chgsbsize(uip, hiwat, to, max)
+ struct uidinfo *uip;
+ u_long *hiwat;
+ u_long to;
+ rlim_t max;
+{
+ rlim_t new;
+ int s;
+
+ s = splnet();
+ UIDINFO_LOCK(uip);
+ new = uip->ui_sbsize + to - *hiwat;
+ /* don't allow them to exceed max, but allow subtraction */
+ if (to > *hiwat && new > max) {
+ splx(s);
+ UIDINFO_UNLOCK(uip);
+ return (0);
+ }
+ uip->ui_sbsize = new;
+ *hiwat = to;
+ if (uip->ui_sbsize < 0)
+ printf("negative sbsize for uid = %d\n", uip->ui_uid);
+ splx(s);
+ UIDINFO_UNLOCK(uip);
+ return (1);
+}
diff --git a/sys/kern/kern_sema.c b/sys/kern/kern_sema.c
new file mode 100644
index 0000000..61435bd
--- /dev/null
+++ b/sys/kern/kern_sema.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Counting semaphores.
+ *
+ * Priority propagation will not generally raise the priority of semaphore
+ * "owners" (a misnomer in the context of semaphores), so should not be relied
+ * upon in combination with semaphores.
+ */
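+
+/*
+ * Typical usage (a sketch only; the sema_wait()/sema_post() convenience
+ * macros are assumed to be provided by <sys/sema.h>):
+ *
+ *	struct sema s;
+ *
+ *	sema_init(&s, 1, "example");
+ *	sema_wait(&s);		(acquire a unit, sleeping if necessary)
+ *	...critical work...
+ *	sema_post(&s);		(release the unit)
+ *	sema_destroy(&s);
+ */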
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sema.h>
+
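+/*
+ * Initialize a semaphore with the given initial value; the backing mutex
+ * and condition variable are named after 'description'.
+ */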
+void
+sema_init(struct sema *sema, int value, const char *description)
+{
+
+ KASSERT((value >= 0), ("%s(): negative value\n", __func__));
+
+ bzero(sema, sizeof(*sema));
+ mtx_init(&sema->sema_mtx, description, "sema backing lock",
+ MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ cv_init(&sema->sema_cv, description);
+ sema->sema_value = value;
+
+ CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description);
+}
+
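+/*
+ * Destroy a semaphore.  There must be no waiters.
+ */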
+void
+sema_destroy(struct sema *sema)
+{
+
+ CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema,
+ cv_wmesg(&sema->sema_cv));
+
+ KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__));
+
+ mtx_destroy(&sema->sema_mtx);
+ cv_destroy(&sema->sema_cv);
+}
+
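+/*
+ * Release one unit: increment the semaphore value and wake a waiter,
+ * if any.
+ */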
+void
+_sema_post(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ sema->sema_value++;
+ if (sema->sema_waiters && sema->sema_value > 0)
+ cv_signal(&sema->sema_cv);
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
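+/*
+ * Acquire one unit, sleeping until the semaphore value becomes positive.
+ */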
+void
+_sema_wait(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ while (sema->sema_value == 0) {
+ sema->sema_waiters++;
+ cv_wait(&sema->sema_cv, &sema->sema_mtx);
+ sema->sema_waiters--;
+ }
+ sema->sema_value--;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
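+/*
+ * Like _sema_wait(), but give up after 'timo' ticks.  Returns 1 if a
+ * unit was acquired and 0 on timeout.
+ */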
+int
+_sema_timedwait(struct sema *sema, int timo, const char *file, int line)
+{
+ int ret, timed_out;
+
+ mtx_lock(&sema->sema_mtx);
+
+ /*
+ * A spurious wakeup will cause the timeout interval to start over.
+ * This isn't a big deal as long as spurious wakeups don't occur
+ * continuously, since the timeout period is merely a lower bound on how
+ * long to wait.
+ */
+ for (timed_out = 0; sema->sema_value == 0 && timed_out == 0;) {
+ sema->sema_waiters++;
+ timed_out = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo);
+ sema->sema_waiters--;
+ }
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ ret = 1;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ ret = 0;
+
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
+
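+/*
+ * Try to acquire one unit without sleeping.  Returns 1 on success and
+ * 0 if the semaphore value is zero.
+ */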
+int
+_sema_trywait(struct sema *sema, const char *file, int line)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ ret = 1;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ ret = 0;
+
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
+
+int
+sema_value(struct sema *sema)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+ ret = sema->sema_value;
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
new file mode 100644
index 0000000..d2cb69d
--- /dev/null
+++ b/sys/kern/kern_shutdown.c
@@ -0,0 +1,564 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+#include "opt_hw_wdog.h"
+#include "opt_panic.h"
+#include "opt_show_busybufs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/disklabel.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/resourcevar.h>
+#include <sys/smp.h> /* smp_active */
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+
+#include <machine/pcb.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+
+#include <sys/signalvar.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#ifndef PANIC_REBOOT_WAIT_TIME
+#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
+#endif
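+
+/*
+ * The default above would typically be overridden from the kernel
+ * configuration file (the value reaches this file through opt_panic.h),
+ * for example:
+ *
+ *	options	PANIC_REBOOT_WAIT_TIME=30
+ *
+ * As implemented in shutdown_panic() below, 0 means reboot immediately
+ * and -1 means wait indefinitely for a console key press.
+ */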
+
+/*
+ * Note that stdarg.h and the ANSI-style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#ifdef DDB
+#ifdef DDB_UNATTENDED
+int debugger_on_panic = 0;
+#else
+int debugger_on_panic = 1;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW,
+ &debugger_on_panic, 0, "Run debugger on kernel panic");
+#endif
+
+int sync_on_panic = 1;
+SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW,
+ &sync_on_panic, 0, "Do a sync before rebooting from a panic");
+
+SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");
+
+#ifdef HW_WDOG
+/*
+ * If there is a hardware watchdog, point this at the function needed to
+ * hold it off.
+ * It is needed when the kernel performs lengthy operations, e.g. in wd.c
+ * when dumping core; it is most annoying to have your precious core dump
+ * only half written because the watchdog kicked in.
+ */
+watchdog_tickle_fn wdog_tickler = NULL;
+#endif /* HW_WDOG */
+
+/*
+ * Variable panicstr contains argument to first call to panic; used as flag
+ * to indicate that the kernel has already called panic.
+ */
+const char *panicstr;
+
+int dumping; /* system is dumping */
+static struct dumperinfo dumper; /* our selected dumper */
+static struct pcb dumppcb; /* "You Are Here" sign for dump-debuggers */
+
+static void boot(int) __dead2;
+static void poweroff_wait(void *, int);
+static void shutdown_halt(void *junk, int howto);
+static void shutdown_panic(void *junk, int howto);
+static void shutdown_reset(void *junk, int howto);
+
+/* register various local shutdown events */
+static void
+shutdown_conf(void *unused)
+{
+ EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200);
+}
+
+SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL)
+
+/*
+ * The system call that results in a reboot
+ *
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+reboot(struct thread *td, struct reboot_args *uap)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = suser(td)) == 0)
+ boot(uap->opt);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
+ */
+static int shutdown_howto = 0;
+
+void
+shutdown_nice(int howto)
+{
+ shutdown_howto = howto;
+
+	/* Send a signal to init(8) and have it shut down the world */
+ if (initproc != NULL) {
+ PROC_LOCK(initproc);
+ psignal(initproc, SIGINT);
+ PROC_UNLOCK(initproc);
+ } else {
+ /* No init(8) running, so simply reboot */
+ boot(RB_NOSYNC);
+ }
+ return;
+}
+static int waittime = -1;
+
+static void
+print_uptime(void)
+{
+ int f;
+ struct timespec ts;
+
+ getnanouptime(&ts);
+ printf("Uptime: ");
+ f = 0;
+ if (ts.tv_sec >= 86400) {
+ printf("%ldd", (long)ts.tv_sec / 86400);
+ ts.tv_sec %= 86400;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 3600) {
+ printf("%ldh", (long)ts.tv_sec / 3600);
+ ts.tv_sec %= 3600;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 60) {
+ printf("%ldm", (long)ts.tv_sec / 60);
+ ts.tv_sec %= 60;
+ f = 1;
+ }
+ printf("%lds\n", (long)ts.tv_sec);
+}
+
+static void
+doadump(void)
+{
+ savectx(&dumppcb);
+ dumping++;
+ dumpsys(&dumper);
+}
+
+/*
+ * Go through the rigmarole of shutting down...
+ * This used to be in machdep.c, but I'll be damned if I could see
+ * anything machine dependent in it.
+ */
+static void
+boot(int howto)
+{
+
+ /* collect extra flags that shutdown_nice might have set */
+ howto |= shutdown_howto;
+
+#ifdef DDB
+ /* We are out of the debugger now. */
+ db_active = 0;
+#endif
+
+#ifdef SMP
+ if (smp_active)
+ printf("boot() called on cpu#%d\n", PCPU_GET(cpuid));
+#endif
+ /*
+ * Do any callouts that should be done BEFORE syncing the filesystems.
+ */
+ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
+
+ /*
+ * Now sync filesystems
+ */
+ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
+ register struct buf *bp;
+ int iter, nbusy, pbusy;
+ int subiter;
+
+ waittime = 0;
+ printf("\nsyncing disks... ");
+
+ sync(&thread0, NULL);
+
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ for (iter = pbusy = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if ((bp->b_flags & B_INVAL) == 0 &&
+ BUF_REFCNT(bp) > 0) {
+ nbusy++;
+ } else if ((bp->b_flags & (B_DELWRI | B_INVAL))
+ == B_DELWRI) {
+ /* bawrite(bp);*/
+ nbusy++;
+ }
+ }
+ if (nbusy == 0)
+ break;
+ printf("%d ", nbusy);
+ if (nbusy < pbusy)
+ iter = 0;
+ pbusy = nbusy;
+ sync(&thread0, NULL);
+ if (curthread != NULL) {
+ DROP_GIANT();
+ for (subiter = 0; subiter < 50 * iter; subiter++) {
+ mtx_lock_spin(&sched_lock);
+ setrunqueue(curthread);
+ curthread->td_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch(); /* Allow interrupt threads to run */
+ mtx_unlock_spin(&sched_lock);
+ DELAY(1000);
+ }
+ PICKUP_GIANT();
+ } else
+ DELAY(50000 * iter);
+ }
+ printf("\n");
+ /*
+ * Count only busy local buffers to prevent forcing
+		 * an fsck if we're just a client of a wedged NFS server
+ */
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if (((bp->b_flags&B_INVAL) == 0 && BUF_REFCNT(bp)) ||
+ ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI)) {
+ if (bp->b_dev == NODEV) {
+ TAILQ_REMOVE(&mountlist,
+ bp->b_vp->v_mount, mnt_list);
+ continue;
+ }
+ nbusy++;
+#if defined(SHOW_BUSYBUFS) || defined(DIAGNOSTIC)
+ printf(
+ "%d: dev:%s, flags:%08lx, blkno:%ld, lblkno:%ld\n",
+ nbusy, devtoname(bp->b_dev),
+ bp->b_flags, (long)bp->b_blkno,
+ (long)bp->b_lblkno);
+#endif
+ }
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("giving up on %d buffers\n", nbusy);
+ DELAY(5000000); /* 5 seconds */
+ } else {
+ printf("done\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == 0)
+ vfs_unmountall();
+ }
+ DELAY(100000); /* wait for console output to finish */
+ }
+
+ print_uptime();
+
+ /*
+ * Ok, now do things that assume all filesystem activity has
+ * been completed.
+ */
+ EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
+ splhigh();
+ if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP &&
+ !cold && dumper.dumper != NULL && !dumping)
+ doadump();
+
+ /* Now that we're going to really halt the system... */
+ EVENTHANDLER_INVOKE(shutdown_final, howto);
+
+ for(;;) ; /* safety against shutdown_reset not working */
+ /* NOTREACHED */
+}
+
+/*
+ * If the shutdown was a clean halt, behave accordingly.
+ */
+static void
+shutdown_halt(void *junk, int howto)
+{
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+ switch (cngetc()) {
+ case -1: /* No console, just die */
+ cpu_halt();
+ /* NOTREACHED */
+ default:
+ howto &= ~RB_HALT;
+ break;
+ }
+ }
+}
+
+/*
+ * Check to see if the system panicked; pause and then reboot
+ * according to the specified delay.
+ */
+static void
+shutdown_panic(void *junk, int howto)
+{
+ int loop;
+
+ if (howto & RB_DUMP) {
+ if (PANIC_REBOOT_WAIT_TIME != 0) {
+ if (PANIC_REBOOT_WAIT_TIME != -1) {
+ printf("Automatic reboot in %d seconds - "
+ "press a key on the console to abort\n",
+ PANIC_REBOOT_WAIT_TIME);
+ for (loop = PANIC_REBOOT_WAIT_TIME * 10;
+ loop > 0; --loop) {
+ DELAY(1000 * 100); /* 1/10th second */
+ /* Did user type a key? */
+ if (cncheckc() != -1)
+ break;
+ }
+ if (!loop)
+ return;
+ }
+ } else { /* zero time specified - reboot NOW */
+ return;
+ }
+ printf("--> Press a key on the console to reboot,\n");
+ printf("--> or switch off the system now.\n");
+ cngetc();
+ }
+}
+
+/*
+ * Everything done, now reset
+ */
+static void
+shutdown_reset(void *junk, int howto)
+{
+ printf("Rebooting...\n");
+ DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
+ /* cpu_boot(howto); */ /* doesn't do anything at the moment */
+ cpu_reset();
+ /* NOTREACHED */ /* assuming reset worked */
+}
+
+#ifdef SMP
+static u_int panic_cpu = NOCPU;
+#endif
+
+/*
+ * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
+ * and then reboots. If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ *
+ * MPSAFE
+ */
+void
+panic(const char *fmt, ...)
+{
+ int bootopt;
+ va_list ap;
+ static char buf[256];
+
+#ifdef SMP
+ /*
+ * We don't want multiple CPUs to panic at the same time, so we
+ * use panic_cpu as a simple spinlock. We have to keep checking
+ * panic_cpu if we are spinning in case the panic on the first
+ * CPU is canceled.
+ */
+ if (panic_cpu != PCPU_GET(cpuid))
+ while (atomic_cmpset_int(&panic_cpu, NOCPU,
+ PCPU_GET(cpuid)) == 0)
+ while (panic_cpu != NOCPU)
+ ; /* nothing */
+#endif
+
+ bootopt = RB_AUTOBOOT | RB_DUMP;
+ if (panicstr)
+ bootopt |= RB_NOSYNC;
+ else
+ panicstr = fmt;
+
+ va_start(ap, fmt);
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ if (panicstr == fmt)
+ panicstr = buf;
+ va_end(ap);
+ printf("panic: %s\n", buf);
+#ifdef SMP
+ /* two separate prints in case of an unmapped page and trap */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+#ifdef APIC_IO
+ printf("lapic.id = %08x\n", lapic.id);
+#endif
+#endif
+
+#if defined(DDB)
+ if (debugger_on_panic)
+ Debugger ("panic");
+#ifdef RESTARTABLE_PANICS
+ /* See if the user aborted the panic, in which case we continue. */
+ if (panicstr == NULL) {
+#ifdef SMP
+ atomic_store_rel_int(&panic_cpu, NOCPU);
+#endif
+ return;
+ }
+#endif
+#endif
+ if (!sync_on_panic)
+ bootopt |= RB_NOSYNC;
+ boot(bootopt);
+}
+
+/*
+ * Support for poweroff delay.
+ */
+#ifndef POWEROFF_DELAY
+# define POWEROFF_DELAY 5000
+#endif
+static int poweroff_delay = POWEROFF_DELAY;
+
+SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
+ &poweroff_delay, 0, "");
+
+static void
+poweroff_wait(void *junk, int howto)
+{
+ if(!(howto & RB_POWEROFF) || poweroff_delay <= 0)
+ return;
+ DELAY(poweroff_delay * 1000);
+}
+
+/*
+ * Some system processes (e.g. syncer) need to be stopped at appropriate
+ * points in their main loops prior to a system shutdown, so that they
+ * won't interfere with the shutdown process (e.g. by holding a disk buf
+ * to cause sync to fail). For each of these system processes, register
+ * kproc_shutdown() as a handler for one of the shutdown events.
+ */
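+/*
+ * For example, such a process would typically register itself from its own
+ * main loop roughly as follows ("p" being its struct proc; the choice of
+ * shutdown event and priority is up to the caller):
+ *
+ *	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+ *	    SHUTDOWN_PRI_FIRST);
+ */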
+static int kproc_shutdown_wait = 60;
+SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
+ &kproc_shutdown_wait, 0, "");
+
+void
+kproc_shutdown(void *arg, int howto)
+{
+ struct proc *p;
+ int error;
+
+ if (panicstr)
+ return;
+
+ p = (struct proc *)arg;
+ printf("Waiting (max %d seconds) for system process `%s' to stop...",
+ kproc_shutdown_wait, p->p_comm);
+ error = kthread_suspend(p, kproc_shutdown_wait * hz);
+
+ if (error == EWOULDBLOCK)
+ printf("timed out\n");
+ else
+ printf("stopped\n");
+}
+
+/* Registration of dumpers */
+int
+set_dumper(struct dumperinfo *di)
+{
+ if (di == NULL) {
+ bzero(&dumper, sizeof dumper);
+ return (0);
+ }
+ if (dumper.dumper != NULL)
+ return (EBUSY);
+ dumper = *di;
+ return (0);
+}
+
+#if defined(__powerpc__) || defined(__sparc64__)
+void
+dumpsys(struct dumperinfo *di __unused)
+{
+
+ printf("Kernel dumps not implemented on this architecture\n");
+}
+#endif
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
new file mode 100644
index 0000000..8af0280
--- /dev/null
+++ b/sys/kern/kern_sig.c
@@ -0,0 +1,2153 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/acct.h>
+#include <sys/fcntl.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/wait.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/resourcevar.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syslog.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+
+#include <machine/cpu.h>
+
+#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
+
+static int coredump(struct thread *);
+static int do_sigaction(struct proc *p, int sig, struct sigaction *act,
+ struct sigaction *oact, int old);
+static int do_sigprocmask(struct proc *p, int how, sigset_t *set,
+ sigset_t *oset, int old);
+static char *expand_name(const char *, uid_t, pid_t);
+static int killpg1(struct thread *td, int sig, int pgid, int all);
+static int sig_ffs(sigset_t *set);
+static int sigprop(int sig);
+static void stop(struct proc *);
+
+static int filt_sigattach(struct knote *kn);
+static void filt_sigdetach(struct knote *kn);
+static int filt_signal(struct knote *kn, long hint);
+
+struct filterops sig_filtops =
+ { 0, filt_sigattach, filt_sigdetach, filt_signal };
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+ &kern_logsigexit, 0,
+ "Log processes quitting on abnormal signals to syslog(3)");
+
+/*
+ * Policy -- Can ucred cr1 send SIGIO to process cr2?
+ * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
+ * in the right situations.
+ */
+#define CANSIGIO(cr1, cr2) \
+ ((cr1)->cr_uid == 0 || \
+ (cr1)->cr_ruid == (cr2)->cr_ruid || \
+ (cr1)->cr_uid == (cr2)->cr_ruid || \
+ (cr1)->cr_ruid == (cr2)->cr_uid || \
+ (cr1)->cr_uid == (cr2)->cr_uid)
+
+int sugid_coredump;
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
+ &sugid_coredump, 0, "Enable coredumping set user/group ID processes");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+ &do_coredump, 0, "Enable/Disable coredumps");
+
+/*
+ * Signal properties and actions.
+ * The array below categorizes the signals and their default actions
+ * according to the following properties:
+ */
+#define SA_KILL 0x01 /* terminates process by default */
+#define SA_CORE 0x02 /* ditto and coredumps */
+#define SA_STOP 0x04 /* suspend process */
+#define SA_TTYSTOP 0x08 /* ditto, from tty */
+#define SA_IGNORE 0x10 /* ignore by default */
+#define SA_CONT 0x20 /* continue if suspended */
+#define SA_CANTMASK 0x40 /* non-maskable, catchable */
+
+static int sigproptbl[NSIG] = {
+ SA_KILL, /* SIGHUP */
+ SA_KILL, /* SIGINT */
+ SA_KILL|SA_CORE, /* SIGQUIT */
+ SA_KILL|SA_CORE, /* SIGILL */
+ SA_KILL|SA_CORE, /* SIGTRAP */
+ SA_KILL|SA_CORE, /* SIGABRT */
+ SA_KILL|SA_CORE, /* SIGEMT */
+ SA_KILL|SA_CORE, /* SIGFPE */
+ SA_KILL, /* SIGKILL */
+ SA_KILL|SA_CORE, /* SIGBUS */
+ SA_KILL|SA_CORE, /* SIGSEGV */
+ SA_KILL|SA_CORE, /* SIGSYS */
+ SA_KILL, /* SIGPIPE */
+ SA_KILL, /* SIGALRM */
+ SA_KILL, /* SIGTERM */
+ SA_IGNORE, /* SIGURG */
+ SA_STOP, /* SIGSTOP */
+ SA_STOP|SA_TTYSTOP, /* SIGTSTP */
+ SA_IGNORE|SA_CONT, /* SIGCONT */
+ SA_IGNORE, /* SIGCHLD */
+ SA_STOP|SA_TTYSTOP, /* SIGTTIN */
+ SA_STOP|SA_TTYSTOP, /* SIGTTOU */
+ SA_IGNORE, /* SIGIO */
+ SA_KILL, /* SIGXCPU */
+ SA_KILL, /* SIGXFSZ */
+ SA_KILL, /* SIGVTALRM */
+ SA_KILL, /* SIGPROF */
+ SA_IGNORE, /* SIGWINCH */
+ SA_IGNORE, /* SIGINFO */
+ SA_KILL, /* SIGUSR1 */
+ SA_KILL, /* SIGUSR2 */
+};
+
+/*
+ * Determine the signal that should be delivered to process p, the current
+ * process; return 0 if none. If there is a pending stop signal with default
+ * action, the process stops in issignal().
+ *
+ * MP SAFE.
+ */
+int
+cursig(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&sched_lock, MA_NOTOWNED);
+ return (SIGPENDING(p) ? issignal(p) : 0);
+}
+
+/*
+ * Arrange for ast() to handle unmasked pending signals on return to user
+ * mode. This must be called whenever a signal is added to p_siglist or
+ * unmasked in p_sigmask.
+ */
+void
+signotify(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_lock_spin(&sched_lock);
+ if (SIGPENDING(p)) {
+ p->p_sflag |= PS_NEEDSIGCHK;
+ p->p_kse.ke_flags |= KEF_ASTPENDING; /* XXXKSE */
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+static __inline int
+sigprop(int sig)
+{
+
+ if (sig > 0 && sig < NSIG)
+ return (sigproptbl[_SIG_IDX(sig)]);
+ return (0);
+}
+
+static __inline int
+sig_ffs(sigset_t *set)
+{
+ int i;
+
+ for (i = 0; i < _SIG_WORDS; i++)
+ if (set->__bits[i])
+ return (ffs(set->__bits[i]) + (i * 32));
+ return (0);
+}
+
+/*
+ * do_sigaction
+ * sigaction
+ * osigaction
+ */
+static int
+do_sigaction(p, sig, act, oact, old)
+ struct proc *p;
+ register int sig;
+ struct sigaction *act, *oact;
+ int old;
+{
+ register struct sigacts *ps;
+
+ if (!_SIG_VALID(sig))
+ return (EINVAL);
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ if (oact) {
+ oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
+ oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ oact->sa_flags = 0;
+ if (SIGISMEMBER(ps->ps_sigonstack, sig))
+ oact->sa_flags |= SA_ONSTACK;
+ if (!SIGISMEMBER(ps->ps_sigintr, sig))
+ oact->sa_flags |= SA_RESTART;
+ if (SIGISMEMBER(ps->ps_sigreset, sig))
+ oact->sa_flags |= SA_RESETHAND;
+ if (SIGISMEMBER(ps->ps_signodefer, sig))
+ oact->sa_flags |= SA_NODEFER;
+ if (SIGISMEMBER(ps->ps_siginfo, sig))
+ oact->sa_flags |= SA_SIGINFO;
+ if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDSTOP)
+ oact->sa_flags |= SA_NOCLDSTOP;
+ if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDWAIT)
+ oact->sa_flags |= SA_NOCLDWAIT;
+ }
+ if (act) {
+ if ((sig == SIGKILL || sig == SIGSTOP) &&
+ act->sa_handler != SIG_DFL) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ /*
+ * Change setting atomically.
+ */
+
+ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
+ SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
+ if (act->sa_flags & SA_SIGINFO) {
+ ps->ps_sigact[_SIG_IDX(sig)] =
+ (__sighandler_t *)act->sa_sigaction;
+ SIGADDSET(ps->ps_siginfo, sig);
+ } else {
+ ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
+ SIGDELSET(ps->ps_siginfo, sig);
+ }
+ if (!(act->sa_flags & SA_RESTART))
+ SIGADDSET(ps->ps_sigintr, sig);
+ else
+ SIGDELSET(ps->ps_sigintr, sig);
+ if (act->sa_flags & SA_ONSTACK)
+ SIGADDSET(ps->ps_sigonstack, sig);
+ else
+ SIGDELSET(ps->ps_sigonstack, sig);
+ if (act->sa_flags & SA_RESETHAND)
+ SIGADDSET(ps->ps_sigreset, sig);
+ else
+ SIGDELSET(ps->ps_sigreset, sig);
+ if (act->sa_flags & SA_NODEFER)
+ SIGADDSET(ps->ps_signodefer, sig);
+ else
+ SIGDELSET(ps->ps_signodefer, sig);
+#ifdef COMPAT_SUNOS
+ if (act->sa_flags & SA_USERTRAMP)
+ SIGADDSET(ps->ps_usertramp, sig);
+ else
+ SIGDELSET(ps->ps_usertramp, sig);
+#endif
+ if (sig == SIGCHLD) {
+ if (act->sa_flags & SA_NOCLDSTOP)
+ p->p_procsig->ps_flag |= PS_NOCLDSTOP;
+ else
+ p->p_procsig->ps_flag &= ~PS_NOCLDSTOP;
+ if (act->sa_flags & SA_NOCLDWAIT) {
+ /*
+ * Paranoia: since SA_NOCLDWAIT is implemented
+				 * trusting it to reap the zombie), PID 1 itself
+ * trust it to reap the zombie), PID 1 itself
+ * is forbidden to set SA_NOCLDWAIT.
+ */
+ if (p->p_pid == 1)
+ p->p_procsig->ps_flag &= ~PS_NOCLDWAIT;
+ else
+ p->p_procsig->ps_flag |= PS_NOCLDWAIT;
+ } else
+ p->p_procsig->ps_flag &= ~PS_NOCLDWAIT;
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ p->p_procsig->ps_flag |= PS_CLDSIGIGN;
+ else
+ p->p_procsig->ps_flag &= ~PS_CLDSIGIGN;
+ }
+ /*
+ * Set bit in p_sigignore for signals that are set to SIG_IGN,
+ * and for signals set to SIG_DFL where the default is to
+ * ignore. However, don't put SIGCONT in p_sigignore, as we
+ * have to restart the process.
+ */
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ (sigprop(sig) & SA_IGNORE &&
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
+ /* never to be seen again */
+ SIGDELSET(p->p_siglist, sig);
+ if (sig != SIGCONT)
+ /* easier in psignal */
+ SIGADDSET(p->p_sigignore, sig);
+ SIGDELSET(p->p_sigcatch, sig);
+ } else {
+ SIGDELSET(p->p_sigignore, sig);
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
+ SIGDELSET(p->p_sigcatch, sig);
+ else
+ SIGADDSET(p->p_sigcatch, sig);
+ }
+#ifdef COMPAT_43
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || !old)
+ SIGDELSET(ps->ps_osigset, sig);
+ else
+ SIGADDSET(ps->ps_osigset, sig);
+#endif
+ }
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaction_args {
+ int sig;
+ struct sigaction *act;
+ struct sigaction *oact;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+sigaction(td, uap)
+ struct thread *td;
+ register struct sigaction_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct sigaction act, oact;
+ register struct sigaction *actp, *oactp;
+ int error;
+
+ mtx_lock(&Giant);
+
+ actp = (uap->act != NULL) ? &act : NULL;
+ oactp = (uap->oact != NULL) ? &oact : NULL;
+ if (actp) {
+ error = copyin(uap->act, actp, sizeof(act));
+ if (error)
+ goto done2;
+ }
+ error = do_sigaction(p, uap->sig, actp, oactp, 0);
+ if (oactp && !error) {
+ error = copyout(oactp, uap->oact, sizeof(oact));
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigaction_args {
+ int signum;
+ struct osigaction *nsa;
+ struct osigaction *osa;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osigaction(td, uap)
+ struct thread *td;
+ register struct osigaction_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct osigaction sa;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+
+ nsap = (uap->nsa != NULL) ? &nsa : NULL;
+ osap = (uap->osa != NULL) ? &osa : NULL;
+
+ mtx_lock(&Giant);
+
+ if (nsap) {
+ error = copyin(uap->nsa, &sa, sizeof(sa));
+ if (error)
+ goto done2;
+ nsap->sa_handler = sa.sa_handler;
+ nsap->sa_flags = sa.sa_flags;
+ OSIG2SIG(sa.sa_mask, nsap->sa_mask);
+ }
+ error = do_sigaction(p, uap->signum, nsap, osap, 1);
+ if (osap && !error) {
+ sa.sa_handler = osap->sa_handler;
+ sa.sa_flags = osap->sa_flags;
+ SIG2OSIG(osap->sa_mask, sa.sa_mask);
+ error = copyout(&sa, uap->osa, sizeof(sa));
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Initialize signal state for process 0;
+ * set to ignore signals that are ignored by default.
+ */
+void
+siginit(p)
+ struct proc *p;
+{
+ register int i;
+
+ PROC_LOCK(p);
+ for (i = 1; i <= NSIG; i++)
+ if (sigprop(i) & SA_IGNORE && i != SIGCONT)
+ SIGADDSET(p->p_sigignore, i);
+ PROC_UNLOCK(p);
+}
+
+/*
+ * Reset signals for an exec of the specified process.
+ */
+void
+execsigs(p)
+ register struct proc *p;
+{
+ register struct sigacts *ps;
+ register int sig;
+
+ /*
+ * Reset caught signals. Held signals remain held
+ * through p_sigmask (unless they were caught,
+ * and are now ignored by default).
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ ps = p->p_sigacts;
+ while (SIGNOTEMPTY(p->p_sigcatch)) {
+ sig = sig_ffs(&p->p_sigcatch);
+ SIGDELSET(p->p_sigcatch, sig);
+ if (sigprop(sig) & SA_IGNORE) {
+ if (sig != SIGCONT)
+ SIGADDSET(p->p_sigignore, sig);
+ SIGDELSET(p->p_siglist, sig);
+ }
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ /*
+ * Reset stack state to the user stack.
+ * Clear set of signals caught on the signal stack.
+ */
+ p->p_sigstk.ss_flags = SS_DISABLE;
+ p->p_sigstk.ss_size = 0;
+ p->p_sigstk.ss_sp = 0;
+ p->p_flag &= ~P_ALTSTACK;
+ /*
+ * Reset the "no zombies if child dies" flag, as Solaris does.
+ */
+ p->p_procsig->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
+}
+
+/*
+ * do_sigprocmask()
+ *
+ * Manipulate signal mask.
+ */
+static int
+do_sigprocmask(p, how, set, oset, old)
+ struct proc *p;
+ int how;
+ sigset_t *set, *oset;
+ int old;
+{
+ int error;
+
+ PROC_LOCK(p);
+ if (oset != NULL)
+ *oset = p->p_sigmask;
+
+ error = 0;
+ if (set != NULL) {
+ switch (how) {
+ case SIG_BLOCK:
+ SIG_CANTMASK(*set);
+ SIGSETOR(p->p_sigmask, *set);
+ break;
+ case SIG_UNBLOCK:
+ SIGSETNAND(p->p_sigmask, *set);
+ signotify(p);
+ break;
+ case SIG_SETMASK:
+ SIG_CANTMASK(*set);
+ if (old)
+ SIGSETLO(p->p_sigmask, *set);
+ else
+ p->p_sigmask = *set;
+ signotify(p);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * sigprocmask() - MP SAFE (XXXKSE: under KSE it isn't)
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigprocmask_args {
+ int how;
+ const sigset_t *set;
+ sigset_t *oset;
+};
+#endif
+int
+sigprocmask(td, uap)
+ register struct thread *td;
+ struct sigprocmask_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t set, oset;
+ sigset_t *setp, *osetp;
+ int error;
+
+ setp = (uap->set != NULL) ? &set : NULL;
+ osetp = (uap->oset != NULL) ? &oset : NULL;
+ if (setp) {
+ error = copyin(uap->set, setp, sizeof(set));
+ if (error)
+ return (error);
+ }
+ error = do_sigprocmask(p, uap->how, setp, osetp, 0);
+ if (osetp && !error) {
+ error = copyout(osetp, uap->oset, sizeof(oset));
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+/*
+ * osigprocmask() - MP SAFE
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigprocmask_args {
+ int how;
+ osigset_t mask;
+};
+#endif
+int
+osigprocmask(td, uap)
+ register struct thread *td;
+ struct osigprocmask_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t set, oset;
+ int error;
+
+ OSIG2SIG(uap->mask, set);
+ error = do_sigprocmask(p, uap->how, &set, &oset, 1);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ sigset_t *set;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+sigpending(td, uap)
+ struct thread *td;
+ struct sigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t siglist;
+ int error;
+
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ siglist = p->p_siglist;
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ error = copyout(&siglist, uap->set, sizeof(sigset_t));
+ return(error);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigpending_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osigpending(td, uap)
+ struct thread *td;
+ struct osigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ SIG2OSIG(p->p_siglist, td->td_retval[0]);
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (0);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Generalized interface signal handler, 4.3-compatible.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigvec_args {
+ int signum;
+ struct sigvec *nsv;
+ struct sigvec *osv;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osigvec(td, uap)
+ struct thread *td;
+ register struct osigvec_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct sigvec vec;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+ nsap = (uap->nsv != NULL) ? &nsa : NULL;
+ osap = (uap->osv != NULL) ? &osa : NULL;
+ if (nsap) {
+ error = copyin(uap->nsv, &vec, sizeof(vec));
+ if (error)
+ return (error);
+ nsap->sa_handler = vec.sv_handler;
+ OSIG2SIG(vec.sv_mask, nsap->sa_mask);
+ nsap->sa_flags = vec.sv_flags;
+ nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
+#ifdef COMPAT_SUNOS
+ nsap->sa_flags |= SA_USERTRAMP;
+#endif
+ }
+ mtx_lock(&Giant);
+ error = do_sigaction(p, uap->signum, nsap, osap, 1);
+ mtx_unlock(&Giant);
+ if (osap && !error) {
+ vec.sv_handler = osap->sa_handler;
+ SIG2OSIG(osap->sa_mask, vec.sv_mask);
+ vec.sv_flags = osap->sa_flags;
+ vec.sv_flags &= ~SA_NOCLDWAIT;
+ vec.sv_flags ^= SA_RESTART;
+#ifdef COMPAT_SUNOS
+ vec.sv_flags &= ~SA_NOCLDSTOP;
+#endif
+ error = copyout(&vec, uap->osv, sizeof(vec));
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigblock_args {
+ int mask;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+osigblock(td, uap)
+ register struct thread *td;
+ struct osigblock_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t set;
+
+ OSIG2SIG(uap->mask, set);
+ SIG_CANTMASK(set);
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ SIG2OSIG(p->p_sigmask, td->td_retval[0]);
+ SIGSETOR(p->p_sigmask, set);
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigsetmask_args {
+ int mask;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+osigsetmask(td, uap)
+ struct thread *td;
+ struct osigsetmask_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t set;
+
+ OSIG2SIG(uap->mask, set);
+ SIG_CANTMASK(set);
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ SIG2OSIG(p->p_sigmask, td->td_retval[0]);
+ SIGSETLO(p->p_sigmask, set);
+ signotify(p);
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (0);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Suspend process until signal, providing mask to be set
+ * in the meantime. Note nonstandard calling convention:
+ * libc stub passes mask, not pointer, to save a copyin.
+ ***** XXXKSE this doesn't make sense under KSE.
+ ***** Do we suspend the thread or all threads in the process?
+ ***** How do we suspend threads running NOW on another processor?
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sigsuspend_args {
+ const sigset_t *sigmask;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+sigsuspend(td, uap)
+ struct thread *td;
+ struct sigsuspend_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t mask;
+ register struct sigacts *ps;
+ int error;
+
+ error = copyin(uap->sigmask, &mask, sizeof(mask));
+ if (error)
+ return (error);
+
+ /*
+ * When returning from sigsuspend, we want
+ * the old mask to be restored after the
+ * signal handler has finished. Thus, we
+ * save it here and mark the sigacts structure
+ * to indicate this.
+ */
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ p->p_oldsigmask = p->p_sigmask;
+ p->p_flag |= P_OLDMASK;
+
+ SIG_CANTMASK(mask);
+ p->p_sigmask = mask;
+ signotify(p);
+ while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0)
+ /* void */;
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ /* always return EINTR rather than ERESTART... */
+ return (EINTR);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigsuspend_args {
+ osigset_t mask;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osigsuspend(td, uap)
+ struct thread *td;
+ struct osigsuspend_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t mask;
+ register struct sigacts *ps;
+
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ p->p_oldsigmask = p->p_sigmask;
+ p->p_flag |= P_OLDMASK;
+ OSIG2SIG(uap->mask, mask);
+ SIG_CANTMASK(mask);
+ SIGSETLO(p->p_sigmask, mask);
+ signotify(p);
+ while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0)
+ /* void */;
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ /* always return EINTR rather than ERESTART... */
+ return (EINTR);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osigstack_args {
+ struct sigstack *nss;
+ struct sigstack *oss;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osigstack(td, uap)
+ struct thread *td;
+ register struct osigstack_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct sigstack ss;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ if (uap->oss != NULL) {
+ PROC_LOCK(p);
+ ss.ss_sp = p->p_sigstk.ss_sp;
+ ss.ss_onstack = sigonstack(cpu_getstack(td));
+ PROC_UNLOCK(p);
+ error = copyout(&ss, uap->oss, sizeof(struct sigstack));
+ if (error)
+ goto done2;
+ }
+
+ if (uap->nss != NULL) {
+ if ((error = copyin(uap->nss, &ss, sizeof(ss))) != 0)
+ goto done2;
+ PROC_LOCK(p);
+ p->p_sigstk.ss_sp = ss.ss_sp;
+ p->p_sigstk.ss_size = 0;
+ p->p_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK;
+ p->p_flag |= P_ALTSTACK;
+ PROC_UNLOCK(p);
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaltstack_args {
+ stack_t *ss;
+ stack_t *oss;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+sigaltstack(td, uap)
+ struct thread *td;
+ register struct sigaltstack_args *uap;
+{
+ struct proc *p = td->td_proc;
+ stack_t ss;
+ int oonstack;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ oonstack = sigonstack(cpu_getstack(td));
+
+ if (uap->oss != NULL) {
+ PROC_LOCK(p);
+ ss = p->p_sigstk;
+ ss.ss_flags = (p->p_flag & P_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ PROC_UNLOCK(p);
+ if ((error = copyout(&ss, uap->oss, sizeof(stack_t))) != 0)
+ goto done2;
+ }
+
+ if (uap->ss != NULL) {
+ if (oonstack) {
+ error = EPERM;
+ goto done2;
+ }
+ if ((error = copyin(uap->ss, &ss, sizeof(ss))) != 0)
+ goto done2;
+ if ((ss.ss_flags & ~SS_DISABLE) != 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (!(ss.ss_flags & SS_DISABLE)) {
+ if (ss.ss_size < p->p_sysent->sv_minsigstksz) {
+ error = ENOMEM;
+ goto done2;
+ }
+ PROC_LOCK(p);
+ p->p_sigstk = ss;
+ p->p_flag |= P_ALTSTACK;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_LOCK(p);
+ p->p_flag &= ~P_ALTSTACK;
+ PROC_UNLOCK(p);
+ }
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Common code for kill process group/broadcast kill.
+ * cp is calling process.
+ */
+int
+killpg1(td, sig, pgid, all)
+ register struct thread *td;
+ int sig, pgid, all;
+{
+ register struct proc *p;
+ struct pgrp *pgrp;
+ int nfound = 0;
+
+ if (all) {
+ /*
+ * broadcast
+ */
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p == td->td_proc) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p_cansignal(td, p, sig) == 0) {
+ nfound++;
+ if (sig)
+ psignal(p, sig);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ } else {
+ sx_slock(&proctree_lock);
+ if (pgid == 0) {
+ /*
+ * zero pgid means send to my process group.
+ */
+ pgrp = td->td_proc->p_pgrp;
+ PGRP_LOCK(pgrp);
+ } else {
+ pgrp = pgfind(pgid);
+ if (pgrp == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p->p_stat == SZOMB) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p_cansignal(td, p, sig) == 0) {
+ nfound++;
+ if (sig)
+ psignal(p, sig);
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pgrp);
+ }
+ return (nfound ? 0 : ESRCH);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kill_args {
+ int pid;
+ int signum;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+kill(td, uap)
+ register struct thread *td;
+ register struct kill_args *uap;
+{
+ register struct proc *p;
+ int error = 0;
+
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ mtx_lock(&Giant);
+ if (uap->pid > 0) {
+ /* kill single process */
+ if ((p = pfind(uap->pid)) == NULL) {
+ error = ESRCH;
+ } else if ((error = p_cansignal(td, p, uap->signum)) != 0) {
+ PROC_UNLOCK(p);
+ } else {
+ if (uap->signum)
+ psignal(p, uap->signum);
+ PROC_UNLOCK(p);
+ error = 0;
+ }
+ } else {
+ switch (uap->pid) {
+ case -1: /* broadcast signal */
+ error = killpg1(td, uap->signum, 0, 1);
+ break;
+ case 0: /* signal own process group */
+ error = killpg1(td, uap->signum, 0, 0);
+ break;
+ default: /* negative explicit process group */
+ error = killpg1(td, uap->signum, -uap->pid, 0);
+ break;
+ }
+ }
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct okillpg_args {
+ int pgid;
+ int signum;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+okillpg(td, uap)
+ struct thread *td;
+ register struct okillpg_args *uap;
+{
+ int error;
+
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = killpg1(td, uap->signum, uap->pgid, 0);
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Send a signal to a process group.
+ */
+void
+gsignal(pgid, sig)
+ int pgid, sig;
+{
+ struct pgrp *pgrp;
+
+ if (pgid != 0) {
+ sx_slock(&proctree_lock);
+ pgrp = pgfind(pgid);
+ sx_sunlock(&proctree_lock);
+ if (pgrp != NULL) {
+ pgsignal(pgrp, sig, 0);
+ PGRP_UNLOCK(pgrp);
+ }
+ }
+}
+
+/*
+ * Send a signal to a process group. If checkctty is 1,
+ * limit to members which have a controlling terminal.
+ */
+void
+pgsignal(pgrp, sig, checkctty)
+ struct pgrp *pgrp;
+ int sig, checkctty;
+{
+ register struct proc *p;
+
+ if (pgrp) {
+ PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (checkctty == 0 || p->p_flag & P_CONTROLT)
+ psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+ }
+}
+
+/*
+ * Send a signal caused by a trap to the current process.
+ * If it will be caught immediately, deliver it with correct code.
+ * Otherwise, post it normally.
+ *
+ * MPSAFE
+ */
+void
+trapsignal(p, sig, code)
+ struct proc *p;
+ register int sig;
+ u_long code;
+{
+ register struct sigacts *ps = p->p_sigacts;
+
+ PROC_LOCK(p);
+ if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(p->p_sigcatch, sig) &&
+ !SIGISMEMBER(p->p_sigmask, sig)) {
+ p->p_stats->p_ru.ru_nsignals++;
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_PSIG))
+ ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
+ &p->p_sigmask, code);
+#endif
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig,
+ &p->p_sigmask, code);
+ SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(p->p_sigmask, sig);
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See do_sigaction() for origin of this code.
+ */
+ SIGDELSET(p->p_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(p->p_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ } else {
+ p->p_code = code; /* XXX for core dump/debugger */
+ p->p_sig = sig; /* XXX to verify code */
+ psignal(p, sig);
+ }
+ PROC_UNLOCK(p);
+}
+
+/*
+ * Send the signal to the process. If the signal has an action, the action
+ * is usually performed by the target process rather than the caller; we add
+ * the signal to the set of pending signals for the process.
+ *
+ * Exceptions:
+ * o When a stop signal is sent to a sleeping process that takes the
+ * default action, the process is stopped without awakening it.
+ * o SIGCONT restarts stopped processes (or puts them back to sleep)
+ * regardless of the signal action (e.g., blocked or ignored).
+ *
+ * Other ignored signals are discarded immediately.
+ */
+void
+psignal(p, sig)
+ register struct proc *p;
+ register int sig;
+{
+ register int prop;
+ register sig_t action;
+ struct thread *td;
+#ifdef SMP
+ struct ksegrp *kg;
+#endif
+
+ KASSERT(_SIG_VALID(sig),
+ ("psignal(): invalid signal %d\n", sig));
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KNOTE(&p->p_klist, NOTE_SIGNAL | sig);
+
+ prop = sigprop(sig);
+
+ /*
+ * If proc is traced, always give parent a chance;
+ * if signal event is tracked by procfs, give *that*
+ * a chance, as well.
+ */
+ if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) {
+ action = SIG_DFL;
+ } else {
+ /*
+ * If the signal is being ignored,
+ * then we forget about it immediately.
+ * (Note: we don't set SIGCONT in p_sigignore,
+ * and if it is set to SIG_IGN,
+ * action will be SIG_DFL here.)
+ */
+ if (SIGISMEMBER(p->p_sigignore, sig) || (p->p_flag & P_WEXIT))
+ return;
+ if (SIGISMEMBER(p->p_sigmask, sig))
+ action = SIG_HOLD;
+ else if (SIGISMEMBER(p->p_sigcatch, sig))
+ action = SIG_CATCH;
+ else
+ action = SIG_DFL;
+ }
+
+ /*
+	 * Bring the priority of a process up if we want it to get
+	 * killed in this lifetime.
+	 * XXXKSE: think of a better way to do this.
+ *
+ * What we need to do is see if there is a thread that will
+ * be able to accept the signal. e.g.
+ * FOREACH_THREAD_IN_PROC() {
+ * if runnable, we're done
+ * else pick one at random.
+ * }
+ */
+ /* XXXKSE
+ * For now there is one thread per proc.
+ * Effectively select one sucker thread..
+ */
+ td = FIRST_THREAD_IN_PROC(p);
+ mtx_lock_spin(&sched_lock);
+ if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) &&
+ (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0))
+ p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+
+ if (prop & SA_CONT)
+ SIG_STOPSIGMASK(p->p_siglist);
+
+ if (prop & SA_STOP) {
+ /*
+ * If sending a tty stop signal to a member of an orphaned
+ * process group, discard the signal here if the action
+ * is default; don't stop the process below if sleeping,
+ * and don't clear any pending SIGCONT.
+ */
+ if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
+ action == SIG_DFL)
+ return;
+ SIG_CONTSIGMASK(p->p_siglist);
+ }
+ SIGADDSET(p->p_siglist, sig);
+ mtx_lock_spin(&sched_lock);
+ signotify(p);
+
+ /*
+ * Defer further processing for signals which are held,
+ * except that stopped processes must be continued by SIGCONT.
+ */
+ if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) {
+ mtx_unlock_spin(&sched_lock);
+ return;
+ }
+
+ switch (p->p_stat) {
+
+ case SSLEEP:
+ /*
+ * If process is sleeping uninterruptibly
+ * we can't interrupt the sleep... the signal will
+ * be noticed when the process returns through
+ * trap() or syscall().
+ */
+ if ((td->td_flags & TDF_SINTR) == 0)
+ goto out;
+ /*
+ * Process is sleeping and traced... make it runnable
+ * so it can discover the signal in issignal() and stop
+ * for the parent.
+ */
+ if (p->p_flag & P_TRACED)
+ goto run;
+ /*
+ * If SIGCONT is default (or ignored) and process is
+ * asleep, we are finished; the process should not
+ * be awakened.
+ */
+ if ((prop & SA_CONT) && action == SIG_DFL) {
+ SIGDELSET(p->p_siglist, sig);
+ goto out;
+ }
+ /*
+ * When a sleeping process receives a stop
+		 * signal, process it immediately if possible.
+ * All other (caught or default) signals
+ * cause the process to run.
+ */
+ if (prop & SA_STOP) {
+ if (action != SIG_DFL)
+ goto runfast;
+ /*
+ * If a child holding parent blocked,
+ * stopping could cause deadlock.
+ */
+ if (p->p_flag & P_PPWAIT)
+ goto out;
+ mtx_unlock_spin(&sched_lock);
+ SIGDELSET(p->p_siglist, sig);
+ p->p_xstat = sig;
+ PROC_LOCK(p->p_pptr);
+ if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
+ psignal(p->p_pptr, SIGCHLD);
+ PROC_UNLOCK(p->p_pptr);
+ mtx_lock_spin(&sched_lock);
+ stop(p);
+ goto out;
+ } else
+ goto runfast;
+ /* NOTREACHED */
+
+ case SSTOP:
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+
+ /*
+ * Kill signal always sets processes running.
+ */
+ if (sig == SIGKILL)
+ goto runfast;
+
+ if (prop & SA_CONT) {
+ /*
+ * If SIGCONT is default (or ignored), we continue the
+ * process but don't leave the signal in p_siglist, as
+ * it has no further action. If SIGCONT is held, we
+ * continue the process and leave the signal in
+ * p_siglist. If the process catches SIGCONT, let it
+ * handle the signal itself. If it isn't waiting on
+ * an event, then it goes back to run state.
+ * Otherwise, process goes back to sleep state.
+ */
+ if (action == SIG_DFL)
+ SIGDELSET(p->p_siglist, sig);
+ if (action == SIG_CATCH)
+ goto runfast;
+ /*
+ * XXXKSE
+ * do this for each thread.
+ */
+ if (p->p_flag & P_KSES) {
+ mtx_assert(&sched_lock,
+ MA_OWNED | MA_NOTRECURSED);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_wchan == NULL) {
+ setrunnable(td); /* XXXKSE */
+ } else {
+ /* mark it as sleeping */
+ }
+ }
+ } else {
+ p->p_flag |= P_CONTINUED;
+ wakeup((caddr_t)p->p_pptr);
+ if (td->td_wchan == NULL)
+ goto run;
+ p->p_stat = SSLEEP;
+ }
+ goto out;
+ }
+
+ if (prop & SA_STOP) {
+ /*
+ * Already stopped, don't need to stop again.
+			 * (If we did, the shell could get confused.)
+ */
+ SIGDELSET(p->p_siglist, sig);
+ goto out;
+ }
+
+ /*
+ * If process is sleeping interruptibly, then simulate a
+ * wakeup so that when it is continued, it will be made
+ * runnable and can look at the signal. But don't make
+ * the process runnable, leave it stopped.
+ * XXXKSE should we wake ALL blocked threads?
+ */
+ if (p->p_flag & P_KSES) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_wchan && (td->td_flags & TDF_SINTR)){
+ if (td->td_flags & TDF_CVWAITQ)
+ cv_waitq_remove(td);
+ else
+ unsleep(td); /* XXXKSE */
+ }
+ }
+ } else {
+ if (td->td_wchan && td->td_flags & TDF_SINTR) {
+ if (td->td_flags & TDF_CVWAITQ)
+ cv_waitq_remove(td);
+ else
+ unsleep(td); /* XXXKSE */
+ }
+ }
+ goto out;
+
+ default:
+ /*
+ * SRUN, SIDL, SZOMB do nothing with the signal,
+ * other than kicking ourselves if we are running.
+ * It will either never be noticed, or noticed very soon.
+ */
+ if (p->p_stat == SRUN) {
+#ifdef SMP
+ struct kse *ke;
+ struct thread *td = curthread;
+/* we should only deliver to one thread.. but which one? */
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+ FOREACH_KSE_IN_GROUP(kg, ke) {
+ if (ke->ke_thread == td) {
+ continue;
+ }
+ forward_signal(ke->ke_thread);
+ }
+ }
+#endif
+ }
+ goto out;
+ }
+ /*NOTREACHED*/
+
+runfast:
+ /*
+ * Raise priority to at least PUSER.
+ * XXXKSE Should we make them all run fast?
+ * Maybe just one would be enough?
+ */
+
+ if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) {
+ FIRST_THREAD_IN_PROC(p)->td_priority = PUSER;
+ }
+run:
+ /* If we jump here, sched_lock has to be owned. */
+ mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
+ setrunnable(td); /* XXXKSE */
+out:
+ mtx_unlock_spin(&sched_lock);
+
+ /* Once we get here, sched_lock should not be owned. */
+ mtx_assert(&sched_lock, MA_NOTOWNED);
+}
+
+/*
+ * If the current process has received a signal (one that should be caught,
+ * cause termination, or interrupt the current syscall), return the signal
+ * number.
+ * Stop signals with default action are processed immediately, then cleared;
+ * they aren't returned. This is checked after each entry to the system for
+ * a syscall or trap (though this can usually be done without calling issignal
+ * by checking the pending signal masks in cursig.) The normal call
+ * sequence is
+ *
+ * while (sig = cursig(curproc))
+ * postsig(sig);
+ */
+int
+issignal(p)
+ register struct proc *p;
+{
+ sigset_t mask;
+ register int sig, prop;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ for (;;) {
+ int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
+
+ mask = p->p_siglist;
+ SIGSETNAND(mask, p->p_sigmask);
+ if (p->p_flag & P_PPWAIT)
+ SIG_STOPSIGMASK(mask);
+ if (SIGISEMPTY(mask)) /* no signal to send */
+ return (0);
+ sig = sig_ffs(&mask);
+ prop = sigprop(sig);
+
+ _STOPEVENT(p, S_SIG, sig);
+
+ /*
+ * We should see pending but ignored signals
+ * only if P_TRACED was on when they were posted.
+ */
+ if (SIGISMEMBER(p->p_sigignore, sig) && (traced == 0)) {
+ SIGDELSET(p->p_siglist, sig);
+ continue;
+ }
+ if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
+ /*
+ * If traced, always stop.
+ */
+ p->p_xstat = sig;
+ PROC_LOCK(p->p_pptr);
+ psignal(p->p_pptr, SIGCHLD);
+ PROC_UNLOCK(p->p_pptr);
+ mtx_lock_spin(&sched_lock);
+ stop(p);
+ PROC_UNLOCK(p);
+ DROP_GIANT();
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ PICKUP_GIANT();
+ PROC_LOCK(p);
+
+ /*
+ * If the traced bit got turned off, go back up
+ * to the top to rescan signals. This ensures
+ * that p_sig* and ps_sigact are consistent.
+ */
+ if ((p->p_flag & P_TRACED) == 0)
+ continue;
+
+ /*
+ * If parent wants us to take the signal,
+ * then it will leave it in p->p_xstat;
+ * otherwise we just look for signals again.
+ */
+ SIGDELSET(p->p_siglist, sig); /* clear old signal */
+ sig = p->p_xstat;
+ if (sig == 0)
+ continue;
+
+ /*
+ * Put the new signal into p_siglist. If the
+ * signal is being masked, look for other signals.
+ */
+ SIGADDSET(p->p_siglist, sig);
+ if (SIGISMEMBER(p->p_sigmask, sig))
+ continue;
+ }
+
+ /*
+ * Decide whether the signal should be returned.
+ * Return the signal's number, or fall through
+ * to clear it from the pending mask.
+ */
+ switch ((int)(intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
+
+ case (int)SIG_DFL:
+ /*
+ * Don't take default actions on system processes.
+ */
+ if (p->p_pid <= 1) {
+#ifdef DIAGNOSTIC
+ /*
+ * Are you sure you want to ignore SIGSEGV
+ * in init? XXX
+ */
+ printf("Process (pid %lu) got signal %d\n",
+ (u_long)p->p_pid, sig);
+#endif
+ break; /* == ignore */
+ }
+ /*
+ * If there is a pending stop signal to process
+ * with default action, stop here,
+ * then clear the signal. However,
+			 * if the process is a member of an orphaned
+ * process group, ignore tty stop signals.
+ */
+ if (prop & SA_STOP) {
+ if (p->p_flag & P_TRACED ||
+ (p->p_pgrp->pg_jobc == 0 &&
+ prop & SA_TTYSTOP))
+ break; /* == ignore */
+ p->p_xstat = sig;
+ PROC_LOCK(p->p_pptr);
+ if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
+ psignal(p->p_pptr, SIGCHLD);
+ PROC_UNLOCK(p->p_pptr);
+ mtx_lock_spin(&sched_lock);
+ stop(p);
+ PROC_UNLOCK(p);
+ DROP_GIANT();
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ PICKUP_GIANT();
+ PROC_LOCK(p);
+ break;
+ } else if (prop & SA_IGNORE) {
+ /*
+ * Except for SIGCONT, shouldn't get here.
+ * Default action is to ignore; drop it.
+ */
+ break; /* == ignore */
+ } else
+ return (sig);
+ /*NOTREACHED*/
+
+ case (int)SIG_IGN:
+ /*
+			 * Masking above should prevent us from ever trying
+			 * to take action on an ignored signal other
+			 * than SIGCONT, unless the process is traced.
+ */
+ if ((prop & SA_CONT) == 0 &&
+ (p->p_flag & P_TRACED) == 0)
+ printf("issignal\n");
+ break; /* == ignore */
+
+ default:
+ /*
+ * This signal has an action, let
+ * postsig() process it.
+ */
+ return (sig);
+ }
+ SIGDELSET(p->p_siglist, sig); /* take the signal! */
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Put the argument process into the stopped state and notify the parent
+ * via wakeup. Signals are handled elsewhere. The process must not be
+ * on the run queue. Must be called with the proc p locked and the scheduler
+ * lock held.
+ */
+static void
+stop(p)
+ register struct proc *p;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&sched_lock, MA_OWNED);
+ p->p_stat = SSTOP;
+ p->p_flag &= ~P_WAITED;
+ wakeup((caddr_t)p->p_pptr);
+}
+
+/*
+ * Take the action for the specified signal
+ * from the current set of pending signals.
+ */
+void
+postsig(sig)
+ register int sig;
+{
+ struct thread *td = curthread;
+ register struct proc *p = td->td_proc;
+ struct sigacts *ps;
+ sig_t action;
+ sigset_t returnmask;
+ int code;
+
+ KASSERT(sig != 0, ("postsig"));
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ ps = p->p_sigacts;
+ SIGDELSET(p->p_siglist, sig);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_PSIG))
+ ktrpsig(sig, action, p->p_flag & P_OLDMASK ?
+ &p->p_oldsigmask : &p->p_sigmask, 0);
+#endif
+ _STOPEVENT(p, S_SIG, sig);
+
+ if (action == SIG_DFL) {
+ /*
+ * Default action, where the default is to kill
+ * the process. (Other cases were ignored above.)
+ */
+ sigexit(td, sig);
+ /* NOTREACHED */
+ } else {
+ /*
+ * If we get here, the signal must be caught.
+ */
+ KASSERT(action != SIG_IGN && !SIGISMEMBER(p->p_sigmask, sig),
+ ("postsig action"));
+ /*
+ * Set the new mask value and also defer further
+ * occurrences of this signal.
+ *
+ * Special case: user has done a sigsuspend. Here the
+ * current mask is not of interest, but rather the
+ * mask from before the sigsuspend is what we want
+ * restored after the signal processing is completed.
+ */
+ if (p->p_flag & P_OLDMASK) {
+ returnmask = p->p_oldsigmask;
+ p->p_flag &= ~P_OLDMASK;
+ } else
+ returnmask = p->p_sigmask;
+
+ SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(p->p_sigmask, sig);
+
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See do_sigaction() for origin of this code.
+ */
+ SIGDELSET(p->p_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(p->p_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ p->p_stats->p_ru.ru_nsignals++;
+ if (p->p_sig != sig) {
+ code = 0;
+ } else {
+ code = p->p_code;
+ p->p_code = 0;
+ p->p_sig = 0;
+ }
+ (*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code);
+ }
+}
+
+/*
+ * Kill the current process for stated reason.
+ */
+void
+killproc(p, why)
+ struct proc *p;
+ char *why;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
+ p, p->p_pid, p->p_comm);
+ log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
+ p->p_ucred ? p->p_ucred->cr_uid : -1, why);
+ psignal(p, SIGKILL);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(td, sig)
+ struct thread *td;
+ int sig;
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_acflag |= AXSIG;
+ if (sigprop(sig) & SA_CORE) {
+ p->p_sig = sig;
+ /*
+		 * Log signals which would cause core dumps.
+		 * (Log as LOG_INFO to appease those who don't want
+		 * these messages.)
+		 * XXX: Todo: write out the ruid as well as the euid.
+ */
+ PROC_UNLOCK(p);
+ if (!mtx_owned(&Giant))
+ mtx_lock(&Giant);
+ if (coredump(td) == 0)
+ sig |= WCOREFLAG;
+ if (kern_logsigexit)
+ log(LOG_INFO,
+ "pid %d (%s), uid %d: exited on signal %d%s\n",
+ p->p_pid, p->p_comm,
+ td->td_ucred ? td->td_ucred->cr_uid : -1,
+ sig &~ WCOREFLAG,
+ sig & WCOREFLAG ? " (core dumped)" : "");
+ } else {
+ PROC_UNLOCK(p);
+ if (!mtx_owned(&Giant))
+ mtx_lock(&Giant);
+ }
+ exit1(td, W_EXITCODE(0, sig));
+ /* NOTREACHED */
+}
+
+static char corefilename[MAXPATHLEN+1] = {"%N.core"};
+SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
+ sizeof(corefilename), "process corefile name format string");
+
+/*
+ * expand_name(name, uid, pid)
+ * Expand the name described in corefilename, using name, uid, and pid.
+ * corefilename is a printf-like string, with three format specifiers:
+ * %N name of process ("name")
+ * %P process id (pid)
+ * %U user id (uid)
+ * For example, "%N.core" is the default; they can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
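+/*
+ * For illustration (values are hypothetical): with kern.corefile set to
+ * "/cores/%U/%N-%P", a process named "sh" with uid 1001 and pid 4242 would
+ * dump core to "/cores/1001/sh-4242".
+ */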
+
+static char *
+expand_name(name, uid, pid)
+ const char *name;
+ uid_t uid;
+ pid_t pid;
+{
+ const char *format, *appendstr;
+ char *temp;
+	char buf[11];		/* Buffer for pid/uid; 10 digits + NUL */
+ size_t i, l, n;
+
+ format = corefilename;
+ temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
+ if (temp == NULL)
+ return (NULL);
+ for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) {
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ appendstr = "%";
+ break;
+ case 'N': /* process name */
+ appendstr = name;
+ break;
+ case 'P': /* process id */
+ sprintf(buf, "%u", pid);
+ appendstr = buf;
+ break;
+ case 'U': /* user id */
+ sprintf(buf, "%u", uid);
+ appendstr = buf;
+ break;
+ default:
+ appendstr = "";
+ log(LOG_ERR,
+ "Unknown format character %c in `%s'\n",
+ format[i], format);
+ }
+ l = strlen(appendstr);
+ if ((n + l) >= MAXPATHLEN)
+ goto toolong;
+ memcpy(temp + n, appendstr, l);
+ n += l;
+ break;
+ default:
+ temp[n++] = format[i];
+ }
+ }
+ if (format[i] != '\0')
+ goto toolong;
+ return (temp);
+toolong:
+ log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n",
+ (long)pid, name, (u_long)uid);
+ free(temp, M_TEMP);
+ return (NULL);
+}
+
+/*
+ * Dump a process' core. The main routine does some
+ * policy checking, and creates the name of the coredump;
+ * then it passes on a vnode and a size limit to the process-specific
+ * coredump routine if there is one; if there _is not_ one, it returns
+ * ENOSYS; otherwise it returns the error from the process-specific routine.
+ *
+ * XXX: VOP_GETATTR() here requires holding the vnode lock.
+ */
+
+static int
+coredump(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+ register struct vnode *vp;
+ register struct ucred *cred = td->td_ucred;
+ struct flock lf;
+ struct nameidata nd;
+ struct vattr vattr;
+ int error, error1, flags;
+ struct mount *mp;
+ char *name; /* name of corefile */
+ off_t limit;
+
+ PROC_LOCK(p);
+ _STOPEVENT(p, S_CORE, 0);
+
+ if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
+ PROC_UNLOCK(p);
+ return (EFAULT);
+ }
+
+ /*
+ * Note that the bulk of limit checking is done after
+ * the corefile is created. The exception is if the limit
+ * for corefiles is 0, in which case we don't bother
+ * creating the corefile at all. This layout means that
+ * a corefile is truncated instead of not being created,
+ * if it is larger than the limit.
+ */
+ limit = p->p_rlimit[RLIMIT_CORE].rlim_cur;
+ if (limit == 0) {
+ PROC_UNLOCK(p);
+ return 0;
+ }
+ PROC_UNLOCK(p);
+
+restart:
+ name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
+ if (name == NULL)
+ return (EINVAL);
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+ error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR);
+ free(name, M_TEMP);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ VOP_UNLOCK(vp, 0, td);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK);
+ if (error)
+ goto out2;
+
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
+ return (error);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+
+ /* Don't dump to non-regular files or files with links. */
+ if (vp->v_type != VREG ||
+ VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
+ error = EFAULT;
+ goto out1;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VOP_LEASE(vp, td, cred, LEASE_WRITE);
+ VOP_SETATTR(vp, &vattr, cred, td);
+ VOP_UNLOCK(vp, 0, td);
+ PROC_LOCK(p);
+ p->p_acflag |= ACORE;
+ PROC_UNLOCK(p);
+
+ error = p->p_sysent->sv_coredump ?
+ p->p_sysent->sv_coredump(td, vp, limit) :
+ ENOSYS;
+
+out1:
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ vn_finished_write(mp);
+out2:
+ error1 = vn_close(vp, FWRITE, cred, td);
+ if (error == 0)
+ error = error1;
+ return (error);
+}
+
+/*
+ * Nonexistent system call -- signal the process (it may want to handle it).
+ * Flag an error in case the process won't see the signal immediately
+ * (blocked or ignored).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nosys_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+nosys(td, args)
+ struct thread *td;
+ struct nosys_args *args;
+{
+ struct proc *p = td->td_proc;
+
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ psignal(p, SIGSYS);
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (ENOSYS);
+}
+
+/*
+ * Send a SIGIO or SIGURG signal to a process or process group using
+ * stored credentials rather than those of the current process.
+ */
+void
+pgsigio(sigiop, sig, checkctty)
+ struct sigio **sigiop;
+ int sig, checkctty;
+{
+ struct sigio *sigio;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ if (sigio->sio_pgid > 0) {
+ PROC_LOCK(sigio->sio_proc);
+ if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
+ psignal(sigio->sio_proc, sig);
+ PROC_UNLOCK(sigio->sio_proc);
+ } else if (sigio->sio_pgid < 0) {
+ struct proc *p;
+
+ PGRP_LOCK(sigio->sio_pgrp);
+ LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
+ (checkctty == 0 || (p->p_flag & P_CONTROLT)))
+ psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(sigio->sio_pgrp);
+ }
+ SIGIO_UNLOCK();
+}
+
+static int
+filt_sigattach(struct knote *kn)
+{
+ struct proc *p = curproc;
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ PROC_LOCK(p);
+ SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+static void
+filt_sigdetach(struct knote *kn)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+ PROC_UNLOCK(p);
+}
+
+/*
+ * signal knotes are shared with proc knotes, so we apply a mask to
+ * the hint in order to differentiate them from process hints. This
+ * could be avoided by using a signal-specific knote list, but probably
+ * isn't worth the trouble.
+ */
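+/*
+ * For example, a hint of (NOTE_SIGNAL | SIGHUP) has NOTE_SIGNAL stripped
+ * below, so only a knote whose kn_id is SIGHUP has its kn_data bumped.
+ */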
+static int
+filt_signal(struct knote *kn, long hint)
+{
+
+ if (hint & NOTE_SIGNAL) {
+ hint &= ~NOTE_SIGNAL;
+
+ if (kn->kn_id == hint)
+ kn->kn_data++;
+ }
+ return (kn->kn_data != 0);
+}
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
new file mode 100644
index 0000000..5e32eee
--- /dev/null
+++ b/sys/kern/kern_subr.c
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_zero.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+
+SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
+ "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
+
+#ifdef ZERO_COPY_SOCKETS
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/swap_pager.h>
+#include <sys/mbuf.h>
+#include <machine/cpu.h>
+
+/* Declared in uipc_socket.c */
+extern int so_zero_copy_receive;
+
+static int vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
+ vm_offset_t uaddr);
+static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio,
+ struct vm_object *obj, int disposable);
+
+static int
+vm_pgmoveco(mapa, srcobj, kaddr, uaddr)
+ vm_map_t mapa;
+ vm_object_t srcobj;
+ vm_offset_t kaddr, uaddr;
+{
+ vm_map_t map = mapa;
+ vm_page_t kern_pg, user_pg;
+ vm_object_t uobject;
+ vm_map_entry_t entry;
+ vm_pindex_t upindex, kpindex;
+ vm_prot_t prot;
+ boolean_t wired;
+
+ /*
+ * First lookup the kernel page.
+ */
+ kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
+
+ if ((vm_map_lookup(&map, uaddr,
+ VM_PROT_READ, &entry, &uobject,
+ &upindex, &prot, &wired)) != KERN_SUCCESS) {
+ return(EFAULT);
+ }
+ if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
+ vm_page_sleep_busy(user_pg, 1, "vm_pgmoveco");
+ pmap_remove(map->pmap, uaddr, uaddr+PAGE_SIZE);
+ vm_page_busy(user_pg);
+ vm_page_free(user_pg);
+ }
+
+ if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
+ (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
+ printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
+ "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
+ kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
+ kern_pg->hold_count, (u_long)kern_pg->phys_addr);
+ if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
+ panic("vm_pgmoveco: renaming free page");
+ else
+ panic("vm_pgmoveco: renaming busy page");
+ }
+ kpindex = kern_pg->pindex;
+ vm_page_busy(kern_pg);
+ vm_page_rename(kern_pg, uobject, upindex);
+ vm_page_flag_clear(kern_pg, PG_BUSY);
+ kern_pg->valid = VM_PAGE_BITS_ALL;
+
+ vm_map_lookup_done(map, entry);
+ return(KERN_SUCCESS);
+}
+#endif /* ZERO_COPY_SOCKETS */
+
+int
+uiomove(cp, n, uio)
+ register caddr_t cp;
+ register int n;
+ register struct uio *uio;
+{
+ struct thread *td = curthread;
+ register struct iovec *iov;
+ u_int cnt;
+ int error = 0;
+ int save = 0;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomove: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ ("uiomove proc"));
+
+ if (td) {
+ mtx_lock_spin(&sched_lock);
+ save = td->td_flags & TDF_DEADLKTREAT;
+ td->td_flags |= TDF_DEADLKTREAT;
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (ticks - PCPU_GET(switchticks) >= hogticks)
+ uio_yield();
+ if (uio->uio_rw == UIO_READ)
+ error = copyout(cp, iov->iov_base, cnt);
+ else
+ error = copyin(iov->iov_base, cp, cnt);
+ if (error)
+ goto out;
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp += cnt;
+ n -= cnt;
+ }
+out:
+	if (td != curthread)
+		printf("uiomove: IT CHANGED!\n");
+ td = curthread; /* Might things have changed in copyin/copyout? */
+ if (td) {
+ mtx_lock_spin(&sched_lock);
+ td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
+ mtx_unlock_spin(&sched_lock);
+ }
+ return (error);
+}
+
+#if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS)
+/*
+ * Experimental support for zero-copy I/O
+ */
+static int
+userspaceco(cp, cnt, uio, obj, disposable)
+ caddr_t cp;
+ u_int cnt;
+ struct uio *uio;
+ struct vm_object *obj;
+ int disposable;
+{
+ struct iovec *iov;
+ int error;
+
+ iov = uio->uio_iov;
+
+#ifdef ZERO_COPY_SOCKETS
+
+ if (uio->uio_rw == UIO_READ) {
+ if ((so_zero_copy_receive != 0)
+ && (obj != NULL)
+ && ((cnt & PAGE_MASK) == 0)
+ && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
+ && ((uio->uio_offset & PAGE_MASK) == 0)
+ && ((((intptr_t) cp) & PAGE_MASK) == 0)
+ && (obj->type == OBJT_DEFAULT)
+ && (disposable != 0)) {
+ /* SOCKET: use page-trading */
+ /*
+ * We only want to call vm_pgmoveco() on
+			 * disposable pages, since it gives the
+ * kernel page to the userland process.
+ */
+ error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
+ obj, (vm_offset_t)cp,
+ (vm_offset_t)iov->iov_base);
+
+ /*
+ * If we get an error back, attempt
+ * to use copyout() instead. The
+ * disposable page should be freed
+ * automatically if we weren't able to move
+ * it into userland.
+ */
+ if (error != 0)
+ error = copyout(cp, iov->iov_base, cnt);
+#ifdef ENABLE_VFS_IOOPT
+ } else if ((vfs_ioopt != 0)
+ && ((cnt & PAGE_MASK) == 0)
+ && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
+ && ((uio->uio_offset & PAGE_MASK) == 0)
+ && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
+ error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
+ uio->uio_offset, cnt,
+ (vm_offset_t) iov->iov_base, NULL);
+#endif /* ENABLE_VFS_IOOPT */
+ } else {
+ error = copyout(cp, iov->iov_base, cnt);
+ }
+ } else {
+ error = copyin(iov->iov_base, cp, cnt);
+ }
+#else /* ZERO_COPY_SOCKETS */
+ if (uio->uio_rw == UIO_READ) {
+#ifdef ENABLE_VFS_IOOPT
+ if ((vfs_ioopt != 0)
+ && ((cnt & PAGE_MASK) == 0)
+ && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
+ && ((uio->uio_offset & PAGE_MASK) == 0)
+ && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
+ error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
+ uio->uio_offset, cnt,
+ (vm_offset_t) iov->iov_base, NULL);
+ } else
+#endif /* ENABLE_VFS_IOOPT */
+ {
+ error = copyout(cp, iov->iov_base, cnt);
+ }
+ } else {
+ error = copyin(iov->iov_base, cp, cnt);
+ }
+#endif /* ZERO_COPY_SOCKETS */
+
+ return (error);
+}
+
+int
+uiomoveco(cp, n, uio, obj, disposable)
+ caddr_t cp;
+ int n;
+ struct uio *uio;
+ struct vm_object *obj;
+ int disposable;
+{
+ struct iovec *iov;
+ u_int cnt;
+ int error;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomoveco: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ ("uiomoveco proc"));
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (ticks - PCPU_GET(switchticks) >= hogticks)
+ uio_yield();
+
+ error = userspaceco(cp, cnt, uio, obj, disposable);
+
+ if (error)
+ return (error);
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp += cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+#endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */
+
+#ifdef ENABLE_VFS_IOOPT
+
+/*
+ * Experimental support for zero-copy I/O
+ */
+int
+uioread(n, uio, obj, nread)
+ int n;
+ struct uio *uio;
+ struct vm_object *obj;
+ int *nread;
+{
+ int npagesmoved;
+ struct iovec *iov;
+ u_int cnt, tcnt;
+ int error;
+
+ *nread = 0;
+ if (vfs_ioopt < 2)
+ return 0;
+
+ error = 0;
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ if ((uio->uio_segflg == UIO_USERSPACE) &&
+ ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
+ ((uio->uio_offset & PAGE_MASK) == 0) ) {
+
+ if (cnt < PAGE_SIZE)
+ break;
+
+ cnt &= ~PAGE_MASK;
+
+ if (ticks - PCPU_GET(switchticks) >= hogticks)
+ uio_yield();
+ error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
+ uio->uio_offset, cnt,
+ (vm_offset_t) iov->iov_base, &npagesmoved);
+
+ if (npagesmoved == 0)
+ break;
+
+ tcnt = npagesmoved * PAGE_SIZE;
+ cnt = tcnt;
+
+ if (error)
+ break;
+
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ *nread += cnt;
+ n -= cnt;
+ } else {
+ break;
+ }
+ }
+ return error;
+}
+#endif /* ENABLE_VFS_IOOPT */
+
+/*
+ * Give next character to user as result of read.
+ */
+int
+ureadc(c, uio)
+ register int c;
+ register struct uio *uio;
+{
+ register struct iovec *iov;
+
+again:
+ if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
+ panic("ureadc");
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iovcnt--;
+ uio->uio_iov++;
+ goto again;
+ }
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (subyte(iov->iov_base, c) < 0)
+ return (EFAULT);
+ break;
+
+ case UIO_SYSSPACE:
+ *iov->iov_base = c;
+ break;
+
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base++;
+ iov->iov_len--;
+ uio->uio_resid--;
+ uio->uio_offset++;
+ return (0);
+}
+
+/*
+ * General routine to allocate a hash table.
+ */
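+/*
+ * A rough example: hashinit(100, type, &mask) picks the largest power of
+ * two not exceeding 100, allocating 64 LIST_HEAD buckets and setting mask
+ * to 63, so callers index the table with (hashval & mask).
+ */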
+void *
+hashinit(elements, type, hashmask)
+ int elements;
+ struct malloc_type *type;
+ u_long *hashmask;
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ if (elements <= 0)
+ panic("hashinit: bad elements");
+ for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
+ continue;
+ hashsize >>= 1;
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *hashmask = hashsize - 1;
+ return (hashtbl);
+}
+
+static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
+ 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
+ 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
+#define NPRIMES (sizeof(primes) / sizeof(primes[0]))
+
+/*
+ * General routine to allocate a prime number sized hash table.
+ */
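+/*
+ * A rough example: phashinit(100, type, &nentries) walks the primes table
+ * until it passes 100 (at 127) and settles on the previous prime, so 61
+ * buckets are allocated and nentries is set to 61.
+ */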
+void *
+phashinit(elements, type, nentries)
+ int elements;
+ struct malloc_type *type;
+ u_long *nentries;
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ if (elements <= 0)
+ panic("phashinit: bad elements");
+ for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
+ i++;
+ if (i == NPRIMES)
+ break;
+ hashsize = primes[i];
+ }
+ hashsize = primes[i - 1];
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *nentries = hashsize;
+ return (hashtbl);
+}
+
+void
+uio_yield()
+{
+ struct thread *td;
+
+ td = curthread;
+ mtx_lock_spin(&sched_lock);
+ DROP_GIANT();
+ td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */
+ setrunqueue(td);
+ td->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ PICKUP_GIANT();
+}
+
+int
+copyinfrom(const void *src, void *dst, size_t len, int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyin(src, dst, len);
+ break;
+ case UIO_SYSSPACE:
+ bcopy(src, dst, len);
+ break;
+ default:
+ panic("copyinfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
+
+int
+copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyinstr(src, dst, len, copied);
+ break;
+ case UIO_SYSSPACE:
+ error = copystr(src, dst, len, copied);
+ break;
+ default:
+ panic("copyinstrfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
new file mode 100644
index 0000000..2b531c0
--- /dev/null
+++ b/sys/kern/kern_switch.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <machine/critical.h>
+
+CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
+
+/*
+ * Global run queue.
+ */
+static struct runq runq;
+SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq)
+
+/*
+ * Wrappers which implement old interface; act on global run queue.
+ */
+
+struct thread *
+choosethread(void)
+{
+ return (runq_choose(&runq)->ke_thread);
+}
+
+int
+procrunnable(void)
+{
+ return runq_check(&runq);
+}
+
+void
+remrunqueue(struct thread *td)
+{
+ runq_remove(&runq, td->td_kse);
+}
+
+void
+setrunqueue(struct thread *td)
+{
+ runq_add(&runq, td->td_kse);
+}
+
+/* Critical sections that prevent preemption. */
+void
+critical_enter(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ if (td->td_critnest == 0)
+ cpu_critical_enter();
+ td->td_critnest++;
+}
+
+void
+critical_exit(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ if (td->td_critnest == 1) {
+ td->td_critnest = 0;
+ cpu_critical_exit();
+ } else {
+ td->td_critnest--;
+ }
+}
+
+/*
+ * Clear the status bit of the queue corresponding to priority level pri,
+ * indicating that it is empty.
+ */
+static __inline void
+runq_clrbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
+}
+
+/*
+ * Find the index of the first non-empty run queue. This is done by
+ * scanning the status bits; a set bit indicates a non-empty queue.
+ */
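+/*
+ * For illustration, with 32-bit status words: queues 0-31 map to word 0 and
+ * queues 32-63 to word 1, so the first set bit found in word order always
+ * names the lowest-numbered (highest priority) non-empty queue.
+ */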
+static __inline int
+runq_findbit(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
+ rqb->rqb_bits[i], i, pri);
+ return (pri);
+ }
+
+ return (-1);
+}
+
+/*
+ * Set the status bit of the queue corresponding to priority level pri,
+ * indicating that it is non-empty.
+ */
+static __inline void
+runq_setbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
+}
+
+/*
+ * Add the process to the queue specified by its priority, and set the
+ * corresponding status bit.
+ */
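+/*
+ * For illustration, assuming the usual RQ_PPQ of 4 priorities per queue:
+ * a thread at priority 130 is inserted at the tail of rq_queues[130 / 4],
+ * i.e. queue 32, and the corresponding status bit is set.
+ */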
+void
+runq_add(struct runq *rq, struct kse *ke)
+{
+ struct rqhead *rqh;
+ int pri;
+
+#ifdef INVARIANTS
+ struct proc *p = ke->ke_proc;
+#endif
+ if (ke->ke_flags & KEF_ONRUNQ)
+ return;
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN",
+ p, p->p_comm));
+ pri = ke->ke_thread->td_priority / RQ_PPQ;
+ ke->ke_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p",
+ ke->ke_proc, ke->ke_thread->td_priority, pri, rqh);
+ TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
+ ke->ke_flags |= KEF_ONRUNQ;
+}
+
+/*
+ * Return true if there are runnable processes of any priority on the run
+ * queue, false otherwise. Has no side effects, does not modify the run
+ * queue structure.
+ */
+int
+runq_check(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
+ rqb->rqb_bits[i], i);
+ return (1);
+ }
+ CTR0(KTR_RUNQ, "runq_check: empty");
+
+ return (0);
+}
+
+/*
+ * Find and remove the highest priority process from the run queue.
+ * If there are no runnable processes, the per-cpu idle process is
+ * returned. Will not return NULL under any circumstances.
+ */
+struct kse *
+runq_choose(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct kse *ke;
+ int pri;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ if ((pri = runq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ ke = TAILQ_FIRST(rqh);
+ KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
+ KASSERT(ke->ke_proc->p_stat == SRUN,
+ ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid,
+ ke->ke_proc->p_comm, ke->ke_proc->p_stat));
+ CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
+ TAILQ_REMOVE(rqh, ke, ke_procq);
+ if (TAILQ_EMPTY(rqh)) {
+ CTR0(KTR_RUNQ, "runq_choose: empty");
+ runq_clrbit(rq, pri);
+ }
+ ke->ke_flags &= ~KEF_ONRUNQ;
+ return (ke);
+ }
+ CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
+
+ return (PCPU_GET(idlethread)->td_kse);
+}
+
+/*
+ * Initialize a run structure.
+ */
+void
+runq_init(struct runq *rq)
+{
+ int i;
+
+ bzero(rq, sizeof *rq);
+ for (i = 0; i < RQ_NQS; i++)
+ TAILQ_INIT(&rq->rq_queues[i]);
+}
+
+/*
+ * Remove the process from the queue specified by its priority, and clear the
+ * corresponding status bit if the queue becomes empty.
+ */
+void
+runq_remove(struct runq *rq, struct kse *ke)
+{
+ struct rqhead *rqh;
+ int pri;
+
+ if (!(ke->ke_flags & KEF_ONRUNQ))
+ return;
+ mtx_assert(&sched_lock, MA_OWNED);
+ pri = ke->ke_rqindex;
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p",
+ ke, ke->ke_thread->td_priority, pri, rqh);
+ KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
+ TAILQ_REMOVE(rqh, ke, ke_procq);
+ if (TAILQ_EMPTY(rqh)) {
+ CTR0(KTR_RUNQ, "runq_remove: empty");
+ runq_clrbit(rq, pri);
+ }
+ ke->ke_flags &= ~KEF_ONRUNQ;
+}
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
new file mode 100644
index 0000000..2f69a00
--- /dev/null
+++ b/sys/kern/kern_sx.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Shared/exclusive locks. This implementation assures deterministic lock
+ * granting behavior, so that slocks and xlocks are interleaved.
+ *
+ * Priority propagation will not generally raise the priority of lock holders,
+ * so should not be relied upon in combination with sx locks.
+ */
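+/*
+ * A rough usage sketch (the sx_*() wrappers come from <sys/sx.h>; names are
+ * illustrative only):
+ *
+ *	struct sx data_lock;
+ *
+ *	sx_init(&data_lock, "data lock");
+ *	sx_slock(&data_lock);  ...read-only access...   sx_sunlock(&data_lock);
+ *	sx_xlock(&data_lock);  ...read-write access...  sx_xunlock(&data_lock);
+ *	sx_destroy(&data_lock);
+ */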
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+
+struct lock_class lock_class_sx = {
+ "sx",
+ LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE
+};
+
+#ifndef INVARIANTS
+#define _sx_assert(sx, what, file, line)
+#endif
+
+void
+sx_sysinit(void *arg)
+{
+ struct sx_args *sargs = arg;
+
+ sx_init(sargs->sa_sx, sargs->sa_desc);
+}
+
+void
+sx_init(struct sx *sx, const char *description)
+{
+ struct lock_object *lock;
+
+ lock = &sx->sx_object;
+ KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
+ ("sx lock %s %p already initialized", description, sx));
+ bzero(sx, sizeof(*sx));
+ lock->lo_class = &lock_class_sx;
+ lock->lo_type = lock->lo_name = description;
+ lock->lo_flags = LO_WITNESS | LO_RECURSABLE | LO_SLEEPABLE |
+ LO_UPGRADABLE;
+ sx->sx_lock = mtx_pool_find(sx);
+ sx->sx_cnt = 0;
+ cv_init(&sx->sx_shrd_cv, description);
+ sx->sx_shrd_wcnt = 0;
+ cv_init(&sx->sx_excl_cv, description);
+ sx->sx_excl_wcnt = 0;
+ sx->sx_xholder = NULL;
+
+ LOCK_LOG_INIT(lock, 0);
+
+ WITNESS_INIT(lock);
+}
+
+void
+sx_destroy(struct sx *sx)
+{
+
+ LOCK_LOG_DESTROY(&sx->sx_object, 0);
+
+ KASSERT((sx->sx_cnt == 0 && sx->sx_shrd_wcnt == 0 && sx->sx_excl_wcnt ==
+ 0), ("%s (%s): holders or waiters\n", __func__,
+ sx->sx_object.lo_name));
+
+ sx->sx_lock = NULL;
+ cv_destroy(&sx->sx_shrd_cv);
+ cv_destroy(&sx->sx_excl_cv);
+
+ WITNESS_DESTROY(&sx->sx_object);
+}
+
+void
+_sx_slock(struct sx *sx, const char *file, int line)
+{
+
+ mtx_lock(sx->sx_lock);
+ KASSERT(sx->sx_xholder != curthread,
+ ("%s (%s): slock while xlock is held @ %s:%d\n", __func__,
+ sx->sx_object.lo_name, file, line));
+
+ /*
+ * Loop in case we lose the race for lock acquisition.
+ */
+ while (sx->sx_cnt < 0) {
+ sx->sx_shrd_wcnt++;
+ cv_wait(&sx->sx_shrd_cv, sx->sx_lock);
+ sx->sx_shrd_wcnt--;
+ }
+
+ /* Acquire a shared lock. */
+ sx->sx_cnt++;
+
+ LOCK_LOG_LOCK("SLOCK", &sx->sx_object, 0, 0, file, line);
+ WITNESS_LOCK(&sx->sx_object, 0, file, line);
+
+ mtx_unlock(sx->sx_lock);
+}
+
+int
+_sx_try_slock(struct sx *sx, const char *file, int line)
+{
+
+ mtx_lock(sx->sx_lock);
+ if (sx->sx_cnt >= 0) {
+ sx->sx_cnt++;
+ LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 1, file, line);
+ WITNESS_LOCK(&sx->sx_object, LOP_TRYLOCK, file, line);
+ mtx_unlock(sx->sx_lock);
+ return (1);
+ } else {
+ LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 0, file, line);
+ mtx_unlock(sx->sx_lock);
+ return (0);
+ }
+}
+
+void
+_sx_xlock(struct sx *sx, const char *file, int line)
+{
+
+ mtx_lock(sx->sx_lock);
+
+ /*
+ * With sx locks, we're absolutely not permitted to recurse on
+ * xlocks, as it is fatal (deadlock). Normally, recursion is handled
+ * by WITNESS, but as it is not semantically correct to hold the
+ * xlock while in here, we consider it API abuse and put it under
+ * INVARIANTS.
+ */
+ KASSERT(sx->sx_xholder != curthread,
+ ("%s (%s): xlock already held @ %s:%d", __func__,
+ sx->sx_object.lo_name, file, line));
+
+ /* Loop in case we lose the race for lock acquisition. */
+ while (sx->sx_cnt != 0) {
+ sx->sx_excl_wcnt++;
+ cv_wait(&sx->sx_excl_cv, sx->sx_lock);
+ sx->sx_excl_wcnt--;
+ }
+
+ MPASS(sx->sx_cnt == 0);
+
+ /* Acquire an exclusive lock. */
+ sx->sx_cnt--;
+ sx->sx_xholder = curthread;
+
+ LOCK_LOG_LOCK("XLOCK", &sx->sx_object, 0, 0, file, line);
+ WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
+
+ mtx_unlock(sx->sx_lock);
+}
+
+int
+_sx_try_xlock(struct sx *sx, const char *file, int line)
+{
+
+ mtx_lock(sx->sx_lock);
+ if (sx->sx_cnt == 0) {
+ sx->sx_cnt--;
+ sx->sx_xholder = curthread;
+ LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 1, file, line);
+ WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file,
+ line);
+ mtx_unlock(sx->sx_lock);
+ return (1);
+ } else {
+ LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 0, file, line);
+ mtx_unlock(sx->sx_lock);
+ return (0);
+ }
+}
+
+void
+_sx_sunlock(struct sx *sx, const char *file, int line)
+{
+
+ _sx_assert(sx, SX_SLOCKED, file, line);
+ mtx_lock(sx->sx_lock);
+
+ WITNESS_UNLOCK(&sx->sx_object, 0, file, line);
+
+ /* Release. */
+ sx->sx_cnt--;
+
+ /*
+ * If we just released the last shared lock, wake any waiters up, giving
+ * exclusive lockers precedence. In order to make sure that exclusive
+ * lockers won't be blocked forever, don't wake shared lock waiters if
+ * there are exclusive lock waiters.
+ */
+ if (sx->sx_excl_wcnt > 0) {
+ if (sx->sx_cnt == 0)
+ cv_signal(&sx->sx_excl_cv);
+ } else if (sx->sx_shrd_wcnt > 0)
+ cv_broadcast(&sx->sx_shrd_cv);
+
+ LOCK_LOG_LOCK("SUNLOCK", &sx->sx_object, 0, 0, file, line);
+
+ mtx_unlock(sx->sx_lock);
+}
+
+void
+_sx_xunlock(struct sx *sx, const char *file, int line)
+{
+
+ _sx_assert(sx, SX_XLOCKED, file, line);
+ mtx_lock(sx->sx_lock);
+ MPASS(sx->sx_cnt == -1);
+
+ WITNESS_UNLOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
+
+ /* Release. */
+ sx->sx_cnt++;
+ sx->sx_xholder = NULL;
+
+ /*
+ * Wake up waiters if there are any. Give precedence to slock waiters.
+ */
+ if (sx->sx_shrd_wcnt > 0)
+ cv_broadcast(&sx->sx_shrd_cv);
+ else if (sx->sx_excl_wcnt > 0)
+ cv_signal(&sx->sx_excl_cv);
+
+ LOCK_LOG_LOCK("XUNLOCK", &sx->sx_object, 0, 0, file, line);
+
+ mtx_unlock(sx->sx_lock);
+}
+
+int
+_sx_try_upgrade(struct sx *sx, const char *file, int line)
+{
+
+ _sx_assert(sx, SX_SLOCKED, file, line);
+ mtx_lock(sx->sx_lock);
+
+ if (sx->sx_cnt == 1) {
+ sx->sx_cnt = -1;
+ sx->sx_xholder = curthread;
+
+ LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 1, file, line);
+ WITNESS_UPGRADE(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+
+ mtx_unlock(sx->sx_lock);
+ return (1);
+ } else {
+ LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 0, file, line);
+ mtx_unlock(sx->sx_lock);
+ return (0);
+ }
+}
+
+void
+_sx_downgrade(struct sx *sx, const char *file, int line)
+{
+
+ _sx_assert(sx, SX_XLOCKED, file, line);
+ mtx_lock(sx->sx_lock);
+ MPASS(sx->sx_cnt == -1);
+
+ WITNESS_DOWNGRADE(&sx->sx_object, 0, file, line);
+
+ sx->sx_cnt = 1;
+ sx->sx_xholder = NULL;
+ if (sx->sx_shrd_wcnt > 0)
+ cv_broadcast(&sx->sx_shrd_cv);
+
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->sx_object, 0, 0, file, line);
+
+ mtx_unlock(sx->sx_lock);
+}
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _sx_assert
+#endif
+
+/*
+ * In the non-WITNESS case, sx_assert() can only detect that at least
+ * *some* thread owns an slock, but it cannot guarantee that *this*
+ * thread owns an slock.
+ */
+void
+_sx_assert(struct sx *sx, int what, const char *file, int line)
+{
+
+ switch (what) {
+ case SX_LOCKED:
+ case SX_SLOCKED:
+#ifdef WITNESS
+ witness_assert(&sx->sx_object, what, file, line);
+#else
+ mtx_lock(sx->sx_lock);
+ if (sx->sx_cnt <= 0 &&
+ (what == SX_SLOCKED || sx->sx_xholder != curthread))
+ printf("Lock %s not %slocked @ %s:%d\n",
+ sx->sx_object.lo_name, (what == SX_SLOCKED) ?
+ "share " : "", file, line);
+ mtx_unlock(sx->sx_lock);
+#endif
+ break;
+ case SX_XLOCKED:
+ mtx_lock(sx->sx_lock);
+ if (sx->sx_xholder != curthread)
+ printf("Lock %s not exclusively locked @ %s:%d\n",
+ sx->sx_object.lo_name, file, line);
+ mtx_unlock(sx->sx_lock);
+ break;
+ default:
+ panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
new file mode 100644
index 0000000..6f9adad
--- /dev/null
+++ b/sys/kern/kern_synch.c
@@ -0,0 +1,970 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/cpu.h>
+
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
+
+int hogticks;
+int lbolt;
+int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
+
+static struct callout loadav_callout;
+static struct callout schedcpu_callout;
+static struct callout roundrobin_callout;
+
+struct loadavg averunnable =
+ { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
+/*
+ * Constants for averages over 1, 5, and 15 minutes
+ * when sampling at 5 second intervals.
+ */
+static fixpt_t cexp[3] = {
+ 0.9200444146293232 * FSCALE, /* exp(-1/12) */
+ 0.9834714538216174 * FSCALE, /* exp(-1/60) */
+ 0.9944598480048967 * FSCALE, /* exp(-1/180) */
+};
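+/*
+ * Each entry is exp(-t/T) for the 5 second sampling interval t and an
+ * averaging period T of 60, 300, and 900 seconds respectively; e.g.
+ * exp(-5/300) == exp(-1/60) for the 5 minute average.
+ */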
+
+static void endtsleep(void *);
+static void loadav(void *arg);
+static void roundrobin(void *arg);
+static void schedcpu(void *arg);
+
+static int
+sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
+{
+ int error, new_val;
+
+ new_val = sched_quantum * tick;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val < tick)
+ return (EINVAL);
+ sched_quantum = new_val / tick;
+ hogticks = 2 * sched_quantum;
+ return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof sched_quantum, sysctl_kern_quantum, "I",
+ "Roundrobin scheduling quantum in microseconds");
+
+/*
+ * Arrange to reschedule if necessary, taking the priorities and
+ * schedulers into account.
+ */
+void
+maybe_resched(struct thread *td)
+{
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_kse->ke_flags |= KEF_NEEDRESCHED;
+}
+
+int
+roundrobin_interval(void)
+{
+ return (sched_quantum);
+}
+
+/*
+ * Force switch among equal priority processes every 100ms.
+ * We don't actually need to force a context switch of the current process.
+ * The act of firing the event triggers a context switch to softclock() and
+ * then switching back out again which is equivalent to a preemption, thus
+ * no further work is needed on the local CPU.
+ */
+/* ARGSUSED */
+static void
+roundrobin(arg)
+ void *arg;
+{
+
+#ifdef SMP
+ mtx_lock_spin(&sched_lock);
+ forward_roundrobin();
+ mtx_unlock_spin(&sched_lock);
+#endif
+
+ callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
+}
+
+/*
+ * Constants for digital decay and forget:
+ * 90% of (p_estcpu) usage in 5 * loadav time
+ * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
+ * Note that, as ps(1) mentions, this can let percentages
+ * total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
+ *
+ * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * for (i = 0; i < (5 * loadavg); i++)
+ * p_estcpu *= decay;
+ * will compute
+ * p_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * b = 2 * loadavg
+ * then
+ * decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ * For x close to zero, exp(x) =~ 1 + x, since
+ * exp(x) = 0! + x**1/1! + x**2/2! + ... .
+ * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ * For x close to zero, ln(1+x) =~ x, since
+ * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
+ * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ * ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ * Solve (factor)**(power) =~ .1 given power (5*loadav):
+ * solving for factor,
+ * ln(factor) =~ (-2.30/5*loadav), or
+ * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
+ *
+ * Proof of (2):
+ * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ * solving for power,
+ * power*ln(b/(b+1)) =~ -2.30, or
+ * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ * loadav: 1 2 3 4
+ * power: 5.68 10.32 14.94 19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define loadfactor(loadav) (2 * (loadav))
+#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
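+/*
+ * For illustration: with a load average of 2, loadfactor() yields 4 (times
+ * FSCALE), so decay_cpu() scales the estimate by 4 / (4 + 1) == 0.8 each
+ * second; after 5 * loadav == 10 seconds roughly 0.8^10 =~ 0.11 of the
+ * original estimate remains, i.e. about 90% has been forgotten.
+ */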
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
+static int fscale __unused = FSCALE;
+SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT 11
+
+/*
+ * Recompute process priorities, every hz ticks.
+ * MP-safe, called without the Giant mutex.
+ */
+/* ARGSUSED */
+static void
+schedcpu(arg)
+ void *arg;
+{
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ struct thread *td;
+ struct proc *p;
+ struct kse *ke;
+ struct ksegrp *kg;
+ int realstathz;
+ int awake;
+
+ realstathz = stathz ? stathz : hz;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ mtx_lock_spin(&sched_lock);
+ p->p_swtime++;
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+ awake = 0;
+ FOREACH_KSE_IN_GROUP(kg, ke) {
+ /*
+ * Increment time in/out of memory and sleep
+ * time (if sleeping). We ignore overflow;
+ * with 16-bit int's (remember them?)
+ * overflow takes 45 days.
+ */
+ /* XXXKSE */
+ /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */
+ if (p->p_stat == SSLEEP || p->p_stat == SSTOP) {
+ ke->ke_slptime++;
+ } else {
+ ke->ke_slptime = 0;
+ awake = 1;
+ }
+
+ /*
+ * pctcpu is only for ps?
+ * Do it per kse.. and add them up at the end?
+ * XXXKSE
+ */
+ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT;
+ /*
+ * If the kse has been idle the entire second,
+ * stop recalculating its priority until
+ * it wakes up.
+ */
+ if (ke->ke_slptime > 1) {
+ continue;
+ }
+
+#if (FSHIFT >= CCPU_SHIFT)
+ ke->ke_pctcpu += (realstathz == 100) ?
+ ((fixpt_t) ke->ke_cpticks) <<
+ (FSHIFT - CCPU_SHIFT) :
+ 100 * (((fixpt_t) ke->ke_cpticks) <<
+ (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ ke->ke_pctcpu += ((FSCALE - ccpu) *
+ (ke->ke_cpticks * FSCALE / realstathz)) >>
+ FSHIFT;
+#endif
+ ke->ke_cpticks = 0;
+ } /* end of kse loop */
+ if (awake == 0) {
+ kg->kg_slptime++;
+ } else {
+ kg->kg_slptime = 0;
+ }
+ kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
+ resetpriority(kg);
+ td = FIRST_THREAD_IN_PROC(p);
+ if (td->td_priority >= PUSER &&
+ (p->p_sflag & PS_INMEM)) {
+ int changedqueue =
+ ((td->td_priority / RQ_PPQ) !=
+ (kg->kg_user_pri / RQ_PPQ));
+
+ td->td_priority = kg->kg_user_pri;
+ FOREACH_KSE_IN_GROUP(kg, ke) {
+ if ((ke->ke_oncpu == NOCPU) &&
+ (p->p_stat == SRUN) && /* XXXKSE */
+ changedqueue) {
+ remrunqueue(ke->ke_thread);
+ setrunqueue(ke->ke_thread);
+ }
+ }
+ }
+ } /* end of ksegrp loop */
+ mtx_unlock_spin(&sched_lock);
+ } /* end of process loop */
+ sx_sunlock(&allproc_lock);
+ wakeup((caddr_t)&lbolt);
+ callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay p_estcpu to zero.
+ */
+void
+updatepri(td)
+ register struct thread *td;
+{
+ register struct ksegrp *kg;
+ register unsigned int newcpu;
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+ if (td == NULL)
+ return;
+ kg = td->td_ksegrp;
+ newcpu = kg->kg_estcpu;
+ if (kg->kg_slptime > 5 * loadfac)
+ kg->kg_estcpu = 0;
+ else {
+ kg->kg_slptime--; /* the first time was done in schedcpu */
+ while (newcpu && --kg->kg_slptime)
+ newcpu = decay_cpu(loadfac, newcpu);
+ kg->kg_estcpu = newcpu;
+ }
+ resetpriority(td->td_ksegrp);
+}
+
+/*
+ * We're only looking at 7 bits of the address; everything is
+ * aligned to 4, lots of things are aligned to greater powers
+ * of 2. Shift right by 8, i.e. drop the bottom 256 worth.
+ */
+#define TABLESIZE 128
+static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
+#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
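+/* e.g. a wait channel at 0xc0abc940 hashes to ((0xc0abc940 >> 8) & 127) == 73 */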
+
+void
+sleepinit(void)
+{
+ int i;
+
+ sched_quantum = hz/10;
+ hogticks = 2 * sched_quantum;
+ for (i = 0; i < TABLESIZE; i++)
+ TAILQ_INIT(&slpque[i]);
+}
+
+/*
+ * General sleep call. Suspends the current process until a wakeup is
+ * performed on the specified identifier. The process will then be made
+ * runnable with the specified priority. Sleeps at most timo/hz seconds
+ * (0 means no timeout). If pri includes PCATCH flag, signals are checked
+ * before and after sleeping, else signals are not checked. Returns 0 if
+ * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
+ * signal needs to be delivered, ERESTART is returned if the current system
+ * call should be restarted if possible, and EINTR is returned if the system
+ * call should be interrupted by the signal (return EINTR).
+ *
+ * The mutex argument is exited before the caller is suspended, and
+ * entered before msleep returns. If priority includes the PDROP
+ * flag the mutex is not entered before returning.
+ */
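+/*
+ * A rough usage sketch (identifiers are illustrative only); callers normally
+ * re-check their predicate in a loop, since a wakeup() only means the
+ * condition may have changed:
+ *
+ *	mtx_lock(&foo_mtx);
+ *	while (!foo_ready)
+ *		(void) msleep(&foo_ready, &foo_mtx, PRIBIO, "foowt", hz);
+ *	mtx_unlock(&foo_mtx);
+ */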
+int
+msleep(ident, mtx, priority, wmesg, timo)
+ void *ident;
+ struct mtx *mtx;
+ int priority, timo;
+ const char *wmesg;
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int sig, catch = priority & PCATCH;
+ int rval = 0;
+ WITNESS_SAVE_DECL(mtx);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0);
+#endif
+ WITNESS_SLEEP(0, &mtx->mtx_object);
+ KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
+ ("sleeping without a mutex"));
+ mtx_lock_spin(&sched_lock);
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration,
+ * just give interrupts a chance, then just return;
+ * don't run any other procs or panic below,
+ * in case this is the idle process and already asleep.
+ */
+ if (mtx != NULL && priority & PDROP)
+ mtx_unlock(mtx);
+ mtx_unlock_spin(&sched_lock);
+ return (0);
+ }
+
+ DROP_GIANT();
+
+ if (mtx != NULL) {
+ mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
+ WITNESS_SAVE(&mtx->mtx_object, mtx);
+ mtx_unlock(mtx);
+ if (priority & PDROP)
+ mtx = NULL;
+ }
+
+ KASSERT(p != NULL, ("msleep1"));
+ KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep"));
+
+ td->td_wchan = ident;
+ td->td_wmesg = wmesg;
+ td->td_kse->ke_slptime = 0; /* XXXKSE */
+ td->td_ksegrp->kg_slptime = 0;
+ td->td_priority = priority & PRIMASK;
+ CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)",
+ td, p->p_pid, p->p_comm, wmesg, ident);
+ TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq);
+ if (timo)
+ callout_reset(&td->td_slpcallout, timo, endtsleep, td);
+ /*
+ * We put ourselves on the sleep queue and start our timeout
+ * before calling cursig, as we could stop there, and a wakeup
+ * or a SIGCONT (or both) could occur while we were stopped.
+ * A SIGCONT would cause us to be marked as SSLEEP
+ * without resuming us, thus we must be ready for sleep
+ * when cursig is called. If the wakeup happens while we're
+ * stopped, td->td_wchan will be 0 upon return from cursig.
+ */
+ if (catch) {
+ CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p,
+ p->p_pid, p->p_comm);
+ td->td_flags |= TDF_SINTR;
+ mtx_unlock_spin(&sched_lock);
+ PROC_LOCK(p);
+ sig = cursig(p);
+ mtx_lock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ if (sig != 0) {
+ if (td->td_wchan != NULL)
+ unsleep(td);
+ } else if (td->td_wchan == NULL)
+ catch = 0;
+ } else
+ sig = 0;
+ if (td->td_wchan != NULL) {
+ td->td_proc->p_stat = SSLEEP;
+ p->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ }
+ CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid,
+ p->p_comm);
+ KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN"));
+ td->td_flags &= ~TDF_SINTR;
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ if (sig == 0)
+ rval = EWOULDBLOCK;
+ } else if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+ else if (timo && callout_stop(&td->td_slpcallout) == 0) {
+ /*
+ * This isn't supposed to be pretty. If we are here, then
+ * the endtsleep() callout is currently executing on another
+ * CPU and is either spinning on the sched_lock or will be
+ * soon. If we don't synchronize here, there is a chance
+ * that this process may msleep() again before the callout
+ * has a chance to run and the callout may end up waking up
+ * the wrong msleep(). Yuck.
+ */
+ td->td_flags |= TDF_TIMEOUT;
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (rval == 0 && catch) {
+ PROC_LOCK(p);
+	/* XXX: shouldn't we always be calling cursig()? */
+ if (sig != 0 || (sig = cursig(p))) {
+ if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
+ rval = EINTR;
+ else
+ rval = ERESTART;
+ }
+ PROC_UNLOCK(p);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0);
+#endif
+ PICKUP_GIANT();
+ if (mtx != NULL) {
+ mtx_lock(mtx);
+ WITNESS_RESTORE(&mtx->mtx_object, mtx);
+ }
+ return (rval);
+}
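+
+/*
+ * Example (illustrative sketch, not part of this file): the canonical
+ * msleep()/wakeup() pairing.  The sleeper re-checks its predicate under
+ * a mutex and sleeps on an arbitrary wait channel; the waker changes the
+ * state and calls wakeup() on the same channel.  The "xx" names below
+ * are hypothetical.
+ */
+#if 0
+static struct {
+	struct mtx	xx_mtx;
+	int		xx_ready;
+} xx;
+
+static int
+xx_wait(void)
+{
+	int error;
+
+	error = 0;
+	mtx_lock(&xx.xx_mtx);
+	while (xx.xx_ready == 0) {
+		error = msleep(&xx.xx_ready, &xx.xx_mtx, PRIBIO | PCATCH,
+		    "xxwait", hz);		/* wait at most one second */
+		if (error != 0 && error != EWOULDBLOCK)
+			break;			/* interrupted by a signal */
+	}
+	mtx_unlock(&xx.xx_mtx);
+	return (error);
+}
+
+static void
+xx_done(void)
+{
+	mtx_lock(&xx.xx_mtx);
+	xx.xx_ready = 1;
+	wakeup(&xx.xx_ready);			/* or wakeup_one() */
+	mtx_unlock(&xx.xx_mtx);
+}
+#endif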
+
+/*
+ * Implement timeout for msleep()
+ *
+ * If the process hasn't been awakened (wchan non-zero),
+ * set the timeout flag and undo the sleep. If the process
+ * is stopped, just unsleep it so it will remain stopped.
+ * MP-safe, called without the Giant mutex.
+ */
+static void
+endtsleep(arg)
+ void *arg;
+{
+ register struct thread *td = arg;
+
+ CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ mtx_lock_spin(&sched_lock);
+ /*
+ * This is the other half of the synchronization with msleep()
+ * described above. If the TDF_TIMEOUT flag is set, we lost the
+ * race and just need to put the process back on the runqueue.
+ */
+ if ((td->td_flags & TDF_TIMEOUT) != 0) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ setrunqueue(td);
+ } else if (td->td_wchan != NULL) {
+ if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */
+ setrunnable(td);
+ else
+ unsleep(td);
+ td->td_flags |= TDF_TIMEOUT;
+ } else {
+ td->td_flags |= TDF_TIMOFAIL;
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Remove a process from its wait queue
+ */
+void
+unsleep(struct thread *td)
+{
+
+ mtx_lock_spin(&sched_lock);
+ if (td->td_wchan != NULL) {
+ TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
+ td->td_wchan = NULL;
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Make all processes sleeping on the specified identifier runnable.
+ */
+void
+wakeup(ident)
+ register void *ident;
+{
+ register struct slpquehead *qp;
+ register struct thread *td;
+ struct thread *ntd;
+ struct proc *p;
+
+ mtx_lock_spin(&sched_lock);
+ qp = &slpque[LOOKUP(ident)];
+restart:
+ for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
+ ntd = TAILQ_NEXT(td, td_slpq);
+ p = td->td_proc;
+ if (td->td_wchan == ident) {
+ TAILQ_REMOVE(qp, td, td_slpq);
+ td->td_wchan = NULL;
+ if (td->td_proc->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(p); */
+ CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)",
+ td, p->p_pid, p->p_comm);
+ if (td->td_ksegrp->kg_slptime > 1)
+ updatepri(td);
+ td->td_ksegrp->kg_slptime = 0;
+ td->td_kse->ke_slptime = 0;
+ td->td_proc->p_stat = SRUN;
+ if (p->p_sflag & PS_INMEM) {
+ setrunqueue(td);
+ maybe_resched(td);
+ } else {
+ p->p_sflag |= PS_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ /* END INLINE EXPANSION */
+ goto restart;
+ }
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Make a process sleeping on the specified identifier runnable.
+ * May wake more than one process if a target process is currently
+ * swapped out.
+ */
+void
+wakeup_one(ident)
+ register void *ident;
+{
+ register struct slpquehead *qp;
+ register struct thread *td;
+ register struct proc *p;
+ struct thread *ntd;
+
+ mtx_lock_spin(&sched_lock);
+ qp = &slpque[LOOKUP(ident)];
+restart:
+ for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
+ ntd = TAILQ_NEXT(td, td_slpq);
+ p = td->td_proc;
+ if (td->td_wchan == ident) {
+ TAILQ_REMOVE(qp, td, td_slpq);
+ td->td_wchan = NULL;
+ if (td->td_proc->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(p); */
+ CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)",
+ p, p->p_pid, p->p_comm);
+ if (td->td_ksegrp->kg_slptime > 1)
+ updatepri(td);
+ td->td_ksegrp->kg_slptime = 0;
+ td->td_kse->ke_slptime = 0;
+ td->td_proc->p_stat = SRUN;
+ if (p->p_sflag & PS_INMEM) {
+ setrunqueue(td);
+ maybe_resched(td);
+ break;
+ } else {
+ p->p_sflag |= PS_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ /* END INLINE EXPANSION */
+ goto restart;
+ }
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * The machine independent parts of mi_switch().
+ */
+void
+mi_switch()
+{
+ struct bintime new_switchtime;
+ struct thread *td = curthread; /* XXX */
+ register struct proc *p = td->td_proc; /* XXX */
+#if 0
+ register struct rlimit *rlim;
+#endif
+ u_int sched_nest;
+
+ mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
+#ifdef INVARIANTS
+ if (p->p_stat != SMTX && p->p_stat != SRUN)
+ mtx_assert(&Giant, MA_NOTOWNED);
+#endif
+
+ /*
+ * Compute the amount of time during which the current
+ * process was running, and add that to its total so far.
+ */
+ binuptime(&new_switchtime);
+ bintime_add(&p->p_runtime, &new_switchtime);
+ bintime_sub(&p->p_runtime, PCPU_PTR(switchtime));
+
+#ifdef DDB
+ /*
+ * Don't perform context switches from the debugger.
+ */
+ if (db_active) {
+ mtx_unlock_spin(&sched_lock);
+ db_error("Context switches not allowed in the debugger.");
+ }
+#endif
+
+#if 0
+ /*
+ * Check if the process exceeds its cpu resource allocation.
+ * If over max, kill it.
+ *
+ * XXX drop sched_lock, pickup Giant
+ */
+ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
+ p->p_runtime > p->p_limit->p_cpulimit) {
+ rlim = &p->p_rlimit[RLIMIT_CPU];
+ if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
+ mtx_unlock_spin(&sched_lock);
+ PROC_LOCK(p);
+ killproc(p, "exceeded maximum CPU limit");
+ mtx_lock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ } else {
+ mtx_unlock_spin(&sched_lock);
+ PROC_LOCK(p);
+ psignal(p, SIGXCPU);
+ mtx_lock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ if (rlim->rlim_cur < rlim->rlim_max) {
+ /* XXX: we should make a private copy */
+ rlim->rlim_cur += 5;
+ }
+ }
+ }
+#endif
+
+ /*
+ * Pick a new current process and record its start time.
+ */
+ cnt.v_swtch++;
+ PCPU_SET(switchtime, new_switchtime);
+ CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid,
+ p->p_comm);
+ sched_nest = sched_lock.mtx_recurse;
+ td->td_lastcpu = td->td_kse->ke_oncpu;
+ td->td_kse->ke_oncpu = NOCPU;
+ td->td_kse->ke_flags &= ~KEF_NEEDRESCHED;
+ cpu_switch();
+ td->td_kse->ke_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_recurse = sched_nest;
+ sched_lock.mtx_lock = (uintptr_t)td;
+ CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid,
+ p->p_comm);
+ if (PCPU_GET(switchtime.sec) == 0)
+ binuptime(PCPU_PTR(switchtime));
+ PCPU_SET(switchticks, ticks);
+}
+
+/*
+ * Change process state to be runnable,
+ * placing it on the run queue if it is in memory,
+ * and awakening the swapper if it isn't in memory.
+ */
+void
+setrunnable(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ mtx_lock_spin(&sched_lock);
+ switch (p->p_stat) {
+ case SZOMB: /* not a thread flag XXXKSE */
+ panic("setrunnable(1)");
+ }
+ switch (td->td_proc->p_stat) {
+ case 0:
+ case SRUN:
+ case SWAIT:
+ default:
+ panic("setrunnable(2)");
+ case SSTOP:
+ case SSLEEP: /* e.g. when sending signals */
+ if (td->td_flags & TDF_CVWAITQ)
+ cv_waitq_remove(td);
+ else
+ unsleep(td);
+ break;
+
+ case SIDL:
+ break;
+ }
+ td->td_proc->p_stat = SRUN;
+ if (td->td_ksegrp->kg_slptime > 1)
+ updatepri(td);
+ td->td_ksegrp->kg_slptime = 0;
+ td->td_kse->ke_slptime = 0;
+ if ((p->p_sflag & PS_INMEM) == 0) {
+ p->p_sflag |= PS_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ } else {
+ setrunqueue(td);
+ maybe_resched(td);
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+void
+resetpriority(kg)
+ register struct ksegrp *kg;
+{
+ register unsigned int newpriority;
+ struct thread *td;
+
+ mtx_lock_spin(&sched_lock);
+ if (kg->kg_pri_class == PRI_TIMESHARE) {
+ newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
+ NICE_WEIGHT * (kg->kg_nice - PRIO_MIN);
+ newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
+ PRI_MAX_TIMESHARE);
+ kg->kg_user_pri = newpriority;
+ }
+ FOREACH_THREAD_IN_GROUP(kg, td) {
+ maybe_resched(td);
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Compute a tenex style load average of a quantity on
+ * 1, 5 and 15 minute intervals.
+ * XXXKSE Needs complete rewrite when correct info is available.
+ * Completely Bogus.. only works with 1:1 (but compiles ok now :-)
+ */
+static void
+loadav(void *arg)
+{
+ int i, nrun;
+ struct loadavg *avg;
+ struct proc *p;
+ struct ksegrp *kg;
+
+ avg = &averunnable;
+ sx_slock(&allproc_lock);
+ nrun = 0;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+ switch (p->p_stat) {
+ case SRUN:
+ if ((p->p_flag & P_NOLOAD) != 0)
+ goto nextproc;
+ /* FALLTHROUGH */
+ case SIDL:
+ nrun++;
+ }
+nextproc:
+ continue;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ for (i = 0; i < 3; i++)
+ avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+ nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+
+ /*
+ * Schedule the next update to occur after 5 seconds, but add a
+ * random variation to avoid synchronisation with processes that
+ * run at regular intervals.
+ */
+ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
+ loadav, NULL);
+}
+
+/* ARGSUSED */
+static void
+sched_setup(dummy)
+ void *dummy;
+{
+
+ callout_init(&schedcpu_callout, 1);
+ callout_init(&roundrobin_callout, 0);
+ callout_init(&loadav_callout, 0);
+
+ /* Kick off timeout driven events by calling first time. */
+ roundrobin(NULL);
+ schedcpu(NULL);
+ loadav(NULL);
+}
+
+/*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (p_estcpu) is increased here. resetpriority() will
+ * compute a different priority each time p_estcpu increases by
+ * INVERSE_ESTCPU_WEIGHT
+ * (until MAXPRI is reached). The cpu usage estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+void
+schedclock(td)
+ struct thread *td;
+{
+ struct kse *ke = td->td_kse;
+ struct ksegrp *kg = td->td_ksegrp;
+
+ if (td) {
+ ke->ke_cpticks++;
+ kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
+ if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+ resetpriority(td->td_ksegrp);
+ if (td->td_priority >= PUSER)
+ td->td_priority = kg->kg_user_pri;
+ }
+ } else {
+ panic("schedclock");
+ }
+}
+
+/*
+ * General purpose yield system call
+ */
+int
+yield(struct thread *td, struct yield_args *uap)
+{
+ struct ksegrp *kg = td->td_ksegrp;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = PRI_MAX_TIMESHARE;
+ setrunqueue(td);
+ kg->kg_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ td->td_retval[0] = 0;
+
+ return (0);
+}
+
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
new file mode 100644
index 0000000..2867bc9
--- /dev/null
+++ b/sys/kern/kern_syscalls.c
@@ -0,0 +1,123 @@
+/*-
+ * Copyright (c) 1999 Assar Westerlund
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/syscall.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+
+/*
+ * Acts like "nosys" but can be identified in sysent for dynamic call
+ * number assignment for a limited number of calls.
+ *
+ * Placeholder for system call slots reserved for loadable modules.
+ */
+int
+lkmnosys(struct thread *td, struct nosys_args *args)
+{
+ return(nosys(td, args));
+}
+
+int
+lkmressys(struct thread *td, struct nosys_args *args)
+{
+ return(nosys(td, args));
+}
+
+int
+syscall_register(int *offset, struct sysent *new_sysent,
+ struct sysent *old_sysent)
+{
+ if (*offset == NO_SYSCALL) {
+ int i;
+
+ for (i = 1; i < SYS_MAXSYSCALL; ++i)
+ if (sysent[i].sy_call == (sy_call_t *)lkmnosys)
+ break;
+ if (i == SYS_MAXSYSCALL)
+ return ENFILE;
+ *offset = i;
+ } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
+ return EINVAL;
+ else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
+ sysent[*offset].sy_call != (sy_call_t *)lkmressys)
+ return EEXIST;
+
+ *old_sysent = sysent[*offset];
+ sysent[*offset] = *new_sysent;
+ return 0;
+}
+
+int
+syscall_deregister(int *offset, struct sysent *old_sysent)
+{
+ if (*offset)
+ sysent[*offset] = *old_sysent;
+ return 0;
+}
+
+int
+syscall_module_handler(struct module *mod, int what, void *arg)
+{
+ struct syscall_module_data *data = (struct syscall_module_data*)arg;
+ modspecific_t ms;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD :
+ error = syscall_register(data->offset, data->new_sysent,
+ &data->old_sysent);
+ if (error)
+ return error;
+ ms.intval = *data->offset;
+ MOD_XLOCK;
+ module_setspecific(mod, &ms);
+ MOD_XUNLOCK;
+ if (data->chainevh)
+ error = data->chainevh(mod, what, data->chainarg);
+ return error;
+
+ case MOD_UNLOAD :
+ if (data->chainevh) {
+ error = data->chainevh(mod, what, data->chainarg);
+ if (error)
+ return error;
+ }
+ error = syscall_deregister(data->offset, &data->old_sysent);
+ return error;
+ }
+
+ if (data->chainevh)
+ return data->chainevh(mod, what, data->chainarg);
+ else
+ return 0;
+}
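+
+/*
+ * Example (illustrative sketch, not part of this file): how a loadable
+ * module typically plugs into the registration functions above.  The
+ * "hello" names are hypothetical; SYSCALL_MODULE() expands to a module
+ * declaration that uses syscall_module_handler() as its event handler.
+ */
+#if 0
+static int
+hello(struct thread *td, void *uap)
+{
+
+	printf("hello from a dynamically assigned syscall slot\n");
+	td->td_retval[0] = 0;
+	return (0);
+}
+
+static struct sysent hello_sysent = { 0, (sy_call_t *)hello };
+static int hello_offset = NO_SYSCALL;	/* let syscall_register() pick a slot */
+
+SYSCALL_MODULE(hello, &hello_offset, &hello_sysent, NULL, NULL);
+#endif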
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
new file mode 100644
index 0000000..6943bc5
--- /dev/null
+++ b/sys/kern/kern_sysctl.c
@@ -0,0 +1,1422 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
+static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
+
+/*
+ * Locking - this locks the sysctl tree in memory.
+ */
+static struct sx sysctllock;
+
+#define SYSCTL_LOCK() sx_xlock(&sysctllock)
+#define SYSCTL_UNLOCK() sx_xunlock(&sysctllock)
+#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl sysctllock")
+
+static int sysctl_root(SYSCTL_HANDLER_ARGS);
+
+struct sysctl_oid_list sysctl__children; /* root list */
+
+static struct sysctl_oid *
+sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
+{
+ struct sysctl_oid *oidp;
+
+ SLIST_FOREACH(oidp, list, oid_link) {
+ if (strcmp(oidp->oid_name, name) == 0) {
+ return (oidp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Initialization of the MIB tree.
+ *
+ * Order by number in each list.
+ */
+
+void
+sysctl_register_oid(struct sysctl_oid *oidp)
+{
+ struct sysctl_oid_list *parent = oidp->oid_parent;
+ struct sysctl_oid *p;
+ struct sysctl_oid *q;
+
+ /*
+ * First check if another oid with the same name already
+ * exists in the parent's list.
+ */
+ p = sysctl_find_oidname(oidp->oid_name, parent);
+ if (p != NULL) {
+ if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ p->oid_refcnt++;
+ return;
+ } else {
+ printf("can't re-use a leaf (%s)!\n", p->oid_name);
+ return;
+ }
+ }
+ /*
+ * If this oid has a number OID_AUTO, give it a number which
+ * is greater than any current oid.
+ * NOTE: DO NOT change the starting value here, change it in
+ * <sys/sysctl.h>, and make sure it is at least 256 to
+	 * accommodate e.g. net.inet.raw as a static sysctl node.
+ */
+ if (oidp->oid_number == OID_AUTO) {
+ static int newoid = CTL_AUTO_START;
+
+ oidp->oid_number = newoid++;
+ if (newoid == 0x7fffffff)
+ panic("out of oids");
+ }
+#if 0
+ else if (oidp->oid_number >= CTL_AUTO_START) {
+ /* do not panic; this happens when unregistering sysctl sets */
+ printf("static sysctl oid too high: %d", oidp->oid_number);
+ }
+#endif
+
+ /*
+ * Insert the oid into the parent's list in order.
+ */
+ q = NULL;
+ SLIST_FOREACH(p, parent, oid_link) {
+ if (oidp->oid_number < p->oid_number)
+ break;
+ q = p;
+ }
+ if (q)
+ SLIST_INSERT_AFTER(q, oidp, oid_link);
+ else
+ SLIST_INSERT_HEAD(parent, oidp, oid_link);
+}
+
+void
+sysctl_unregister_oid(struct sysctl_oid *oidp)
+{
+ SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link);
+}
+
+/* Initialize a new context to keep track of dynamically added sysctls. */
+int
+sysctl_ctx_init(struct sysctl_ctx_list *c)
+{
+
+ if (c == NULL) {
+ return (EINVAL);
+ }
+ TAILQ_INIT(c);
+ return (0);
+}
+
+/* Free the context, and destroy all dynamic oids registered in this context */
+int
+sysctl_ctx_free(struct sysctl_ctx_list *clist)
+{
+ struct sysctl_ctx_entry *e, *e1;
+ int error;
+
+ error = 0;
+ /*
+ * First perform a "dry run" to check if it's ok to remove oids.
+ * XXX FIXME
+ * XXX This algorithm is a hack. But I don't know any
+ * XXX better solution for now...
+ */
+ TAILQ_FOREACH(e, clist, link) {
+ error = sysctl_remove_oid(e->entry, 0, 0);
+ if (error)
+ break;
+ }
+ /*
+ * Restore deregistered entries, either from the end,
+	 * or from the place where the error occurred.
+	 * e contains the entry that could not be unregistered.
+ */
+ if (error)
+ e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
+ else
+ e1 = TAILQ_LAST(clist, sysctl_ctx_list);
+ while (e1 != NULL) {
+ sysctl_register_oid(e1->entry);
+ e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
+ }
+ if (error)
+ return(EBUSY);
+ /* Now really delete the entries */
+ e = TAILQ_FIRST(clist);
+ while (e != NULL) {
+ e1 = TAILQ_NEXT(e, link);
+ error = sysctl_remove_oid(e->entry, 1, 0);
+ if (error)
+ panic("sysctl_remove_oid: corrupt tree, entry: %s",
+ e->entry->oid_name);
+ free(e, M_SYSCTLOID);
+ e = e1;
+ }
+ return (error);
+}
+
+/* Add an entry to the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
+ e->entry = oidp;
+ TAILQ_INSERT_HEAD(clist, e, link);
+ return (e);
+}
+
+/* Find an entry in the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ TAILQ_FOREACH(e, clist, link) {
+ if(e->entry == oidp)
+ return(e);
+ }
+ return (e);
+}
+
+/*
+ * Delete an entry from the context.
+ * NOTE: this function doesn't free oidp! You have to remove it
+ * with sysctl_remove_oid().
+ */
+int
+sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ if (clist == NULL || oidp == NULL)
+ return (EINVAL);
+ e = sysctl_ctx_entry_find(clist, oidp);
+ if (e != NULL) {
+ TAILQ_REMOVE(clist, e, link);
+ free(e, M_SYSCTLOID);
+ return (0);
+ } else
+ return (ENOENT);
+}
+
+/*
+ * Remove dynamically created sysctl trees.
+ * oidp - top of the tree to be removed
+ * del - if 0 - just deregister, otherwise free up entries as well
+ * recurse - if != 0 traverse the subtree to be deleted
+ */
+int
+sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
+{
+ struct sysctl_oid *p;
+ int error;
+
+ if (oidp == NULL)
+ return(EINVAL);
+ if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
+ printf("can't remove non-dynamic nodes!\n");
+ return (EINVAL);
+ }
+ /*
+	 * WARNING: the normal way to do this is through
+	 * sysctl_ctx_free(). Use recursive removal only as a last-resort
+	 * method to purge your sysctl tree of leftovers...
+ * However, if some other code still references these nodes,
+ * it will panic.
+ */
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oidp->oid_refcnt == 1) {
+ SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) {
+ if (!recurse)
+ return (ENOTEMPTY);
+ error = sysctl_remove_oid(p, del, recurse);
+ if (error)
+ return (error);
+ }
+ if (del)
+ free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
+ }
+ }
+ if (oidp->oid_refcnt > 1 ) {
+ oidp->oid_refcnt--;
+ } else {
+ if (oidp->oid_refcnt == 0) {
+ printf("Warning: bad oid_refcnt=%u (%s)!\n",
+ oidp->oid_refcnt, oidp->oid_name);
+ return (EINVAL);
+ }
+ sysctl_unregister_oid(oidp);
+ if (del) {
+ if (oidp->descr)
+ free(oidp->descr, M_SYSCTLOID);
+ free((void *)(uintptr_t)(const void *)oidp->oid_name,
+ M_SYSCTLOID);
+ free(oidp, M_SYSCTLOID);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Create new sysctls at run time.
+ * clist may point to a valid context initialized with sysctl_ctx_init().
+ */
+struct sysctl_oid *
+sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
+ int number, const char *name, int kind, void *arg1, int arg2,
+ int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
+{
+ struct sysctl_oid *oidp;
+ ssize_t len;
+ char *newname;
+
+ /* You have to hook up somewhere.. */
+ if (parent == NULL)
+ return(NULL);
+ /* Check if the node already exists, otherwise create it */
+ oidp = sysctl_find_oidname(name, parent);
+ if (oidp != NULL) {
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ oidp->oid_refcnt++;
+ /* Update the context */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ return (oidp);
+ } else {
+ printf("can't re-use a leaf (%s)!\n", name);
+ return (NULL);
+ }
+ }
+ oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
+ oidp->oid_parent = parent;
+ SLIST_NEXT(oidp, oid_link) = NULL;
+ oidp->oid_number = number;
+ oidp->oid_refcnt = 1;
+ len = strlen(name);
+ newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
+ bcopy(name, newname, len + 1);
+ newname[len] = '\0';
+ oidp->oid_name = newname;
+ oidp->oid_handler = handler;
+ oidp->oid_kind = CTLFLAG_DYN | kind;
+ if ((kind & CTLTYPE) == CTLTYPE_NODE) {
+ /* Allocate space for children */
+ SYSCTL_CHILDREN(oidp) = malloc(sizeof(struct sysctl_oid_list),
+ M_SYSCTLOID, M_WAITOK);
+ SLIST_INIT(SYSCTL_CHILDREN(oidp));
+ } else {
+ oidp->oid_arg1 = arg1;
+ oidp->oid_arg2 = arg2;
+ }
+ oidp->oid_fmt = fmt;
+ if (descr) {
+ int len = strlen(descr) + 1;
+ oidp->descr = malloc(len, M_SYSCTLOID, M_WAITOK);
+ if (oidp->descr)
+ strcpy(oidp->descr, descr);
+ }
+ /* Update the context, if used */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ /* Register this oid */
+ sysctl_register_oid(oidp);
+ return (oidp);
+}
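+
+/*
+ * Example (illustrative sketch, not part of this file): dynamic creation
+ * of a small sysctl subtree from driver attach/detach code, using a
+ * context so that everything can be torn down with a single
+ * sysctl_ctx_free().  The "xx" names are hypothetical.
+ */
+#if 0
+static struct sysctl_ctx_list xx_ctx;
+static int xx_debug;
+
+static void
+xx_sysctl_attach(void)
+{
+	struct sysctl_oid *tree;
+
+	sysctl_ctx_init(&xx_ctx);
+	tree = SYSCTL_ADD_NODE(&xx_ctx, SYSCTL_STATIC_CHILDREN(_hw),
+	    OID_AUTO, "xx", CTLFLAG_RD, 0, "xx driver");
+	SYSCTL_ADD_INT(&xx_ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "debug",
+	    CTLFLAG_RW, &xx_debug, 0, "debug level");
+}
+
+static void
+xx_sysctl_detach(void)
+{
+
+	sysctl_ctx_free(&xx_ctx);	/* removes both oids created above */
+}
+#endif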
+
+/*
+ * Register the kernel's oids on startup.
+ */
+SET_DECLARE(sysctl_set, struct sysctl_oid);
+
+static void
+sysctl_register_all(void *arg)
+{
+ struct sysctl_oid **oidp;
+
+ SYSCTL_INIT();
+ SET_FOREACH(oidp, sysctl_set)
+ sysctl_register_oid(*oidp);
+}
+SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
+
+/*
+ * "Staff-functions"
+ *
+ * These functions implement a presently undocumented interface
+ * used by the sysctl program to walk the tree, and get the type
+ * so it can print the value.
+ * This interface is under work and consideration, and should probably
+ * be killed with a big axe by the first person who can find the time.
+ * (Be aware, though, that the proper interface isn't as obvious as it
+ * may seem; there are various conflicting requirements.)
+ *
+ * {0,0} printf the entire MIB-tree.
+ * {0,1,...} return the name of the "..." OID.
+ * {0,2,...} return the next OID.
+ * {0,3} return the OID of the name in "new"
+ * {0,4,...} return the kind & format info for the "..." OID.
+ * {0,5,...}	return the description of the "..." OID.
+ */
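+
+/*
+ * Example (illustrative sketch, not part of this file): a userland
+ * program resolves a name to its numeric OID through the {0,3} node
+ * roughly as follows (sysctlbyname(3) does the equivalent internally);
+ * the variable names are hypothetical:
+ *
+ *	int qoid[2] = { 0, 3 };		(sysctl internal magic, name2oid)
+ *	int oid[CTL_MAXNAME];
+ *	size_t len = sizeof(oid);
+ *
+ *	if (sysctl(qoid, 2, oid, &len, "kern.ostype",
+ *	    strlen("kern.ostype")) == 0)
+ *		printf("got %d ints of OID\n", (int)(len / sizeof(int)));
+ */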
+
+static void
+sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
+{
+ int k;
+ struct sysctl_oid *oidp;
+
+ SLIST_FOREACH(oidp, l, oid_link) {
+
+ for (k=0; k<i; k++)
+ printf(" ");
+
+ printf("%d %s ", oidp->oid_number, oidp->oid_name);
+
+ printf("%c%c",
+ oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
+ oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
+
+ if (oidp->oid_handler)
+ printf(" *Handler");
+
+ switch (oidp->oid_kind & CTLTYPE) {
+ case CTLTYPE_NODE:
+ printf(" Node\n");
+ if (!oidp->oid_handler) {
+ sysctl_sysctl_debug_dump_node(
+ oidp->oid_arg1, i+2);
+ }
+ break;
+ case CTLTYPE_INT: printf(" Int\n"); break;
+ case CTLTYPE_STRING: printf(" String\n"); break;
+ case CTLTYPE_QUAD: printf(" Quad\n"); break;
+ case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
+ default: printf("\n");
+ }
+
+ }
+}
+
+static int
+sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = suser(req->td);
+ if (error)
+ return error;
+ sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
+ return ENOENT;
+}
+
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+ 0, 0, sysctl_sysctl_debug, "-", "");
+
+static int
+sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int error = 0;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
+ char buf[10];
+
+ while (namelen) {
+ if (!lsp) {
+ snprintf(buf,sizeof(buf),"%d",*name);
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ if (error)
+ return (error);
+ namelen--;
+ name++;
+ continue;
+ }
+ lsp2 = 0;
+ SLIST_FOREACH(oid, lsp, oid_link) {
+ if (oid->oid_number != *name)
+ continue;
+
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, oid->oid_name,
+ strlen(oid->oid_name));
+ if (error)
+ return (error);
+
+ namelen--;
+ name++;
+
+ if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oid->oid_handler)
+ break;
+
+ lsp2 = (struct sysctl_oid_list *)oid->oid_arg1;
+ break;
+ }
+ lsp = lsp2;
+ }
+ return (SYSCTL_OUT(req, "", 1));
+}
+
+SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
+
+static int
+sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
+ int *next, int *len, int level, struct sysctl_oid **oidpp)
+{
+ struct sysctl_oid *oidp;
+
+ *len = level;
+ SLIST_FOREACH(oidp, lsp, oid_link) {
+ *next = oidp->oid_number;
+ *oidpp = oidp;
+
+ if (!namelen) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if (oidp->oid_handler)
+ /* We really should call the handler here...*/
+ return 0;
+ lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
+ if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
+ len, level+1, oidpp))
+ return 0;
+ goto next;
+ }
+
+ if (oidp->oid_number < *name)
+ continue;
+
+ if (oidp->oid_number > *name) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if (oidp->oid_handler)
+ return 0;
+ lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
+ next+1, len, level+1, oidpp))
+ return (0);
+ goto next;
+ }
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ continue;
+
+ if (oidp->oid_handler)
+ continue;
+
+ lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
+ len, level+1, oidpp))
+ return (0);
+ next:
+ namelen = 1;
+ *len = level;
+ }
+ return 1;
+}
+
+static int
+sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ int newoid[CTL_MAXNAME];
+
+ i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
+ if (i)
+ return ENOENT;
+ error = SYSCTL_OUT(req, newoid, j * sizeof (int));
+ return (error);
+}
+
+SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
+
+static int
+name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp)
+{
+ int i;
+ struct sysctl_oid *oidp;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ char *p;
+
+ if (!*name)
+ return ENOENT;
+
+ p = name + strlen(name) - 1 ;
+ if (*p == '.')
+ *p = '\0';
+
+ *len = 0;
+
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+
+ oidp = SLIST_FIRST(lsp);
+
+ while (oidp && *len < CTL_MAXNAME) {
+ if (strcmp(name, oidp->oid_name)) {
+ oidp = SLIST_NEXT(oidp, oid_link);
+ continue;
+ }
+ *oid++ = oidp->oid_number;
+ (*len)++;
+
+ if (!i) {
+ if (oidpp)
+ *oidpp = oidp;
+ return (0);
+ }
+
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oidp->oid_handler)
+ break;
+
+ lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
+ oidp = SLIST_FIRST(lsp);
+ name = p+1;
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+ }
+ return ENOENT;
+}
+
+static int
+sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
+{
+ char *p;
+ int error, oid[CTL_MAXNAME], len;
+ struct sysctl_oid *op = 0;
+
+ if (!req->newlen)
+ return ENOENT;
+ if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */
+ return (ENAMETOOLONG);
+
+ p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
+
+ error = SYSCTL_IN(req, p, req->newlen);
+ if (error) {
+ free(p, M_SYSCTL);
+ return (error);
+ }
+
+ p [req->newlen] = '\0';
+
+ error = name2oid(p, oid, &len, &op);
+
+ free(p, M_SYSCTL);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_OUT(req, oid, len * sizeof *oid);
+ return (error);
+}
+
+SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0,
+ sysctl_sysctl_name2oid, "I", "");
+
+static int
+sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ return (error);
+
+ if (!oid->oid_fmt)
+ return (ENOENT);
+ error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
+ return (error);
+}
+
+
+SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, "");
+
+static int
+sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ return (error);
+
+ if (!oid->descr)
+ return (ENOENT);
+ error = SYSCTL_OUT(req, oid->descr, strlen(oid->descr) + 1);
+ return (error);
+}
+
+SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, "");
+
+/*
+ * Default "handler" functions.
+ */
+
+/*
+ * Handle an int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+
+ if (arg1)
+ error = SYSCTL_OUT(req, arg1, sizeof(int));
+ else
+ error = SYSCTL_OUT(req, &arg2, sizeof(int));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(int));
+ return (error);
+}
+
+/*
+ * Handle a long, signed or unsigned. arg1 points to it.
+ */
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+
+ if (!arg1)
+ return (EINVAL);
+ error = SYSCTL_OUT(req, arg1, sizeof(long));
+
+ if (error || !req->newptr)
+ return (error);
+
+ error = SYSCTL_IN(req, arg1, sizeof(long));
+ return (error);
+}
+
+/*
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
+ */
+
+int
+sysctl_handle_string(SYSCTL_HANDLER_ARGS)
+{
+ int error=0;
+
+ error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1);
+
+ if (error || !req->newptr)
+ return (error);
+
+ if ((req->newlen - req->newidx) >= arg2) {
+ error = EINVAL;
+ } else {
+ arg2 = (req->newlen - req->newidx);
+ error = SYSCTL_IN(req, arg1, arg2);
+ ((char *)arg1)[arg2] = '\0';
+ }
+
+ return (error);
+}
+
+/*
+ * Handle any kind of opaque data.
+ * arg1 points to it, arg2 is the size.
+ */
+
+int
+sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = SYSCTL_OUT(req, arg1, arg2);
+
+ if (error || !req->newptr)
+ return (error);
+
+ error = SYSCTL_IN(req, arg1, arg2);
+
+ return (error);
+}
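+
+/*
+ * Example (illustrative sketch, not part of this file): a typical
+ * SYSCTL_PROC handler built on top of sysctl_handle_int(), rejecting
+ * out-of-range values before committing them.  The "xx" names are
+ * hypothetical.
+ */
+#if 0
+static int xx_level = 1;
+
+static int
+sysctl_xx_level(SYSCTL_HANDLER_ARGS)
+{
+	int error, value;
+
+	value = xx_level;
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 0 || value > 10)
+		return (EINVAL);
+	xx_level = value;
+	return (0);
+}
+
+SYSCTL_PROC(_debug, OID_AUTO, xx_level, CTLTYPE_INT | CTLFLAG_RW,
+    0, 0, sysctl_xx_level, "I", "xx level (0-10)");
+#endif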
+
+/*
+ * Transfer functions to/from kernel space.
+ * XXX: rather untested at this point
+ */
+static int
+sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i = 0;
+
+ if (req->oldptr) {
+ i = l;
+ if (req->oldlen <= req->oldidx)
+ i = 0;
+ else
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ bcopy(p, (char *)req->oldptr + req->oldidx, i);
+ }
+ req->oldidx += l;
+ if (req->oldptr && i != l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
+{
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ bcopy((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (0);
+}
+
+int
+kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, void *new, size_t newlen, size_t *retval)
+{
+ int error = 0;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+
+ if (oldlenp) {
+ req.oldlen = *oldlenp;
+ }
+
+ if (old) {
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_kernel;
+ req.newfunc = sysctl_new_kernel;
+ req.lock = 1;
+
+ SYSCTL_LOCK();
+
+ error = sysctl_root(0, name, namelen, &req);
+
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen);
+
+ SYSCTL_UNLOCK();
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+int
+kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
+ void *new, size_t newlen, size_t *retval)
+{
+ int oid[CTL_MAXNAME];
+ size_t oidlen, plen;
+ int error;
+
+ oid[0] = 0; /* sysctl internal magic */
+ oid[1] = 3; /* name2oid */
+ oidlen = sizeof(oid);
+
+ error = kernel_sysctl(td, oid, 2, oid, &oidlen,
+ (void *)name, strlen(name), &plen);
+ if (error)
+ return (error);
+
+ error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
+ new, newlen, retval);
+ return (error);
+}
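+
+/*
+ * Example (illustrative sketch, not part of this file): reading an
+ * integer sysctl by name from elsewhere in the kernel.
+ */
+#if 0
+static void
+example_read_maxproc(struct thread *td)
+{
+	int value;
+	size_t len;
+
+	len = sizeof(value);
+	if (kernel_sysctlbyname(td, "kern.maxproc", &value, &len,
+	    NULL, 0, NULL) == 0)
+		printf("kern.maxproc = %d\n", value);
+}
+#endif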
+
+/*
+ * Transfer function to/from user space.
+ */
+static int
+sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
+{
+ int error = 0;
+ size_t i = 0;
+
+ if (req->lock == 1 && req->oldptr) {
+ vslock(req->oldptr, req->oldlen);
+ req->lock = 2;
+ }
+ if (req->oldptr) {
+ i = l;
+ if (req->oldlen <= req->oldidx)
+ i = 0;
+ else
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ error = copyout(p, (char *)req->oldptr + req->oldidx,
+ i);
+ }
+ req->oldidx += l;
+ if (error)
+ return (error);
+ if (req->oldptr && i < l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
+{
+ int error;
+
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ error = copyin((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (error);
+}
+
+int
+sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
+ int *nindx, struct sysctl_req *req)
+{
+ struct sysctl_oid *oid;
+ int indx;
+
+ oid = SLIST_FIRST(&sysctl__children);
+ indx = 0;
+ while (oid && indx < CTL_MAXNAME) {
+ if (oid->oid_number == name[indx]) {
+ indx++;
+ if (oid->oid_kind & CTLFLAG_NOLOCK)
+ req->lock = 0;
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oid->oid_handler != NULL ||
+ indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ return (0);
+ }
+ oid = SLIST_FIRST(
+ (struct sysctl_oid_list *)oid->oid_arg1);
+ } else if (indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ return (0);
+ } else {
+ return (ENOTDIR);
+ }
+ } else {
+ oid = SLIST_NEXT(oid, oid_link);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * Traverse our tree, and find the right node, execute whatever it points
+ * to, and return the resulting error code.
+ */
+
+int
+sysctl_root(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error, indx;
+
+ error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
+ if (error)
+ return (error);
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ /*
+ * You can't call a sysctl when it's a node, but has
+ * no handler. Inform the user that it's a node.
+ * The indx may or may not be the same as namelen.
+ */
+ if (oid->oid_handler == NULL)
+ return (EISDIR);
+ }
+
+ /* Is this sysctl writable? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
+ return (EPERM);
+
+ KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
+
+ /* Is this sysctl sensitive to securelevels? */
+ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
+ error = securelevel_gt(req->td->td_ucred, 0);
+ if (error)
+ return (error);
+ }
+
+ /* Is this sysctl writable by only privileged users? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
+ int flags;
+
+ if (oid->oid_kind & CTLFLAG_PRISON)
+ flags = PRISON_ROOT;
+ else
+ flags = 0;
+ error = suser_cred(req->td->td_ucred, flags);
+ if (error)
+ return (error);
+ }
+
+ if (!oid->oid_handler)
+ return EINVAL;
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE)
+ error = oid->oid_handler(oid, (int *)arg1 + indx, arg2 - indx,
+ req);
+ else
+ error = oid->oid_handler(oid, oid->oid_arg1, oid->oid_arg2,
+ req);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sysctl_args {
+ int *name;
+ u_int namelen;
+ void *old;
+ size_t *oldlenp;
+ void *new;
+ size_t newlen;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+__sysctl(struct thread *td, struct sysctl_args *uap)
+{
+ int error, name[CTL_MAXNAME];
+ size_t j;
+
+ if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
+ return (EINVAL);
+
+ error = copyin(uap->name, &name, uap->namelen * sizeof(int));
+ if (error)
+ return (error);
+
+ mtx_lock(&Giant);
+
+ error = userland_sysctl(td, name, uap->namelen,
+ uap->old, uap->oldlenp, 0,
+ uap->new, uap->newlen, &j);
+ if (error && error != ENOMEM)
+ goto done2;
+ if (uap->oldlenp) {
+ int i = copyout(&j, uap->oldlenp, sizeof(j));
+ if (i)
+ error = i;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * This is used from various compatibility syscalls too. That's why name
+ * must be in kernel space.
+ */
+int
+userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval)
+{
+ int error = 0;
+ struct sysctl_req req, req2;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+
+ if (oldlenp) {
+ if (inkernel) {
+ req.oldlen = *oldlenp;
+ } else {
+ error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
+ if (error)
+ return (error);
+ }
+ }
+
+ if (old) {
+ if (!useracc(old, req.oldlen, VM_PROT_WRITE))
+ return (EFAULT);
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ if (!useracc(new, req.newlen, VM_PROT_READ))
+ return (EFAULT);
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_user;
+ req.newfunc = sysctl_new_user;
+ req.lock = 1;
+
+ SYSCTL_LOCK();
+
+ do {
+ req2 = req;
+ error = sysctl_root(0, name, namelen, &req2);
+ } while (error == EAGAIN);
+
+ req = req2;
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen);
+
+ SYSCTL_UNLOCK();
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43
+#include <sys/socket.h>
+#include <vm/vm_param.h>
+
+#define KINFO_PROC (0<<8)
+#define KINFO_RT (1<<8)
+#define KINFO_VNODE (2<<8)
+#define KINFO_FILE (3<<8)
+#define KINFO_METER (4<<8)
+#define KINFO_LOADAVG (5<<8)
+#define KINFO_CLOCKRATE (6<<8)
+
+/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
+#define KINFO_BSDI_SYSINFO (101<<8)
+
+/*
+ * XXX this is bloat, but I hope it's better here than on the potentially
+ * limited kernel stack... -Peter
+ */
+
+static struct {
+ int bsdi_machine; /* "i386" on BSD/386 */
+/* ^^^ this is an offset to the string, relative to the struct start */
+ char *pad0;
+ long pad1;
+ long pad2;
+ long pad3;
+ u_long pad4;
+ u_long pad5;
+ u_long pad6;
+
+ int bsdi_ostype; /* "BSD/386" on BSD/386 */
+ int bsdi_osrelease; /* "1.1" on BSD/386 */
+ long pad7;
+ long pad8;
+ char *pad9;
+
+ long pad10;
+ long pad11;
+ int pad12;
+ long pad13;
+ quad_t pad14;
+ long pad15;
+
+ struct timeval pad16;
+	/* we don't set this, because BSDI's uname used gethostname() instead */
+ int bsdi_hostname; /* hostname on BSD/386 */
+
+ /* the actual string data is appended here */
+
+} bsdi_si;
+/*
+ * This data is appended to the end of the bsdi_si structure during copyout.
+ * The "char *" offsets are relative to the base of the bsdi_si struct.
+ * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
+ * should not exceed the length of the buffer here... (or else!! :-)
+ */
+static char bsdi_strings[80]; /* It had better be less than this! */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getkerninfo_args {
+ int op;
+ char *where;
+ size_t *size;
+ int arg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
+{
+ int error, name[6];
+ size_t size;
+ u_int needed = 0;
+
+ mtx_lock(&Giant);
+
+ switch (uap->op & 0xff00) {
+
+ case KINFO_RT:
+ name[0] = CTL_NET;
+ name[1] = PF_ROUTE;
+ name[2] = 0;
+ name[3] = (uap->op & 0xff0000) >> 16;
+ name[4] = uap->op & 0xff;
+ name[5] = uap->arg;
+ error = userland_sysctl(td, name, 6, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_VNODE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_VNODE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_PROC:
+ name[0] = CTL_KERN;
+ name[1] = KERN_PROC;
+ name[2] = uap->op & 0xff;
+ name[3] = uap->arg;
+ error = userland_sysctl(td, name, 4, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_FILE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_FILE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_METER:
+ name[0] = CTL_VM;
+ name[1] = VM_METER;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_LOADAVG:
+ name[0] = CTL_VM;
+ name[1] = VM_LOADAVG;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_CLOCKRATE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_CLOCKRATE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_BSDI_SYSINFO: {
+ /*
+ * this is pretty crude, but it's just enough for uname()
+ * from BSDI's 1.x libc to work.
+ *
+ * *size gives the size of the buffer before the call, and
+ * the amount of data copied after a successful call.
+ * If successful, the return value is the amount of data
+ * available, which can be larger than *size.
+ *
+ * BSDI's 2.x product apparently fails with ENOMEM if *size
+ * is too small.
+ */
+
+ u_int left;
+ char *s;
+
+ bzero((char *)&bsdi_si, sizeof(bsdi_si));
+ bzero(bsdi_strings, sizeof(bsdi_strings));
+
+ s = bsdi_strings;
+
+ bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, ostype);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, osrelease);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, machine);
+ s += strlen(s) + 1;
+
+ needed = sizeof(bsdi_si) + (s - bsdi_strings);
+
+ if ((uap->where == NULL) || (uap->size == NULL)) {
+ /* process is asking how much buffer to supply.. */
+ size = needed;
+ error = 0;
+ break;
+ }
+
+ if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
+ break;
+
+ /* if too much buffer supplied, trim it down */
+ if (size > needed)
+ size = needed;
+
+ /* how much of the buffer is remaining */
+ left = size;
+
+ if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
+ break;
+
+ /* is there any point in continuing? */
+ if (left > sizeof(bsdi_si)) {
+ left -= sizeof(bsdi_si);
+ error = copyout(&bsdi_strings,
+ uap->where + sizeof(bsdi_si), left);
+ }
+ break;
+ }
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ if (error == 0) {
+ td->td_retval[0] = needed ? needed : size;
+ if (uap->size) {
+ error = copyout((caddr_t)&size, (caddr_t)uap->size,
+ sizeof(size));
+ }
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif /* COMPAT_43 */
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
new file mode 100644
index 0000000..fabc204
--- /dev/null
+++ b/sys/kern/kern_tc.c
@@ -0,0 +1,684 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/timepps.h>
+#include <sys/timetc.h>
+#include <sys/timex.h>
+
+/*
+ * Implement a dummy timecounter which we can use until we get a real one
+ * in the air. This allows the console and other early stuff to use
+ * time services.
+ */
+
+static u_int
+dummy_get_timecount(struct timecounter *tc)
+{
+ static u_int now;
+
+ return (++now);
+}
+
+static struct timecounter dummy_timecounter = {
+ dummy_get_timecount, 0, ~0u, 1000000, "dummy",
+};
+
+struct timehands {
+ /* These fields must be initialized by the driver. */
+ struct timecounter *th_counter;
+ int64_t th_adjustment;
+ u_int64_t th_scale;
+ u_int th_offset_count;
+ struct bintime th_offset;
+ struct timeval th_microtime;
+ struct timespec th_nanotime;
+ /* Fields not to be copied in tc_windup start with th_generation. */
+ volatile u_int th_generation;
+ struct timehands *th_next;
+};
+
+extern struct timehands th0;
+static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
+static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
+static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
+static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7};
+static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6};
+static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5};
+static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4};
+static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3};
+static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2};
+static struct timehands th0 = {
+ &dummy_timecounter,
+ 0,
+ (uint64_t)-1 / 1000000,
+ 0,
+ {1, 0},
+ {0, 0},
+ {0, 0},
+ 1,
+ &th1
+};
+
+static struct timehands *volatile timehands = &th0;
+struct timecounter *timecounter = &dummy_timecounter;
+static struct timecounter *timecounters = &dummy_timecounter;
+
+time_t time_second = 1;
+
+static struct bintime boottimebin;
+struct timeval boottime;
+SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
+ &boottime, timeval, "System boottime");
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+
+#define TC_STATS(foo) \
+ static u_int foo; \
+ SYSCTL_UINT(_kern_timecounter, OID_AUTO, foo, CTLFLAG_RD, &foo, 0, "") \
+ struct __hack
+
+TC_STATS(nbinuptime); TC_STATS(nnanouptime); TC_STATS(nmicrouptime);
+TC_STATS(nbintime); TC_STATS(nnanotime); TC_STATS(nmicrotime);
+TC_STATS(ngetbinuptime); TC_STATS(ngetnanouptime); TC_STATS(ngetmicrouptime);
+TC_STATS(ngetbintime); TC_STATS(ngetnanotime); TC_STATS(ngetmicrotime);
+
+#undef TC_STATS
+
+static void tc_windup(void);
+
+/*
+ * Return the difference between the timehands' counter value now and what
+ * was when we copied it to the timehands' offset_count.
+ */
+static __inline u_int
+tc_delta(struct timehands *th)
+{
+ struct timecounter *tc;
+
+ tc = th->th_counter;
+ return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
+ tc->tc_counter_mask);
+}
+
+/*
+ * Functions for reading the time. We have to loop until we are sure that
+ * the timehands that we operated on was not updated under our feet. See
+ * the comment in <sys/time.h> for a description of these 12 functions.
+ */
+
+void
+binuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ nbinuptime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ bintime_addx(bt, th->th_scale * tc_delta(th));
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ nnanouptime++;
+ binuptime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ nmicrouptime++;
+ binuptime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+bintime(struct bintime *bt)
+{
+
+ nbintime++;
+ binuptime(bt);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ nnanotime++;
+ bintime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ nmicrotime++;
+ bintime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+getbinuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetbinuptime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetnanouptime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timespec(&th->th_offset, tsp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetmicrouptime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timeval(&th->th_offset, tvp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getbintime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetbintime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetnanotime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrotime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ ngetmicrotime++;
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tvp = th->th_microtime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+/*
+ * Initialize a new timecounter.
+ * We should really try to rank the timecounters and intelligently determine
+ * if the new timecounter is better than the current one. This is subject
+ * to further study. For now always use the new timecounter.
+ */
+void
+tc_init(struct timecounter *tc)
+{
+
+ tc->tc_next = timecounters;
+ timecounters = tc;
+ printf("Timecounter \"%s\" frequency %lu Hz\n",
+ tc->tc_name, (u_long)tc->tc_frequency);
+ (void)tc->tc_get_timecount(tc);
+ (void)tc->tc_get_timecount(tc);
+ timecounter = tc;
+}
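+
+/*
+ * Example (illustrative sketch, not part of this file): what a clock
+ * driver registering with tc_init() typically looks like.  The "xx"
+ * names and the counter values are hypothetical; the initializer order
+ * matches the dummy timecounter above.
+ */
+#if 0
+static u_int
+xx_get_timecount(struct timecounter *tc)
+{
+
+	return (xx_read_counter_register());	/* hardware access, hypothetical */
+}
+
+static struct timecounter xx_timecounter = {
+	xx_get_timecount,	/* get_timecount */
+	0,			/* no poll_pps */
+	0xffff,			/* counter_mask: a 16 bit counter */
+	1193182,		/* frequency in Hz */
+	"xx"			/* name */
+};
+
+static void
+xx_attach(void)
+{
+
+	tc_init(&xx_timecounter);
+}
+#endif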
+
+/* Report the frequency of the current timecounter. */
+u_int32_t
+tc_getfrequency(void)
+{
+
+ return (timehands->th_counter->tc_frequency);
+}
+
+/*
+ * Step our concept of GMT. This is done by modifying our estimate of
+ * when we booted. XXX: needs further work.
+ */
+void
+tc_setclock(struct timespec *ts)
+{
+ struct timespec ts2;
+
+ nanouptime(&ts2);
+ boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
+ /* XXX boottime should probably be a timespec. */
+ boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
+ if (boottime.tv_usec < 0) {
+ boottime.tv_usec += 1000000;
+ boottime.tv_sec--;
+ }
+ timeval2bintime(&boottime, &boottimebin);
+
+ /* XXX fiddle all the little crinkly bits around the fiords... */
+ tc_windup();
+}
+
+/*
+ * Initialize the next struct timehands in the ring and make
+ * it the active timehands. Along the way we might switch to a different
+ * timecounter and/or do seconds processing in NTP. Slightly magic.
+ */
+static void
+tc_windup(void)
+{
+ struct bintime bt;
+ struct timehands *th, *tho;
+ u_int64_t scale;
+ u_int delta, ncount, ogen;
+ int i;
+
+ /*
+ * Make the next timehands a copy of the current one, but do not
+ * overwrite the generation or next pointer. While we update
+ * the contents, the generation must be zero.
+ */
+ tho = timehands;
+ th = tho->th_next;
+ ogen = th->th_generation;
+ th->th_generation = 0;
+ bcopy(tho, th, offsetof(struct timehands, th_generation));
+
+ /*
+ * Capture a timecounter delta on the current timecounter and if
+ * changing timecounters, a counter value from the new timecounter.
+ * Update the offset fields accordingly.
+ */
+ delta = tc_delta(th);
+ if (th->th_counter != timecounter)
+ ncount = timecounter->tc_get_timecount(timecounter);
+ else
+ ncount = 0;
+ th->th_offset_count += delta;
+ th->th_offset_count &= th->th_counter->tc_counter_mask;
+ bintime_addx(&th->th_offset, th->th_scale * delta);
+
+ /*
+ * Hardware latching timecounters may not generate interrupts on
+ * PPS events, so instead we poll them. There is a finite risk that
+ * the hardware might capture a count which is later than the one we
+ * got above, and therefore possibly in the next NTP second which might
+ * have a different rate than the current NTP second. It doesn't
+ * matter in practice.
+ */
+ if (tho->th_counter->tc_poll_pps)
+ tho->th_counter->tc_poll_pps(tho->th_counter);
+
+ /*
+ * Deal with NTP second processing. The for loop normally only
+ * iterates once, but in extreme situations it might keep NTP sane
+ * if timeouts are not run for several seconds.
+ */
+ for (i = th->th_offset.sec - tho->th_offset.sec; i > 0; i--)
+ ntp_update_second(&th->th_adjustment, &th->th_offset.sec);
+
+ /* Now is a good time to change timecounters. */
+ if (th->th_counter != timecounter) {
+ th->th_counter = timecounter;
+ th->th_offset_count = ncount;
+ }
+
+ /*-
+ * Recalculate the scaling factor. We want the number of 1/2^64
+ * fractions of a second per period of the hardware counter, taking
+ * into account the th_adjustment factor which the NTP PLL/adjtime(2)
+ * processing provides us with.
+ *
+ * The th_adjustment is in nanoseconds per second as a 32 bit binary
+ * fraction, and we want a 64 bit binary fraction of a second:
+ *
+ * x = a * 2^32 / 10^9 = a * 4.294967296
+ *
+ * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
+ * we can only multiply by about 850 without overflowing, but that
+ * leaves suitably precise fractions for multiply before divide.
+ *
+ * Divide before multiply with a fraction of 2199/512 results in a
+ * systematic undercompensation of 10PPM of th_adjustment. On a
+ * 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
+ *
+ * We happily sacrifice the lowest of the 64 bits of our result
+ * to the goddess of code clarity.
+ *
+ */
+ scale = (u_int64_t)1 << 63;
+ scale += (th->th_adjustment / 1024) * 2199;
+ scale /= th->th_counter->tc_frequency;
+ th->th_scale = scale * 2;
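+
+ /*-
+ * A worked check of the figures above: the ideal multiplier is
+ * 2^32 / 10^9 = 4.294967296, while the code effectively applies
+ * (a / 1024) * 2199 * 2 = a * 4398 / 1024 = a * 4.2949219, about
+ * 10.6PPM short of the ideal, which is where the 10PPM and 0.05PPM
+ * figures quoted above come from.
+ */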
+
+ /* Update the GMT timestamps used for the get*() functions. */
+ bt = th->th_offset;
+ bintime_add(&bt, &boottimebin);
+ bintime2timeval(&bt, &th->th_microtime);
+ bintime2timespec(&bt, &th->th_nanotime);
+
+ /*
+ * Now that the struct timehands is again consistent, set the new
+ * generation number, making sure to not make it zero.
+ */
+ if (++ogen == 0)
+ ogen = 1;
+ th->th_generation = ogen;
+
+ /* Go live with the new struct timehands. */
+ time_second = th->th_microtime.tv_sec;
+ timehands = th;
+}
+
+/* Report or change the active timecounter hardware. */
+static int
+sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
+{
+ char newname[32];
+ struct timecounter *newtc, *tc;
+ int error;
+
+ tc = timecounter;
+ strncpy(newname, tc->tc_name, sizeof(newname));
+ newname[sizeof(newname) - 1] = '\0';
+ error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
+ if (error != 0 || req->newptr == NULL ||
+ strcmp(newname, tc->tc_name) == 0)
+ return (error);
+ for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
+ if (strcmp(newname, newtc->tc_name) != 0)
+ continue;
+
+ /* Warm up new timecounter. */
+ (void)newtc->tc_get_timecount(newtc);
+ (void)newtc->tc_get_timecount(newtc);
+
+ timecounter = newtc;
+ return (0);
+ }
+ return (EINVAL);
+}
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_kern_timecounter_hardware, "A", "");
+
+/*
+ * RFC 2783 PPS-API implementation.
+ */
+
+int
+pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
+{
+ pps_params_t *app;
+ struct pps_fetch_args *fapi;
+#ifdef PPS_SYNC
+ struct pps_kcbind_args *kapi;
+#endif
+
+ switch (cmd) {
+ case PPS_IOC_CREATE:
+ return (0);
+ case PPS_IOC_DESTROY:
+ return (0);
+ case PPS_IOC_SETPARAMS:
+ app = (pps_params_t *)data;
+ if (app->mode & ~pps->ppscap)
+ return (EINVAL);
+ pps->ppsparam = *app;
+ return (0);
+ case PPS_IOC_GETPARAMS:
+ app = (pps_params_t *)data;
+ *app = pps->ppsparam;
+ app->api_version = PPS_API_VERS_1;
+ return (0);
+ case PPS_IOC_GETCAP:
+ *(int*)data = pps->ppscap;
+ return (0);
+ case PPS_IOC_FETCH:
+ fapi = (struct pps_fetch_args *)data;
+ if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec)
+ return (EOPNOTSUPP);
+ pps->ppsinfo.current_mode = pps->ppsparam.mode;
+ fapi->pps_info_buf = pps->ppsinfo;
+ return (0);
+ case PPS_IOC_KCBIND:
+#ifdef PPS_SYNC
+ kapi = (struct pps_kcbind_args *)data;
+ /* XXX Only root should be able to do this */
+ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (kapi->kernel_consumer != PPS_KC_HARDPPS)
+ return (EINVAL);
+ if (kapi->edge & ~pps->ppscap)
+ return (EINVAL);
+ pps->kcmode = kapi->edge;
+ return (0);
+#else
+ return (EOPNOTSUPP);
+#endif
+ default:
+ return (ENOTTY);
+ }
+}
+
+void
+pps_init(struct pps_state *pps)
+{
+ pps->ppscap |= PPS_TSFMT_TSPEC;
+ if (pps->ppscap & PPS_CAPTUREASSERT)
+ pps->ppscap |= PPS_OFFSETASSERT;
+ if (pps->ppscap & PPS_CAPTURECLEAR)
+ pps->ppscap |= PPS_OFFSETCLEAR;
+}
+
+void
+pps_capture(struct pps_state *pps)
+{
+ struct timehands *th;
+
+ th = timehands;
+ pps->capgen = th->th_generation;
+ pps->capth = th;
+ pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
+ if (pps->capgen != th->th_generation)
+ pps->capgen = 0;
+}
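+
+/*
+ * Illustrative driver usage of the two-step protocol above, assuming a
+ * hypothetical driver softc 'sc' with an embedded pps_state: the interrupt
+ * handler calls pps_capture(&sc->pps) as early as possible to latch the
+ * counter and generation cheaply, and once the edge is known it calls
+ * pps_event(&sc->pps, PPS_CAPTUREASSERT) to convert the captured count
+ * into a timestamp, or to quietly discard it if the timehands were wound
+ * up in between.
+ */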
+
+void
+pps_event(struct pps_state *pps, int event)
+{
+ struct bintime bt;
+ struct timespec ts, *tsp, *osp;
+ u_int tcount, *pcount;
+ int foff, fhard;
+ pps_seq_t *pseq;
+
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
+ return;
+
+ /* Things would be easier with arrays. */
+ if (event == PPS_CAPTUREASSERT) {
+ tsp = &pps->ppsinfo.assert_timestamp;
+ osp = &pps->ppsparam.assert_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
+ fhard = pps->kcmode & PPS_CAPTUREASSERT;
+ pcount = &pps->ppscount[0];
+ pseq = &pps->ppsinfo.assert_sequence;
+ } else {
+ tsp = &pps->ppsinfo.clear_timestamp;
+ osp = &pps->ppsparam.clear_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
+ fhard = pps->kcmode & PPS_CAPTURECLEAR;
+ pcount = &pps->ppscount[1];
+ pseq = &pps->ppsinfo.clear_sequence;
+ }
+
+ /*
+ * If the timecounter changed, we cannot compare the count values, so
+ * we have to drop the rest of the PPS-stuff until the next event.
+ */
+ if (pps->ppstc != pps->capth->th_counter) {
+ pps->ppstc = pps->capth->th_counter;
+ *pcount = pps->capcount;
+ pps->ppscount[2] = pps->capcount;
+ return;
+ }
+
+ /* Return if nothing really happened. */
+ if (*pcount == pps->capcount)
+ return;
+
+ /* Convert the count to a timespec. */
+ tcount = pps->capcount - pps->capth->th_offset_count;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ bt = pps->capth->th_offset;
+ bintime_addx(&bt, pps->capth->th_scale * tcount);
+ bintime_add(&bt, &boottimebin);
+ bintime2timespec(&bt, &ts);
+
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen != pps->capth->th_generation)
+ return;
+
+ *pcount = pps->capcount;
+ (*pseq)++;
+ *tsp = ts;
+
+ if (foff) {
+ timespecadd(tsp, osp);
+ if (tsp->tv_nsec < 0) {
+ tsp->tv_nsec += 1000000000;
+ tsp->tv_sec -= 1;
+ }
+ }
+#ifdef PPS_SYNC
+ if (fhard) {
+ /*
+ * Feed the NTP PLL/FLL.
+ * The FLL wants to know how many nanoseconds elapsed since
+ * the previous event.
+ * I have never been able to convince myself that this code
+ * is actually correct: Using th_scale is bound to contain
+ * a phase correction component from userland, when running
+ * as FLL, so the number hardpps() gets is not meaningful IMO.
+ */
+ tcount = pps->capcount - pps->ppscount[2];
+ pps->ppscount[2] = pps->capcount;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ bt.sec = 0;
+ bt.frac = 0;
+ bintime_addx(&bt, pps->capth->th_scale * tcount);
+ bintime2timespec(&bt, &ts);
+ hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
+ }
+#endif
+}
+
+/*
+ * Timecounters need to be updated every so often to prevent the hardware
+ * counter from overflowing. Updating also recalculates the cached values
+ * used by the get*() family of functions, so their precision depends on
+ * the update frequency.
+ */
+
+static int tc_tick;
+SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "");
+
+static void
+tc_ticktock(void *dummy)
+{
+
+ tc_windup();
+ timeout(tc_ticktock, NULL, tc_tick);
+}
+
+static void
+inittimecounter(void *dummy)
+{
+ u_int p;
+
+ /*
+ * Set the initial timeout to
+ * max(1, <approx. number of hardclock ticks in a millisecond>).
+ * People should probably not use the sysctl to set the timeout
+ * to smaller than its initial value, since that value is the
+ * smallest reasonable one. If they want better timestamps they
+ * should use the non-"get"* functions.
+ */
+ if (hz > 1000)
+ tc_tick = (hz + 500) / 1000;
+ else
+ tc_tick = 1;
+ p = (tc_tick * 1000000) / hz;
+ printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
+
+ /* warm up new timecounter (again) and get rolling. */
+ (void)timecounter->tc_get_timecount(timecounter);
+ (void)timecounter->tc_get_timecount(timecounter);
+ tc_ticktock(NULL);
+}
+
+SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_FIRST, inittimecounter, NULL)
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
new file mode 100644
index 0000000..645170e
--- /dev/null
+++ b/sys/kern/kern_time.c
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_time.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/timetc.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+struct timezone tz;
+
+/*
+ * Time of day and interval timer support.
+ *
+ * These routines provide the kernel entry points to get and set
+ * the time-of-day and per-process interval timers. Subroutines
+ * here provide support for adding and subtracting timeval structures
+ * and decrementing interval timers, optionally reloading the interval
+ * timers when they expire.
+ */
+
+static int nanosleep1(struct thread *td, struct timespec *rqt,
+ struct timespec *rmt);
+static int settime(struct thread *, struct timeval *);
+static void timevalfix(struct timeval *);
+static void no_lease_updatetime(int);
+
+static void
+no_lease_updatetime(deltat)
+ int deltat;
+{
+}
+
+void (*lease_updatetime)(int) = no_lease_updatetime;
+
+static int
+settime(td, tv)
+ struct thread *td;
+ struct timeval *tv;
+{
+ struct timeval delta, tv1, tv2;
+ static struct timeval maxtime, laststep;
+ struct timespec ts;
+ int s;
+
+ s = splclock();
+ microtime(&tv1);
+ delta = *tv;
+ timevalsub(&delta, &tv1);
+
+ /*
+ * If the system is secure, we do not allow the time to be
+ * set to a value earlier than 1 second less than the highest
+ * time we have yet seen. The worst a miscreant can do in
+ * this circumstance is "freeze" time; he cannot set the
+ * clock back into the past.
+ *
+ * We similarly do not allow the clock to be stepped more
+ * than one second, nor more than once per second. This allows
+ * a miscreant to make the clock march double-time, but no worse.
+ */
+ if (securelevel_gt(td->td_ucred, 1) != 0) {
+ if (delta.tv_sec < 0 || delta.tv_usec < 0) {
+ /*
+ * Update maxtime to latest time we've seen.
+ */
+ if (tv1.tv_sec > maxtime.tv_sec)
+ maxtime = tv1;
+ tv2 = *tv;
+ timevalsub(&tv2, &maxtime);
+ if (tv2.tv_sec < -1) {
+ tv->tv_sec = maxtime.tv_sec - 1;
+ printf("Time adjustment clamped to -1 second\n");
+ }
+ } else {
+ if (tv1.tv_sec == laststep.tv_sec) {
+ splx(s);
+ return (EPERM);
+ }
+ if (delta.tv_sec > 1) {
+ tv->tv_sec = tv1.tv_sec + 1;
+ printf("Time adjustment clamped to +1 second\n");
+ }
+ laststep = *tv;
+ }
+ }
+
+ ts.tv_sec = tv->tv_sec;
+ ts.tv_nsec = tv->tv_usec * 1000;
+ mtx_lock(&Giant);
+ tc_setclock(&ts);
+ (void) splsoftclock();
+ lease_updatetime(delta.tv_sec);
+ splx(s);
+ resettodr();
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_gettime_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+clock_gettime(td, uap)
+ struct thread *td;
+ struct clock_gettime_args *uap;
+{
+ struct timespec ats;
+
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ mtx_lock(&Giant);
+ nanotime(&ats);
+ mtx_unlock(&Giant);
+ return (copyout(&ats, SCARG(uap, tp), sizeof(ats)));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_settime_args {
+ clockid_t clock_id;
+ const struct timespec *tp;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+clock_settime(td, uap)
+ struct thread *td;
+ struct clock_settime_args *uap;
+{
+ struct timeval atv;
+ struct timespec ats;
+ int error;
+
+ if ((error = suser(td)) != 0)
+ return (error);
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
+ return (error);
+ if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000)
+ return (EINVAL);
+ /* XXX Don't convert nsec->usec and back */
+ TIMESPEC_TO_TIMEVAL(&atv, &ats);
+ error = settime(td, &atv);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getres_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+
+int
+clock_getres(td, uap)
+ struct thread *td;
+ struct clock_getres_args *uap;
+{
+ struct timespec ts;
+ int error;
+
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ error = 0;
+ if (SCARG(uap, tp)) {
+ ts.tv_sec = 0;
+ ts.tv_nsec = 1000000000 / tc_getfrequency();
+ error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
+ }
+ return (error);
+}
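+
+/*
+ * For example, a timecounter running at 3579545 Hz (the ACPI timer
+ * frequency) makes the division above report a resolution of
+ * 1000000000 / 3579545 = 279 nanoseconds; slower counters report
+ * correspondingly coarser values.
+ */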
+
+static int nanowait;
+
+static int
+nanosleep1(td, rqt, rmt)
+ struct thread *td;
+ struct timespec *rqt, *rmt;
+{
+ struct timespec ts, ts2, ts3;
+ struct timeval tv;
+ int error;
+
+ if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
+ return (0);
+ getnanouptime(&ts);
+ timespecadd(&ts, rqt);
+ TIMESPEC_TO_TIMEVAL(&tv, rqt);
+ for (;;) {
+ error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
+ tvtohz(&tv));
+ getnanouptime(&ts2);
+ if (error != EWOULDBLOCK) {
+ if (error == ERESTART)
+ error = EINTR;
+ if (rmt != NULL) {
+ timespecsub(&ts, &ts2);
+ if (ts.tv_sec < 0)
+ timespecclear(&ts);
+ *rmt = ts;
+ }
+ return (error);
+ }
+ if (timespeccmp(&ts2, &ts, >=))
+ return (0);
+ ts3 = ts;
+ timespecsub(&ts3, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+ }
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nanosleep_args {
+ struct timespec *rqtp;
+ struct timespec *rmtp;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+nanosleep(td, uap)
+ struct thread *td;
+ struct nanosleep_args *uap;
+{
+ struct timespec rmt, rqt;
+ int error;
+
+ error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt));
+ if (error)
+ return (error);
+
+ mtx_lock(&Giant);
+ if (SCARG(uap, rmtp)) {
+ if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt),
+ VM_PROT_WRITE)) {
+ error = EFAULT;
+ goto done2;
+ }
+ }
+ error = nanosleep1(td, &rqt, &rmt);
+ if (error && SCARG(uap, rmtp)) {
+ int error2;
+
+ error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
+ if (error2) /* XXX shouldn't happen, did useracc() above */
+ error = error2;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct gettimeofday_args {
+ struct timeval *tp;
+ struct timezone *tzp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+gettimeofday(td, uap)
+ struct thread *td;
+ register struct gettimeofday_args *uap;
+{
+ struct timeval atv;
+ int error = 0;
+
+ if (uap->tp) {
+ microtime(&atv);
+ error = copyout((caddr_t)&atv, (caddr_t)uap->tp, sizeof (atv));
+ }
+ if (error == 0 && uap->tzp != NULL) {
+ mtx_lock(&Giant);
+ error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
+ sizeof (tz));
+ mtx_unlock(&Giant);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct settimeofday_args {
+ struct timeval *tv;
+ struct timezone *tzp;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+settimeofday(td, uap)
+ struct thread *td;
+ struct settimeofday_args *uap;
+{
+ struct timeval atv;
+ struct timezone atz;
+ int error = 0;
+
+ if ((error = suser(td)))
+ return (error);
+ /* Verify all parameters before changing time. */
+ if (uap->tv) {
+ if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
+ sizeof(atv))))
+ return (error);
+ if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
+ return (EINVAL);
+ }
+ if (uap->tzp &&
+ (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
+ return (error);
+
+ if (uap->tv && (error = settime(td, &atv)))
+ return (error);
+ if (uap->tzp) {
+ mtx_lock(&Giant);
+ tz = atz;
+ mtx_unlock(&Giant);
+ }
+ return (error);
+}
+/*
+ * Get value of an interval timer. The process virtual and
+ * profiling virtual time timers are kept in the p_stats area, since
+ * they can be swapped out. These are kept internally in the
+ * way they are specified externally: in time until they expire.
+ *
+ * The real time interval timer is kept in the process table slot
+ * for the process, and its value (it_value) is kept as an
+ * absolute time rather than as a delta, so that it is easy to keep
+ * periodic real-time signals from drifting.
+ *
+ * Virtual time timers are processed in the hardclock() routine of
+ * kern_clock.c. The real time timer is processed by a timeout
+ * routine, called from the softclock() routine. Since a callout
+ * may be delayed in real time due to interrupt processing in the system,
+ * it is possible for the real time timeout routine (realitexpire, given below)
+ * to be delayed in real time past when it is supposed to occur. It
+ * does not suffice, therefore, to reload the real timer .it_value from the
+ * real time timer's .it_interval. Rather, we compute the next time in
+ * absolute time the timer should go off.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getitimer_args {
+ u_int which;
+ struct itimerval *itv;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getitimer(td, uap)
+ struct thread *td;
+ register struct getitimer_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct timeval ctv;
+ struct itimerval aitv;
+ int s;
+ int error;
+
+ if (uap->which > ITIMER_PROF)
+ return (EINVAL);
+
+ mtx_lock(&Giant);
+
+ s = splclock(); /* XXX still needed ? */
+ if (uap->which == ITIMER_REAL) {
+ /*
+ * Convert from absolute to relative time in .it_value
+ * part of real time timer. If time for real time timer
+ * has passed return 0, else return difference between
+ * current time and time for the timer to go off.
+ */
+ aitv = p->p_realtimer;
+ if (timevalisset(&aitv.it_value)) {
+ getmicrouptime(&ctv);
+ if (timevalcmp(&aitv.it_value, &ctv, <))
+ timevalclear(&aitv.it_value);
+ else
+ timevalsub(&aitv.it_value, &ctv);
+ }
+ } else {
+ aitv = p->p_stats->p_timer[uap->which];
+ }
+ splx(s);
+ error = copyout((caddr_t)&aitv, (caddr_t)uap->itv,
+ sizeof (struct itimerval));
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setitimer_args {
+ u_int which;
+ struct itimerval *itv, *oitv;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setitimer(td, uap)
+ struct thread *td;
+ register struct setitimer_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct itimerval aitv;
+ struct timeval ctv;
+ register struct itimerval *itvp;
+ int s, error = 0;
+
+ if (uap->which > ITIMER_PROF)
+ return (EINVAL);
+ itvp = uap->itv;
+ if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
+ sizeof(struct itimerval))))
+ return (error);
+
+ mtx_lock(&Giant);
+
+ if ((uap->itv = uap->oitv) &&
+ (error = getitimer(td, (struct getitimer_args *)uap))) {
+ goto done2;
+ }
+ if (itvp == 0) {
+ error = 0;
+ goto done2;
+ }
+ if (itimerfix(&aitv.it_value)) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (!timevalisset(&aitv.it_value)) {
+ timevalclear(&aitv.it_interval);
+ } else if (itimerfix(&aitv.it_interval)) {
+ error = EINVAL;
+ goto done2;
+ }
+ s = splclock(); /* XXX: still needed ? */
+ if (uap->which == ITIMER_REAL) {
+ if (timevalisset(&p->p_realtimer.it_value))
+ callout_stop(&p->p_itcallout);
+ if (timevalisset(&aitv.it_value))
+ callout_reset(&p->p_itcallout, tvtohz(&aitv.it_value),
+ realitexpire, p);
+ getmicrouptime(&ctv);
+ timevaladd(&aitv.it_value, &ctv);
+ p->p_realtimer = aitv;
+ } else {
+ p->p_stats->p_timer[uap->which] = aitv;
+ }
+ splx(s);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Real interval timer expired:
+ * send process whose timer expired an alarm signal.
+ * If time is not set up to reload, then just return.
+ * Else compute next time timer should go off which is > current time.
+ * This is where delay in processing this timeout causes multiple
+ * SIGALRM calls to be compressed into one.
+ * tvtohz() always adds 1 to allow for the time until the next clock
+ * interrupt being strictly less than 1 clock tick, but we don't want
+ * that here since we want to appear to be in sync with the clock
+ * interrupt even when we're delayed.
+ */
+void
+realitexpire(arg)
+ void *arg;
+{
+ register struct proc *p;
+ struct timeval ctv, ntv;
+ int s;
+
+ p = (struct proc *)arg;
+ PROC_LOCK(p);
+ psignal(p, SIGALRM);
+ if (!timevalisset(&p->p_realtimer.it_interval)) {
+ timevalclear(&p->p_realtimer.it_value);
+ PROC_UNLOCK(p);
+ return;
+ }
+ for (;;) {
+ s = splclock(); /* XXX: still needed ? */
+ timevaladd(&p->p_realtimer.it_value,
+ &p->p_realtimer.it_interval);
+ getmicrouptime(&ctv);
+ if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
+ ntv = p->p_realtimer.it_value;
+ timevalsub(&ntv, &ctv);
+ callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1,
+ realitexpire, p);
+ splx(s);
+ PROC_UNLOCK(p);
+ return;
+ }
+ splx(s);
+ }
+ /*NOTREACHED*/
+}
+
+/*
+ * Check that a proposed value to load into the .it_value or
+ * .it_interval part of an interval timer is acceptable, and
+ * fix it to have at least minimal value (i.e. if it is less
+ * than the resolution of the clock, round it up).
+ */
+int
+itimerfix(tv)
+ struct timeval *tv;
+{
+
+ if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
+ tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
+ tv->tv_usec = tick;
+ return (0);
+}
+
+/*
+ * Decrement an interval timer by a specified number
+ * of microseconds, which must be less than a second,
+ * i.e. < 1000000. If the timer expires, then reload
+ * it. In this case, carry over (usec - old value) to
+ * reduce the value reloaded into the timer so that
+ * the timer does not drift. This routine assumes
+ * that it is called in a context where the timers
+ * on which it is operating cannot change in value.
+ */
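+/*
+ * A worked example of the carry-over, assuming one call covers
+ * usec = 1000: with it_value = 400us and it_interval = 1000us the timer
+ * expires with 600us of the tick left over, it_value is reloaded from
+ * it_interval and the leftover is subtracted, leaving 400us again, so
+ * successive expirations keep the original 1000us phase instead of
+ * drifting by a partial tick.
+ */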
+int
+itimerdecr(itp, usec)
+ register struct itimerval *itp;
+ int usec;
+{
+
+ if (itp->it_value.tv_usec < usec) {
+ if (itp->it_value.tv_sec == 0) {
+ /* expired, and already in next interval */
+ usec -= itp->it_value.tv_usec;
+ goto expire;
+ }
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ itp->it_value.tv_usec -= usec;
+ usec = 0;
+ if (timevalisset(&itp->it_value))
+ return (1);
+ /* expired, exactly at end of interval */
+expire:
+ if (timevalisset(&itp->it_interval)) {
+ itp->it_value = itp->it_interval;
+ itp->it_value.tv_usec -= usec;
+ if (itp->it_value.tv_usec < 0) {
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ } else
+ itp->it_value.tv_usec = 0; /* sec is already 0 */
+ return (0);
+}
+
+/*
+ * Add and subtract routines for timevals.
+ * N.B.: subtract routine doesn't deal with
+ * results which are before the beginning,
+ * it just gets very confused in this case.
+ * Caveat emptor.
+ */
+void
+timevaladd(t1, t2)
+ struct timeval *t1, *t2;
+{
+
+ t1->tv_sec += t2->tv_sec;
+ t1->tv_usec += t2->tv_usec;
+ timevalfix(t1);
+}
+
+void
+timevalsub(t1, t2)
+ struct timeval *t1, *t2;
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+timevalfix(t1)
+ struct timeval *t1;
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
new file mode 100644
index 0000000..937b0c2
--- /dev/null
+++ b/sys/kern/kern_timeout.c
@@ -0,0 +1,414 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+
+/* Exported to machdep.c and/or kern_clock.c. */
+struct callout *callout;
+struct callout_list callfree;
+int callwheelsize, callwheelbits, callwheelmask;
+struct callout_tailq *callwheel;
+int softticks; /* Like ticks, but for softclock(). */
+struct mtx callout_lock;
+
+static struct callout *nextsoftcheck; /* Next callout to be checked. */
+
+/*
+ * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization
+ *
+ * This code is called very early in the kernel initialization sequence,
+ * and may be called more than once.
+ */
+caddr_t
+kern_timeout_callwheel_alloc(caddr_t v)
+{
+ /*
+ * Calculate callout wheel size
+ */
+ for (callwheelsize = 1, callwheelbits = 0;
+ callwheelsize < ncallout;
+ callwheelsize <<= 1, ++callwheelbits)
+ ;
+ callwheelmask = callwheelsize - 1;
+
+ callout = (struct callout *)v;
+ v = (caddr_t)(callout + ncallout);
+ callwheel = (struct callout_tailq *)v;
+ v = (caddr_t)(callwheel + callwheelsize);
+ return(v);
+}
+
+/*
+ * kern_timeout_callwheel_init() - initialize previously reserved callwheel
+ * space.
+ *
+ * This code is called just once, after the space reserved for the
+ * callout wheel has been finalized.
+ */
+void
+kern_timeout_callwheel_init(void)
+{
+ int i;
+
+ SLIST_INIT(&callfree);
+ for (i = 0; i < ncallout; i++) {
+ callout_init(&callout[i], 0);
+ callout[i].c_flags = CALLOUT_LOCAL_ALLOC;
+ SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle);
+ }
+ for (i = 0; i < callwheelsize; i++) {
+ TAILQ_INIT(&callwheel[i]);
+ }
+ mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+}
+
+/*
+ * The callout mechanism is based on the work of Adam M. Costello and
+ * George Varghese, published in a technical report entitled "Redesigning
+ * the BSD Callout and Timer Facilities" and modified slightly for inclusion
+ * in FreeBSD by Justin T. Gibbs. The original work on the data structures
+ * used in this implementation was published by G. Varghese and A. Lauck in
+ * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
+ * the Efficient Implementation of a Timer Facility" in the Proceedings of
+ * the 11th ACM Annual Symposium on Operating Systems Principles,
+ * Austin, Texas Nov 1987.
+ */
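+
+/*
+ * For illustration: with ncallout = 256 the sizing loop in
+ * kern_timeout_callwheel_alloc() above yields callwheelsize = 256,
+ * callwheelbits = 8 and callwheelmask = 255, and a callout due at tick T
+ * is filed under callwheel[T & callwheelmask]; softclock() then only has
+ * to scan one bucket per tick and skip entries whose c_time belongs to a
+ * later lap around the wheel.
+ */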
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+void
+softclock(void *dummy)
+{
+ register struct callout *c;
+ register struct callout_tailq *bucket;
+ register int curticks;
+ register int steps; /* #steps since we last allowed interrupts */
+
+#ifndef MAX_SOFTCLOCK_STEPS
+#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */
+#endif /* MAX_SOFTCLOCK_STEPS */
+
+ steps = 0;
+ mtx_lock_spin(&callout_lock);
+ while (softticks != ticks) {
+ softticks++;
+ /*
+ * softticks may be modified by hard clock, so cache
+ * it while we work on a given bucket.
+ */
+ curticks = softticks;
+ bucket = &callwheel[curticks & callwheelmask];
+ c = TAILQ_FIRST(bucket);
+ while (c) {
+ if (c->c_time != curticks) {
+ c = TAILQ_NEXT(c, c_links.tqe);
+ ++steps;
+ if (steps >= MAX_SOFTCLOCK_STEPS) {
+ nextsoftcheck = c;
+ /* Give interrupts a chance. */
+ mtx_unlock_spin(&callout_lock);
+ ; /* nothing */
+ mtx_lock_spin(&callout_lock);
+ c = nextsoftcheck;
+ steps = 0;
+ }
+ } else {
+ void (*c_func)(void *);
+ void *c_arg;
+ int c_flags;
+
+ nextsoftcheck = TAILQ_NEXT(c, c_links.tqe);
+ TAILQ_REMOVE(bucket, c, c_links.tqe);
+ c_func = c->c_func;
+ c_arg = c->c_arg;
+ c_flags = c->c_flags;
+ c->c_func = NULL;
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ SLIST_INSERT_HEAD(&callfree, c,
+ c_links.sle);
+ } else {
+ c->c_flags =
+ (c->c_flags & ~CALLOUT_PENDING);
+ }
+ mtx_unlock_spin(&callout_lock);
+ if (!(c_flags & CALLOUT_MPSAFE))
+ mtx_lock(&Giant);
+ c_func(c_arg);
+ if (!(c_flags & CALLOUT_MPSAFE))
+ mtx_unlock(&Giant);
+ mtx_lock_spin(&callout_lock);
+ steps = 0;
+ c = nextsoftcheck;
+ }
+ }
+ }
+ nextsoftcheck = NULL;
+ mtx_unlock_spin(&callout_lock);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * callout_handle_init --
+ * Initialize a handle so that using it with untimeout is benign.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that although an
+ * identification value is returned from timeout, the original
+ * arguments to timeout as well as the identifier are used to
+ * identify entries for untimeout.
+ */
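+/*
+ * Minimal usage sketch of the handle-based interface, with a hypothetical
+ * callback foo_tick() and softc pointer sc: initialize the handle once
+ * with callout_handle_init(&ch), schedule with ch = timeout(foo_tick, sc,
+ * hz) to fire roughly one second later, and cancel a still-pending
+ * timeout with untimeout(foo_tick, sc, ch).
+ */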
+struct callout_handle
+timeout(ftn, arg, to_ticks)
+ timeout_t *ftn;
+ void *arg;
+ int to_ticks;
+{
+ struct callout *new;
+ struct callout_handle handle;
+
+ mtx_lock_spin(&callout_lock);
+
+ /* Fill in the next free callout structure. */
+ new = SLIST_FIRST(&callfree);
+ if (new == NULL)
+ /* XXX Attempt to malloc first */
+ panic("timeout table full");
+ SLIST_REMOVE_HEAD(&callfree, c_links.sle);
+
+ callout_reset(new, to_ticks, ftn, arg);
+
+ handle.callout = new;
+ mtx_unlock_spin(&callout_lock);
+ return (handle);
+}
+
+void
+untimeout(ftn, arg, handle)
+ timeout_t *ftn;
+ void *arg;
+ struct callout_handle handle;
+{
+
+ /*
+ * Check for a handle that was initialized
+ * by callout_handle_init, but never used
+ * for a real timeout.
+ */
+ if (handle.callout == NULL)
+ return;
+
+ mtx_lock_spin(&callout_lock);
+ if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
+ callout_stop(handle.callout);
+ mtx_unlock_spin(&callout_lock);
+}
+
+void
+callout_handle_init(struct callout_handle *handle)
+{
+ handle->callout = NULL;
+}
+
+/*
+ * New interface; clients allocate their own callout structures.
+ *
+ * callout_reset() - establish or change a timeout
+ * callout_stop() - disestablish a timeout
+ * callout_init() - initialize a callout structure so that it can
+ * safely be passed to callout_reset() and callout_stop()
+ *
+ * <sys/callout.h> defines three convenience macros:
+ *
+ * callout_active() - returns truth if callout has not been serviced
+ * callout_pending() - returns truth if callout is still waiting for timeout
+ * callout_deactivate() - marks the callout as having been serviced
+ */
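+/*
+ * A minimal sketch of the client-allocated style, again with a
+ * hypothetical foo_tick()/sc pair: embed a struct callout in the softc,
+ * call callout_init(&sc->ch, 0) once, callout_reset(&sc->ch, hz,
+ * foo_tick, sc) each time the timer is (re)armed, and callout_stop(&sc->ch)
+ * to cancel; passing a non-zero mpsafe argument to callout_init() tells
+ * softclock() not to wrap the handler in Giant.
+ */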
+void
+callout_reset(c, to_ticks, ftn, arg)
+ struct callout *c;
+ int to_ticks;
+ void (*ftn)(void *);
+ void *arg;
+{
+
+ mtx_lock_spin(&callout_lock);
+ if (c->c_flags & CALLOUT_PENDING)
+ callout_stop(c);
+
+ /*
+ * We could unlock callout_lock here and lock it again before the
+ * TAILQ_INSERT_TAIL, but there's no point since doing this setup
+ * doesn't take much time.
+ */
+ if (to_ticks <= 0)
+ to_ticks = 1;
+
+ c->c_arg = arg;
+ c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+ c->c_func = ftn;
+ c->c_time = ticks + to_ticks;
+ TAILQ_INSERT_TAIL(&callwheel[c->c_time & callwheelmask],
+ c, c_links.tqe);
+ mtx_unlock_spin(&callout_lock);
+}
+
+int
+callout_stop(c)
+ struct callout *c;
+{
+
+ mtx_lock_spin(&callout_lock);
+ /*
+ * Don't attempt to delete a callout that's not on the queue.
+ */
+ if (!(c->c_flags & CALLOUT_PENDING)) {
+ c->c_flags &= ~CALLOUT_ACTIVE;
+ mtx_unlock_spin(&callout_lock);
+ return (0);
+ }
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+ if (nextsoftcheck == c) {
+ nextsoftcheck = TAILQ_NEXT(c, c_links.tqe);
+ }
+ TAILQ_REMOVE(&callwheel[c->c_time & callwheelmask], c, c_links.tqe);
+ c->c_func = NULL;
+
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
+ SLIST_INSERT_HEAD(&callfree, c, c_links.sle);
+ }
+ mtx_unlock_spin(&callout_lock);
+ return (1);
+}
+
+void
+callout_init(c, mpsafe)
+ struct callout *c;
+ int mpsafe;
+{
+ bzero(c, sizeof *c);
+ if (mpsafe)
+ c->c_flags |= CALLOUT_MPSAFE;
+}
+
+#ifdef APM_FIXUP_CALLTODO
+/*
+ * Adjust the kernel calltodo timeout list. This routine is used after
+ * an APM resume to recalculate the calltodo timer list values with the
+ * number of hz's we have been sleeping. The next hardclock() will detect
+ * that there are fired timers and run softclock() to execute them.
+ *
+ * Please note, I have not done an exhaustive analysis of what code this
+ * might break. I am motivated to have my select()'s and alarm()'s that
+ * have expired during suspend firing upon resume so that the applications
+ * which set the timer can do the maintenance the timer was for as close
+ * as possible to the originally intended time. Testing this code for a
+ * week showed that resuming from a suspend resulted in 22 to 25 timers
+ * firing, which seemed independent of whether the suspend was 2 hours or
+ * 2 days. Your mileage may vary. - Ken Key <key@cs.utk.edu>
+ */
+void
+adjust_timeout_calltodo(time_change)
+ struct timeval *time_change;
+{
+ register struct callout *p;
+ unsigned long delta_ticks;
+
+ /*
+ * How many ticks were we asleep?
+ * (stolen from tvtohz()).
+ */
+
+ /* Don't do anything */
+ if (time_change->tv_sec < 0)
+ return;
+ else if (time_change->tv_sec <= LONG_MAX / 1000000)
+ delta_ticks = (time_change->tv_sec * 1000000 +
+ time_change->tv_usec + (tick - 1)) / tick + 1;
+ else if (time_change->tv_sec <= LONG_MAX / hz)
+ delta_ticks = time_change->tv_sec * hz +
+ (time_change->tv_usec + (tick - 1)) / tick + 1;
+ else
+ delta_ticks = LONG_MAX;
+
+ if (delta_ticks > INT_MAX)
+ delta_ticks = INT_MAX;
+
+ /*
+ * Now rip through the timer calltodo list looking for timers
+ * to expire.
+ */
+
+ /* don't collide with softclock() */
+ mtx_lock_spin(&callout_lock);
+ for (p = calltodo.c_next; p != NULL; p = p->c_next) {
+ p->c_time -= delta_ticks;
+
+ /* Break if the timer had more time on it than delta_ticks */
+ if (p->c_time > 0)
+ break;
+
+ /* take back the ticks the timer didn't use (p->c_time <= 0) */
+ delta_ticks = -p->c_time;
+ }
+ mtx_unlock_spin(&callout_lock);
+
+ return;
+}
+#endif /* APM_FIXUP_CALLTODO */
diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c
new file mode 100644
index 0000000..ba5faa5
--- /dev/null
+++ b/sys/kern/kern_uuid.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2002 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/socket.h>
+#include <sys/sysproto.h>
+#include <sys/uuid.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+
+/*
+ * See also:
+ * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
+ * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
+ *
+ * Note that the generator state is itself a UUID, but the time and clock
+ * sequence fields are written in the native byte order.
+ */
+
+CTASSERT(sizeof(struct uuid) == 16);
+
+/* We use an alternative, more convenient representation in the generator. */
+struct uuid_private {
+ union {
+ uint64_t ll; /* internal. */
+ struct {
+ uint32_t low;
+ uint16_t mid;
+ uint16_t hi;
+ } x;
+ } time;
+ uint16_t seq; /* Big-endian. */
+ uint16_t node[UUID_NODE_LEN>>1];
+};
+
+CTASSERT(sizeof(struct uuid_private) == 16);
+
+static struct uuid_private uuid_last;
+
+static struct mtx uuid_mutex;
+MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
+
+/*
+ * Return the first MAC address we encounter or, if none was found,
+ * construct a sufficiently random multicast address. We don't try
+ * to return the same MAC address as previously returned. We always
+ * generate a new multicast address if no MAC address exists in the
+ * system.
+ * It would be nice to know if 'ifnet' or any of its sub-structures
+ * has been changed in any way. If not, we could simply skip the
+ * scan and safely return the MAC address we returned before.
+ */
+static void
+uuid_node(uint16_t *node)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct sockaddr_dl *sdl;
+ int i;
+
+ /* XXX: lock ifnet. */
+ TAILQ_FOREACH(ifp, &ifnet, if_link) {
+ /* Walk the address list */
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ sdl = (struct sockaddr_dl*)ifa->ifa_addr;
+ if (sdl != NULL && sdl->sdl_family == AF_LINK &&
+ sdl->sdl_type == IFT_ETHER) {
+ /* Got a MAC address. */
+ bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
+ /* XXX: unlock ifnet. */
+ return;
+ }
+ }
+ }
+ /* XXX: unlock ifnet. */
+
+ for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+ node[i] = (uint16_t)arc4random();
+ *((uint8_t*)node) |= 0x80;
+}
+
+/*
+ * Get the current time as a 60 bit count of 100-nanosecond intervals
+ * since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
+ * the Unix time since 00:00:00.00, January 1, 1970 to the date of the
+ * Gregorian reform to the Christian calendar.
+ */
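+/*
+ * As a sanity check of the offset used below: 0x01B21DD213814000 is
+ * 122192928000000000 intervals of 100 nanoseconds, i.e. 12219292800
+ * seconds, or exactly 141427 days, the distance from October 15, 1582
+ * to January 1, 1970.
+ */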
+static uint64_t
+uuid_time(void)
+{
+ struct bintime bt;
+ uint64_t time = 0x01B21DD213814000LL;
+
+ bintime(&bt);
+ time += (uint64_t)bt.sec * 10000000LL;
+ time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (time & ((1LL << 60) - 1LL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct uuidgen_args {
+ struct uuid *store;
+ int count;
+};
+#endif
+
+int uuidgen(struct thread *td, struct uuidgen_args *uap)
+{
+ struct uuid_private uuid;
+ uint64_t time;
+ int error;
+
+ /*
+ * Limit the number of UUIDs that can be created at the same time
+ * to some arbitrary number. This isn't really necessary, but I
+ * like to have some sort of upper-bound that's less than 2G :-)
+ * XXX needs to be tunable.
+ */
+ if (uap->count < 1 || uap->count > 2048)
+ return (EINVAL);
+
+ /* XXX: pre-validate accessibility to the whole of the UUID store? */
+
+ mtx_lock(&uuid_mutex);
+
+ uuid_node(uuid.node);
+ time = uuid_time();
+
+ if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
+ uuid_last.node[1] != uuid.node[1] ||
+ uuid_last.node[2] != uuid.node[2])
+ uuid.seq = (uint16_t)arc4random() & 0x3fff;
+ else if (uuid_last.time.ll >= time)
+ uuid.seq = (uuid_last.seq + 1) & 0x3fff;
+ else
+ uuid.seq = uuid_last.seq;
+
+ uuid_last = uuid;
+ uuid_last.time.ll = (time + uap->count - 1) & ((1LL << 60) - 1LL);
+
+ mtx_unlock(&uuid_mutex);
+
+ /* Set sequence and variant and deal with byte order. */
+ uuid.seq = htobe16(uuid.seq | 0x8000);
+
+ /* XXX: this should copyout larger chunks at a time. */
+ do {
+ /* Set time and version (=1) and deal with byte order. */
+ uuid.time.x.low = (uint32_t)time;
+ uuid.time.x.mid = (uint16_t)(time >> 32);
+ uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
+ error = copyout(&uuid, uap->store, sizeof(uuid));
+ uap->store++;
+ uap->count--;
+ time++;
+ } while (uap->count > 0 && !error);
+
+ return (error);
+}
+
+int
+snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
+{
+ struct uuid_private *id;
+ int cnt;
+
+ id = (struct uuid_private *)uuid;
+ cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
+ id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
+ be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
+ return (cnt);
+}
+
+int
+printf_uuid(struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (printf("%s", buf));
+}
+
+int
+sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (sbuf_printf(sb, "%s", buf));
+}
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
new file mode 100644
index 0000000..9d4136b
--- /dev/null
+++ b/sys/kern/kern_xxx.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/utsname.h>
+
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+
+#ifndef _SYS_SYSPROTO_H_
+struct gethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ogethostname(td, uap)
+ struct thread *td;
+ struct gethostname_args *uap;
+{
+ int name[2];
+ int error;
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ mtx_lock(&Giant);
+ error = userland_sysctl(td, name, 2, uap->hostname, &len, 1, 0, 0, 0);
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osethostname(td, uap)
+ struct thread *td;
+ register struct sethostname_args *uap;
+{
+ int name[2];
+ int error;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ mtx_lock(&Giant);
+ if ((error = suser_cred(td->td_ucred, PRISON_ROOT)) == 0) {
+ error = userland_sysctl(td, name, 2, 0, 0, 0,
+ uap->hostname, uap->len, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogethostid_args {
+ int dummy;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ogethostid(td, uap)
+ struct thread *td;
+ struct ogethostid_args *uap;
+{
+
+ *(long *)(td->td_retval) = hostid;
+ return (0);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifdef COMPAT_43
+#ifndef _SYS_SYSPROTO_H_
+struct osethostid_args {
+ long hostid;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+osethostid(td, uap)
+ struct thread *td;
+ struct osethostid_args *uap;
+{
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = suser(td)))
+ hostid = uap->hostid;
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+oquota(td, uap)
+ struct thread *td;
+ struct oquota_args *uap;
+{
+ return (ENOSYS);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * This is the FreeBSD-1.1 compatible uname(2) interface. These
+ * days it is done in libc as a wrapper around a bunch of sysctl's.
+ * This must maintain the old 1.1 binary ABI.
+ */
+#if SYS_NMLN != 32
+#error "FreeBSD-1.1 uname syscall has been broken"
+#endif
+#ifndef _SYS_SYSPROTO_H_
+struct uname_args {
+ struct utsname *name;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+uname(td, uap)
+ struct thread *td;
+ struct uname_args *uap;
+{
+ int name[2], error;
+ size_t len;
+ char *s, *us;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_OSTYPE;
+ len = sizeof (uap->name->sysname);
+ mtx_lock(&Giant);
+ error = userland_sysctl(td, name, 2, uap->name->sysname, &len,
+ 1, 0, 0, 0);
+ if (error)
+ goto done2;
+ subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
+
+ name[1] = KERN_HOSTNAME;
+ len = sizeof uap->name->nodename;
+ error = userland_sysctl(td, name, 2, uap->name->nodename, &len,
+ 1, 0, 0, 0);
+ if (error)
+ goto done2;
+ subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
+
+ name[1] = KERN_OSRELEASE;
+ len = sizeof uap->name->release;
+ error = userland_sysctl(td, name, 2, uap->name->release, &len,
+ 1, 0, 0, 0);
+ if (error)
+ goto done2;
+ subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
+
+/*
+ name = KERN_VERSION;
+ len = sizeof uap->name->version;
+ error = userland_sysctl(td, name, 2, uap->name->version, &len,
+ 1, 0, 0, 0);
+ if (error)
+ goto done2;
+ subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
+*/
+
+/*
+ * this stupid hackery to make the version field look like FreeBSD 1.1
+ */
+ for(s = version; *s && *s != '#'; s++);
+
+ for(us = uap->name->version; *s && *s != ':'; s++) {
+ error = subyte( us++, *s);
+ if (error)
+ goto done2;
+ }
+ error = subyte( us++, 0);
+ if (error)
+ goto done2;
+
+ name[0] = CTL_HW;
+ name[1] = HW_MACHINE;
+ len = sizeof uap->name->machine;
+ error = userland_sysctl(td, name, 2, uap->name->machine, &len,
+ 1, 0, 0, 0);
+ if (error)
+ goto done2;
+ subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getdomainname(td, uap)
+ struct thread *td;
+ struct getdomainname_args *uap;
+{
+ int domainnamelen;
+ int error;
+
+ mtx_lock(&Giant);
+ domainnamelen = strlen(domainname) + 1;
+ if ((u_int)uap->len > domainnamelen + 1)
+ uap->len = domainnamelen + 1;
+ error = copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setdomainname(td, uap)
+ struct thread *td;
+ struct setdomainname_args *uap;
+{
+ int error, domainnamelen;
+
+ mtx_lock(&Giant);
+ if ((error = suser(td)))
+ goto done2;
+ if ((u_int)uap->len > sizeof (domainname) - 1) {
+ error = EINVAL;
+ goto done2;
+ }
+ domainnamelen = uap->len;
+ error = copyin((caddr_t)uap->domainname, domainname, uap->len);
+ domainname[domainnamelen] = 0;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c
new file mode 100644
index 0000000..c9081c3
--- /dev/null
+++ b/sys/kern/ksched.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 1996, 1997
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+
+#include <posix4/posix4.h>
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+ struct timespec rr_interval;
+};
+
+int ksched_attach(struct ksched **p)
+{
+ struct ksched *ksched = p31b_malloc(sizeof(*ksched));
+
+ ksched->rr_interval.tv_sec = 0;
+ ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval();
+
+ *p = ksched;
+ return 0;
+}
+
+int ksched_detach(struct ksched *ks)
+{
+ p31b_free(ks);
+
+ return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ * POSIX 1003.1b requires that numerically higher priorities be of
+ * higher priority. It also permits sched_setparam to be
+ * implementation defined for SCHED_OTHER. I don't like
+ * the notion of inverted priorities for normal processes when
+ * you can use "setpriority" for that.
+ *
+ * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL.
+ */
+
+/* Macros to convert between the Unix convention (numerically lower is
+ * higher priority) and POSIX 1003.1b (numerically higher is higher priority).
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
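+
+/* For example, assuming the usual <sys/rtprio.h> values RTP_PRIO_MIN == 0
+ * and RTP_PRIO_MAX == 31, this gives P1B_PRIO_MIN == 0 and
+ * P1B_PRIO_MAX == 31, and a POSIX priority P maps to rtprio 31 - P, so the
+ * strongest POSIX priority (31) maps to the strongest rtprio priority (0).
+ */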
+
+static __inline int
+getscheduler(register_t *ret, struct ksched *ksched, struct thread *td)
+{
+ struct rtprio rtp;
+ int e = 0;
+
+ mtx_lock_spin(&sched_lock);
+ pri_to_rtp(td->td_ksegrp, &rtp);
+ mtx_unlock_spin(&sched_lock);
+ switch (rtp.type)
+ {
+ case RTP_PRIO_FIFO:
+ *ret = SCHED_FIFO;
+ break;
+
+ case RTP_PRIO_REALTIME:
+ *ret = SCHED_RR;
+ break;
+
+ default:
+ *ret = SCHED_OTHER;
+ break;
+ }
+
+ return e;
+}
+
+int ksched_setparam(register_t *ret, struct ksched *ksched,
+ struct thread *td, const struct sched_param *param)
+{
+ register_t policy;
+ int e;
+
+ e = getscheduler(&policy, ksched, td);
+
+ if (e == 0)
+ {
+ if (policy == SCHED_OTHER)
+ e = EINVAL;
+ else
+ e = ksched_setscheduler(ret, ksched, td, policy, param);
+ }
+
+ return e;
+}
+
+int ksched_getparam(register_t *ret, struct ksched *ksched,
+ struct thread *td, struct sched_param *param)
+{
+ struct rtprio rtp;
+
+ mtx_lock_spin(&sched_lock);
+ pri_to_rtp(td->td_ksegrp, &rtp);
+ mtx_unlock_spin(&sched_lock);
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+ param->sched_priority = rtpprio_to_p4prio(rtp.prio);
+
+ return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ * be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int ksched_setscheduler(register_t *ret, struct ksched *ksched,
+ struct thread *td, int policy, const struct sched_param *param)
+{
+ int e = 0;
+ struct rtprio rtp;
+ struct ksegrp *kg = td->td_ksegrp;
+
+ switch(policy)
+ {
+ case SCHED_RR:
+ case SCHED_FIFO:
+
+ if (param->sched_priority >= P1B_PRIO_MIN &&
+ param->sched_priority <= P1B_PRIO_MAX)
+ {
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp.type = (policy == SCHED_FIFO)
+ ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+ mtx_lock_spin(&sched_lock);
+ rtp_to_pri(&rtp, kg);
+ td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+ }
+ else
+ e = EPERM;
+
+ break;
+
+ case SCHED_OTHER:
+ {
+ rtp.type = RTP_PRIO_NORMAL;
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ mtx_lock_spin(&sched_lock);
+ rtp_to_pri(&rtp, kg);
+
+ /* XXX Simply revert to whatever we had for last
+ * normal scheduler priorities.
+ * This puts a requirement
+ * on the scheduling code: You must leave the
+ * scheduling info alone.
+ */
+ td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+ }
+ break;
+ }
+
+ return e;
+}
+
+int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct thread *td)
+{
+ return getscheduler(ret, ksched, td);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int ksched_yield(register_t *ret, struct ksched *ksched)
+{
+ mtx_lock_spin(&sched_lock);
+ curthread->td_kse->ke_flags |= KEF_NEEDRESCHED;
+ mtx_unlock_spin(&sched_lock);
+ return 0;
+}
+
+int ksched_get_priority_max(register_t *ret, struct ksched *ksched, int policy)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *ret = RTP_PRIO_MAX;
+ break;
+
+ case SCHED_OTHER:
+ *ret = PRIO_MAX;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *ret = P1B_PRIO_MIN;
+ break;
+
+ case SCHED_OTHER:
+ *ret = PRIO_MIN;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int ksched_rr_get_interval(register_t *ret, struct ksched *ksched,
+ struct thread *td, struct timespec *timespec)
+{
+ *timespec = ksched->rr_interval;
+
+ return 0;
+}
diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c
new file mode 100644
index 0000000..5a863bd
--- /dev/null
+++ b/sys/kern/link_aout.c
@@ -0,0 +1,590 @@
+/*-
+ * Copyright (c) 1997-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifdef __i386__
+
+#define FREEBSD_AOUT 1
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+
+#include "linker_if.h"
+
+#ifndef __ELF__
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/vmparam.h>
+#endif
+
+#include <a.out.h>
+#include <link.h>
+
+typedef struct aout_file {
+ struct linker_file lf; /* Common fields */
+ int preloaded; /* Was this pre-loaded */
+ char* address; /* Load address */
+ struct _dynamic* dynamic; /* Symbol table etc. */
+} *aout_file_t;
+
+static int link_aout_link_preload(linker_class_t lc,
+ const char* modname, linker_file_t*);
+static int link_aout_link_preload_finish(linker_file_t);
+
+static int link_aout_load_file(linker_class_t lc, const char*, linker_file_t*);
+static int link_aout_lookup_symbol(linker_file_t, const char*,
+ c_linker_sym_t*);
+static int link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym,
+ linker_symval_t* symval);
+static int link_aout_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t* sym, long* diffp);
+static void link_aout_unload_file(linker_file_t);
+static void link_aout_unload_preload(linker_file_t);
+static int link_aout_lookup_set(linker_file_t, const char*,
+ void ***, void ***, int*);
+
+static kobj_method_t link_aout_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_aout_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_aout_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_aout_search_symbol),
+ KOBJMETHOD(linker_unload, link_aout_unload_file),
+ KOBJMETHOD(linker_load_file, link_aout_load_file),
+ KOBJMETHOD(linker_link_preload, link_aout_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_aout_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_aout_lookup_set),
+ { 0, 0 }
+};
+
+static struct linker_class link_aout_class = {
+ "a.out", link_aout_methods, sizeof(struct aout_file)
+};
+
+static int relocate_file(aout_file_t af);
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic __DYNAMIC;
+
+static void
+link_aout_init(void* arg)
+{
+#ifndef __ELF__
+ struct _dynamic* dp = &__DYNAMIC;
+#endif
+
+ linker_add_class(&link_aout_class);
+
+#ifndef __ELF__
+ if (dp) {
+ aout_file_t af;
+
+ linker_kernel_file =
+ linker_make_file(kernelname, &link_aout_class);
+ if (linker_kernel_file == NULL)
+ panic("link_aout_init: Can't create linker structures for kernel");
+ af = (aout_file_t) linker_kernel_file;
+ af->address = 0;
+ af->dynamic = dp;
+ linker_kernel_file->address = (caddr_t) KERNBASE;
+ linker_kernel_file->size = -(long)linker_kernel_file->address;
+ }
+#endif
+}
+
+SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0);
+
+static int
+link_aout_link_preload(linker_class_t lc,
+ const char* filename, linker_file_t* result)
+{
+ caddr_t modptr, baseptr;
+ char *type;
+ struct exec *ehdr;
+ aout_file_t af;
+ linker_file_t lf;
+
+ /* Look to see if we have the module preloaded. */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return ENOENT;
+
+ if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) ||
+ strcmp(type, "a.out module") ||
+ ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) ||
+ ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL))
+ return(0); /* we can't handle this */
+
+ /* Register with kld */
+ lf = linker_make_file(filename, &link_aout_class);
+ if (lf == NULL) {
+ return(ENOMEM);
+ }
+ af = (aout_file_t) lf;
+
+ /* Looks like we can handle this one */
+ filename = preload_search_info(modptr, MODINFO_NAME);
+ af->preloaded = 1;
+ af->address = baseptr;
+
+ /* Assume _DYNAMIC is the first data item. */
+ af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text);
+ if (af->dynamic->d_version != LD_VERSION_BSD) {
+ linker_file_unload(lf);
+ return(0); /* we can't handle this */
+ }
+ af->dynamic->d_un.d_sdt = (struct section_dispatch_table *)
+ ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address);
+
+ lf->address = af->address;
+ lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss;
+ *result = lf;
+ return(0);
+}
+
+static int
+link_aout_link_preload_finish(linker_file_t lf)
+{
+ aout_file_t af;
+ int error;
+
+ af = (aout_file_t) lf;
+ error = relocate_file(af);
+ if (error) {
+ linker_file_unload(lf);
+ return(error);
+ }
+ return(0);
+}
+
+static int
+link_aout_load_file(linker_class_t lc, const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ int error = 0;
+ int resid, flags;
+ struct exec header;
+ aout_file_t af;
+ linker_file_t lf = 0;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ return error;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /*
+ * Read the a.out header from the file.
+ */
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+
+ if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC))
+ goto out;
+
+ /*
+ * We have an a.out file, so make some space to read it in.
+ */
+ lf = linker_make_file(filename, &link_aout_class);
+ if (lf == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ af = (aout_file_t) lf;
+ af->address = malloc(header.a_text + header.a_data + header.a_bss,
+ M_LINKER, M_WAITOK);
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address,
+ header.a_text + header.a_data, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+ bzero(af->address + header.a_text + header.a_data, header.a_bss);
+
+ /*
+ * Assume _DYNAMIC is the first data item.
+ */
+ af->dynamic = (struct _dynamic*) (af->address + header.a_text);
+ if (af->dynamic->d_version != LD_VERSION_BSD) {
+ error = ENOEXEC;
+ goto out;
+ }
+ af->dynamic->d_un.d_sdt = (struct section_dispatch_table *)
+ ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address);
+
+ lf->address = af->address;
+ lf->size = header.a_text + header.a_data + header.a_bss;
+
+ error = linker_load_dependencies(lf);
+ if (error)
+ goto out;
+ error = relocate_file(af);
+ if (error)
+ goto out;
+
+ *result = lf;
+
+out:
+ if (error && lf)
+ linker_file_unload(lf);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+
+ return error;
+}
+
+static void
+link_aout_unload_file(linker_file_t file)
+{
+ aout_file_t af = (aout_file_t) file;
+
+ if (af->preloaded) {
+ link_aout_unload_preload(file);
+ return;
+ }
+
+ if (af->address)
+ free(af->address, M_LINKER);
+}
+
+static void
+link_aout_unload_preload(linker_file_t file)
+{
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
+/*
+ * XXX i386 dependent.
+ */
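+/*
+ * r_length selects the width of the access: 0 for a byte, 1 for a short
+ * and 2 for an int, as handled in read_relocation() and write_relocation()
+ * below.
+ */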
+static long
+read_relocation(struct relocation_info* r, char* addr)
+{
+ int length = r->r_length;
+
+ if (length == 0)
+ return *(u_char*) addr;
+ else if (length == 1)
+ return *(u_short*) addr;
+ else if (length == 2)
+ return *(u_int*) addr;
+ else
+ printf("link_aout: unsupported relocation size %d\n", r->r_length);
+ return 0;
+}
+
+static void
+write_relocation(struct relocation_info* r, char* addr, long value)
+{
+ int length = r->r_length;
+
+ if (length == 0)
+ *(u_char*) addr = value;
+ else if (length == 1)
+ *(u_short*) addr = value;
+ else if (length == 2)
+ *(u_int*) addr = value;
+ else
+ printf("link_aout: unsupported relocation size %d\n", r->r_length);
+}
+
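+/*
+ * Convert an offset recorded in the object's dynamic info into a typed
+ * pointer within the loaded image.
+ */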
+#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off))
+
+static int
+relocate_file(aout_file_t af)
+{
+ struct relocation_info* rel;
+ struct relocation_info* erel;
+ struct relocation_info* r;
+ struct nzlist* symbolbase;
+ char* stringbase;
+ struct nzlist* np;
+ char* sym;
+ long relocation;
+
+ rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic));
+ erel = AOUT_RELOC(af, struct relocation_info,
+ LD_REL(af->dynamic) + LD_RELSZ(af->dynamic));
+ symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+ for (r = rel; r < erel; r++) {
+ char* addr;
+
+ if (r->r_address == 0)
+ break;
+
+ addr = AOUT_RELOC(af, char, r->r_address);
+ if (r->r_extern) {
+ np = &symbolbase[r->r_symbolnum];
+ sym = &stringbase[np->nz_strx];
+
+ if (sym[0] != '_') {
+ printf("link_aout: bad symbol name %s\n", sym);
+ relocation = 0;
+ } else
+ relocation = (intptr_t)
+ linker_file_lookup_symbol(&af->lf, sym + 1,
+ np->nz_type != (N_SETV+N_EXT));
+ if (!relocation) {
+ printf("link_aout: symbol %s not found\n", sym);
+ return ENOENT;
+ }
+
+ relocation += read_relocation(r, addr);
+
+ if (r->r_jmptable) {
+ printf("link_aout: can't cope with jump table relocations\n");
+ continue;
+ }
+
+ if (r->r_pcrel)
+ relocation -= (intptr_t) af->address;
+
+ if (r->r_copy) {
+ printf("link_aout: can't cope with copy relocations\n");
+ continue;
+ }
+
+ write_relocation(r, addr, relocation);
+ } else {
+ write_relocation(r, addr,
+ (intptr_t)(read_relocation(r, addr) + af->address));
+ }
+
+ }
+
+ return 0;
+}
+
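+/*
+ * Hash a symbol name for indexing the object's RRS hash table; the leading
+ * '_' is faked here so callers can pass plain C names.
+ */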
+static long
+symbol_hash_value(aout_file_t af, const char* name)
+{
+ long hashval;
+ const char* p;
+
+ hashval = '_'; /* fake a starting '_' for C symbols */
+ for (p = name; *p; p++)
+ hashval = (hashval << 1) + *p;
+
+ return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic);
+}
+
+int
+link_aout_lookup_symbol(linker_file_t file, const char* name,
+ c_linker_sym_t* sym)
+{
+ aout_file_t af = (aout_file_t) file;
+ long hashval;
+ struct rrs_hash* hashbase;
+ struct nzlist* symbolbase;
+ char* stringbase;
+ struct rrs_hash* hp;
+ struct nzlist* np;
+ char* cp;
+
+ if (LD_BUCKETS(af->dynamic) == 0)
+ return 0;
+
+ hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic));
+ symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+restart:
+ hashval = symbol_hash_value(af, name);
+ hp = &hashbase[hashval];
+ if (hp->rh_symbolnum == -1)
+ return ENOENT;
+
+ while (hp) {
+ np = (struct nzlist *) &symbolbase[hp->rh_symbolnum];
+ cp = stringbase + np->nz_strx;
+ /*
+ * Note: we fake the leading '_' for C symbols.
+ */
+ if (cp[0] == '_' && !strcmp(cp + 1, name))
+ break;
+
+ if (hp->rh_next == 0)
+ hp = NULL;
+ else
+ hp = &hashbase[hp->rh_next];
+ }
+
+ if (hp == NULL)
+ /*
+ * Not found.
+ */
+ return ENOENT;
+
+ /*
+ * Check for an aliased symbol, whatever that is.
+ */
+ if (np->nz_type == N_INDR+N_EXT) {
+ name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */
+ goto restart;
+ }
+
+ /*
+ * Check this is an actual definition of the symbol.
+ */
+ if (np->nz_value == 0)
+ return ENOENT;
+
+ if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) {
+ if (np->nz_other == AUX_FUNC)
+ /* weak function */
+ return ENOENT;
+ }
+
+ *sym = (linker_sym_t) np;
+
+ return 0;
+}
+
+
+static int
+link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym,
+ linker_symval_t* symval)
+{
+ aout_file_t af = (aout_file_t) file;
+ const struct nzlist* np = (const struct nzlist*) sym;
+ char* stringbase;
+ long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist);
+ struct nzlist *symbase;
+
+ /* Is it one of ours? It could be another module... */
+ symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ if (np < symbase)
+ return ENOENT;
+ if ((np - symbase) > numsym)
+ return ENOENT;
+
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+ symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */
+ if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) {
+ symval->value = 0;
+ symval->size = np->nz_value;
+ } else {
+ symval->value = AOUT_RELOC(af, char, np->nz_value);
+ symval->size = np->nz_size;
+ }
+ return 0;
+}
+
+static int
+link_aout_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t* sym, long* diffp)
+{
+ aout_file_t af = (aout_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long sp_nz_value;
+ struct nzlist* sp;
+ struct nzlist* ep;
+ struct nzlist* best = 0;
+
+ for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)),
+ ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic));
+ sp < ep; sp++) {
+ if (sp->nz_name == 0)
+ continue;
+ sp_nz_value = sp->nz_value + (uintptr_t) (void *) af->address;
+ if (off >= sp_nz_value) {
+ if (off - sp_nz_value < diff) {
+ diff = off - sp_nz_value;
+ best = sp;
+ if (diff == 0)
+ break;
+ } else if (off - sp_nz_value == diff) {
+ best = sp;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (linker_sym_t) best;
+
+ return 0;
+}
+
+/*
+ * Look up a linker set on an a.out + gnu LD system.
+ */
+struct generic_linker_set {
+ int ls_length;
+ void *ls_items[1];
+};
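+/*
+ * The set symbol resolved below points at one of these: a count followed by
+ * the item pointers themselves.
+ */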
+static int
+link_aout_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ void **start, **stop;
+ int error, count;
+ struct generic_linker_set *setp;
+
+ error = link_aout_lookup_symbol(lf, name, &sym);
+ if (error)
+ return error;
+ link_aout_symbol_values(lf, sym, &symval);
+ if (symval.value == 0)
+ return ESRCH;
+ setp = (struct generic_linker_set *)symval.value;
+ count = setp->ls_length;
+ start = &setp->ls_items[0];
+ stop = &setp->ls_items[count];
+ if (startp)
+ *startp = start;
+ if (stopp)
+ *stopp = stop;
+ if (countp)
+ *countp = count;
+ return 0;
+}
+
+#endif /* __i386__ */
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
new file mode 100644
index 0000000..dd59405
--- /dev/null
+++ b/sys/kern/link_elf.c
@@ -0,0 +1,1239 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+#ifdef GPROF
+#include <machine/profile.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#ifdef SPARSE_MAPPING
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#endif
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#ifdef __AOUT__
+#include <nlist.h>
+#endif
+#include <link.h>
+
+#include "linker_if.h"
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+ int preloaded; /* Was file pre-loaded */
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ Elf_Dyn* dynamic; /* Symbol table etc. */
+ Elf_Hashelt nbuckets; /* DT_HASH info */
+ Elf_Hashelt nchains;
+ const Elf_Hashelt* buckets;
+ const Elf_Hashelt* chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym* symtab; /* DT_SYMTAB */
+ Elf_Addr* got; /* DT_PLTGOT */
+ const Elf_Rel* pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela* pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel* rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela* rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym* ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+ caddr_t symbase; /* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+#ifdef DDB
+ struct link_map gdb; /* hooks for gdb */
+#endif
+} *elf_file_t;
+
+static int link_elf_link_preload(linker_class_t cls,
+ const char*, linker_file_t*);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char*, linker_file_t*);
+static int link_elf_lookup_symbol(linker_file_t, const char*,
+ c_linker_sym_t*);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ c_linker_sym_t* sym, long* diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_preload(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *),
+ void *);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32",
+#else
+ "elf64",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int parse_dynamic(elf_file_t ef);
+static int relocate_file(elf_file_t ef);
+static int link_elf_preload_parse_symbols(elf_file_t ef);
+
+#ifdef DDB
+static void r_debug_state(struct r_debug *dummy_one,
+ struct link_map *dummy_two);
+
+/*
+ * A list of loaded modules for GDB to use for loading symbols.
+ */
+struct r_debug r_debug;
+
+#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL);
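+
+/*
+ * GDB_STATE() records the new state in r_debug and then calls
+ * r_debug_state(), the empty function below on which the debugger is
+ * expected to have set a breakpoint.
+ */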
+
+/*
+ * Function for the debugger to set a breakpoint on to gain control.
+ */
+void
+r_debug_state(struct r_debug *dummy_one __unused,
+ struct link_map *dummy_two __unused)
+{
+}
+
+#endif
+
+#ifdef __ia64__
+Elf_Addr link_elf_get_gp(linker_file_t);
+#endif
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_init(void* arg)
+{
+#ifdef __ELF__
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+#ifdef DDB
+ char *newfilename;
+#endif
+#endif
+
+ linker_add_class(&link_elf_class);
+
+#ifdef __ELF__
+ dp = (Elf_Dyn*) &_DYNAMIC;
+ modname = NULL;
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, &link_elf_class);
+ if (linker_kernel_file == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+
+ ef = (elf_file_t) linker_kernel_file;
+ ef->preloaded = 1;
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+
+ if (dp)
+ parse_dynamic(ef);
+ linker_kernel_file->address = (caddr_t) KERNBASE;
+ linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
+
+ if (modptr) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef DDB
+ ef->gdb.l_addr = linker_kernel_file->address;
+ newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, modname);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = dp;
+ ef->gdb.l_prev = 0;
+ ef->gdb.l_next = 0;
+
+ r_debug.r_map = &ef->gdb;
+ r_debug.r_brk = r_debug_state;
+ r_debug.r_state = RT_CONSISTENT;
+
+ r_debug_state(NULL, NULL); /* say hello to gdb! */
+#endif
+
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
+static int
+link_elf_preload_parse_symbols(elf_file_t ef)
+{
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym* symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return 0;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return 0;
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return 0;
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return 0;
+}
+
+static int
+parse_dynamic(elf_file_t ef)
+{
+ Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
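+ /* Layout: nbuckets, nchains, then the bucket array and the chain array. */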
+ const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return ENOEXEC;
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return ENOEXEC;
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return ENOEXEC;
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return ENOEXEC;
+ break;
+#ifdef DDB
+ case DT_DEBUG:
+ dp->d_un.d_ptr = (Elf_Addr) &r_debug;
+ break;
+#endif
+ }
+ }
+
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *) ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return 0;
+}
+
+static void
+link_elf_error(const char *s)
+{
+ printf("kldload: %s\n", s);
+}
+
+#ifdef DDB
+
+static void
+link_elf_add_gdb(struct link_map *l)
+{
+ struct link_map *prev;
+
+ /*
+ * Scan to the end of the list.
+ */
+ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next)
+ ;
+
+ /* Link in the new entry. */
+ l->l_prev = prev;
+ l->l_next = prev->l_next;
+ prev->l_next = l;
+}
+
+static void
+link_elf_delete_gdb(struct link_map *l)
+{
+ if (l->l_prev == NULL) {
+ if ((r_debug.r_map = l->l_next) != NULL)
+ l->l_next->l_prev = NULL;
+ return;
+ }
+
+ if ((l->l_prev->l_next = l->l_next) != NULL)
+ l->l_next->l_prev = l->l_prev;
+}
+
+#endif /* DDB */
+
+static int
+link_elf_link_preload(linker_class_t cls,
+ const char* filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return ENOENT;
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC);
+ if (type == NULL || strcmp(type, "elf module") != 0)
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL) {
+ return ENOMEM;
+ }
+
+ ef = (elf_file_t) lf;
+ ef->preloaded = 1;
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(ef);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+#ifdef DDB
+ char *newfilename;
+#endif
+
+ ef = (elf_file_t) lf;
+#if 0 /* this will be more trouble than it's worth for now */
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag != DT_NEEDED)
+ continue;
+ modname = ef->strtab + dp->d_un.d_val;
+ error = linker_load_module(modname, lf);
+ if (error)
+ goto out;
+ }
+#endif
+ error = relocate_file(ef);
+ if (error)
+ return error;
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef DDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, lf->filename);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+ return (0);
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct thread* td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[2];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ int resid, flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+#ifdef DDB
+ char *newfilename;
+#endif
+
+ GIANT_REQUIRED;
+
+ shdr = NULL;
+ lf = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ return error;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ if (firstpage == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ nbytes = PAGE_SIZE - resid;
+ if (error)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error("Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error("Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error("Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page. This is
+ * not strictly required by the ABI specification, but it seems to
+ * always be true in practice. And it simplifies things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error("Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+
+ case PT_LOAD:
+ if (nsegs == 2) {
+ link_elf_error("Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+
+ case PT_INTERP:
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error("Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake out our
+ * contiguous region, and to establish the base address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (!lf) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ ef = (elf_file_t) lf;
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ vm_object_reference(ef->object);
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address,
+ mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+ if (!ef->address) {
+ error = ENOMEM;
+ goto out;
+ }
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < 2; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error) {
+ goto out;
+ }
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ vm_map_pageable(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ FALSE);
+#endif
+ }
+
+#ifdef GPROF
+ /* Update profiling information with the new text segment. */
+ kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
+ segs[0]->p_memsz));
+#endif
+
+ ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(ef);
+ if (error)
+ goto out;
+ error = linker_load_dependencies(lf);
+ if (error)
+ goto out;
+#if 0 /* this will be more trouble than it's worth for now */
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag != DT_NEEDED)
+ continue;
+ modname = ef->strtab + dp->d_un.d_val;
+ error = linker_load_module(modname, lf);
+ if (error)
+ goto out;
+ }
+#endif
+ error = relocate_file(ef);
+ if (error)
+ goto out;
+
+ /* Try to load the symbol table if it's present. (You can strip it!) */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
+ if (shdr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ if (ef->symbase == NULL || ef->strbase == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+#ifdef DDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, filename);
+ ef->gdb.l_name = (const char *)newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+nosyms:
+
+ *result = lf;
+
+out:
+ if (error && lf)
+ linker_file_unload(lf);
+ if (shdr)
+ free(shdr, M_LINKER);
+ if (firstpage)
+ free(firstpage, M_LINKER);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+
+#ifdef DDB
+ if (ef->gdb.l_ld) {
+ GDB_STATE(RT_DELETE);
+ free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
+ link_elf_delete_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+ }
+#endif
+
+ if (ef->preloaded) {
+ link_elf_unload_preload(file);
+ return;
+ }
+#ifdef SPARSE_MAPPING
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+ }
+#else
+ if (ef->address)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase)
+ free(ef->strbase, M_LINKER);
+}
+
+static void
+link_elf_unload_preload(linker_file_t file)
+{
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Word r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return ef->strtab + ref->st_name;
+ } else
+ return NULL;
+}
+
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel) {
+ rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela) {
+ relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel) {
+ rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela) {
+ relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
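+ * For illustration, elf_hash("") is 0 and elf_hash("a") is 'a', i.e. 97.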
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
+int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+ }
+
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym* es = (const Elf_Sym*) sym;
+
+ if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+ if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t* sym, long* diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value + (uintptr_t) (void *) ef->address;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return 0;
+}
+
+/*
+ * Look up a linker set on an ELF system.
+ */
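+/*
+ * The linker brackets each set `name' with the symbols __start_set_<name>
+ * and __stop_set_<name>; the set contents are the pointers lying between
+ * them, as the code below assumes.
+ */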
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ char *setsym;
+ void **start, **stop;
+ int len, error = 0, count;
+
+ len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
+ setsym = malloc(len, M_LINKER, M_WAITOK);
+ if (setsym == NULL)
+ return ENOMEM;
+
+ /* get address of first entry */
+ snprintf(setsym, len, "%s%s", "__start_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ start = (void **)symval.value;
+
+ /* get address of last entry */
+ snprintf(setsym, len, "%s%s", "__stop_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ stop = (void **)symval.value;
+
+ /* and the number of entries */
+ count = stop - start;
+
+ /* and copy out */
+ if (startp)
+ *startp = start;
+ if (stopp)
+ *stopp = stop;
+ if (countp)
+ *countp = count;
+
+out:
+ free(setsym, M_LINKER);
+ return error;
+}
+
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque) {
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+#ifdef __ia64__
+/*
+ * Each KLD has its own GP. The GP value for each load module is given by
+ * DT_PLTGOT on ia64. We need GP to construct function descriptors, but
+ * don't have direct access to the ELF file structure. The link_elf_get_gp()
+ * function returns the GP given a pointer to a generic linker file struct.
+ */
+Elf_Addr
+link_elf_get_gp(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ return (Elf_Addr)ef->got;
+}
+#endif
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known (i.e.,
+ * in relocations). It uses the symbol index directly instead of doing a fully
+ * fledged hash table based lookup when that is valid, for example for local
+ * symbols. This is not only more efficient, it is also more correct: it is not
+ * always the case that the symbol can be found through the hash table.
+ */
+Elf_Addr
+elf_lookup(linker_file_t lf, Elf_Word symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->nchains)
+ return (0);
+
+ sym = ef->symtab + symidx;
+
+ /*
+ * Don't do a full lookup when the symbol is local. It may even
+ * fail because it may not be found through the hash table.
+ */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
+ /* Force lookup failure when we have an insanity. */
+ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0)
+ return (0);
+ return ((Elf_Addr)ef->address + sym->st_value);
+ }
+
+ /*
+ * XXX we can avoid doing a hash table based lookup for global
+ * symbols as well. This however is not always valid, so we'll
+ * just do it the hard way for now. Performance tweaks can
+ * always be added.
+ */
+
+ symbol = ef->strtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+
+ return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
new file mode 100644
index 0000000..dd59405
--- /dev/null
+++ b/sys/kern/link_elf_obj.c
@@ -0,0 +1,1239 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+#ifdef GPROF
+#include <machine/profile.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#ifdef SPARSE_MAPPING
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#endif
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#ifdef __AOUT__
+#include <nlist.h>
+#endif
+#include <link.h>
+
+#include "linker_if.h"
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+ int preloaded; /* Was file pre-loaded */
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ Elf_Dyn* dynamic; /* Symbol table etc. */
+ Elf_Hashelt nbuckets; /* DT_HASH info */
+ Elf_Hashelt nchains;
+ const Elf_Hashelt* buckets;
+ const Elf_Hashelt* chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym* symtab; /* DT_SYMTAB */
+ Elf_Addr* got; /* DT_PLTGOT */
+ const Elf_Rel* pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela* pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel* rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela* rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym* ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+ caddr_t symbase; /* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+#ifdef DDB
+ struct link_map gdb; /* hooks for gdb */
+#endif
+} *elf_file_t;
+
+static int link_elf_link_preload(linker_class_t cls,
+ const char*, linker_file_t*);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char*, linker_file_t*);
+static int link_elf_lookup_symbol(linker_file_t, const char*,
+ c_linker_sym_t*);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ c_linker_sym_t* sym, long* diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_preload(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *),
+ void *);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32",
+#else
+ "elf64",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int parse_dynamic(elf_file_t ef);
+static int relocate_file(elf_file_t ef);
+static int link_elf_preload_parse_symbols(elf_file_t ef);
+
+#ifdef DDB
+static void r_debug_state(struct r_debug *dummy_one,
+ struct link_map *dummy_two);
+
+/*
+ * A list of loaded modules for GDB to use for loading symbols.
+ */
+struct r_debug r_debug;
+
+#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL);
+
+/*
+ * Function for the debugger to set a breakpoint on to gain control.
+ */
+void
+r_debug_state(struct r_debug *dummy_one __unused,
+ struct link_map *dummy_two __unused)
+{
+}
+
+#endif
+
+#ifdef __ia64__
+Elf_Addr link_elf_get_gp(linker_file_t);
+#endif
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_init(void* arg)
+{
+#ifdef __ELF__
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+#ifdef DDB
+ char *newfilename;
+#endif
+#endif
+
+ linker_add_class(&link_elf_class);
+
+#ifdef __ELF__
+ dp = (Elf_Dyn*) &_DYNAMIC;
+ modname = NULL;
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, &link_elf_class);
+ if (linker_kernel_file == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+
+ ef = (elf_file_t) linker_kernel_file;
+ ef->preloaded = 1;
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+
+ if (dp)
+ parse_dynamic(ef);
+ linker_kernel_file->address = (caddr_t) KERNBASE;
+ linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
+
+ if (modptr) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef DDB
+ ef->gdb.l_addr = linker_kernel_file->address;
+ newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, modname);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = dp;
+ ef->gdb.l_prev = 0;
+ ef->gdb.l_next = 0;
+
+ r_debug.r_map = &ef->gdb;
+ r_debug.r_brk = r_debug_state;
+ r_debug.r_state = RT_CONSISTENT;
+
+ r_debug_state(NULL, NULL); /* say hello to gdb! */
+#endif
+
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
+static int
+link_elf_preload_parse_symbols(elf_file_t ef)
+{
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym* symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return 0;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return 0;
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return 0;
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return 0;
+}
+
+static int
+parse_dynamic(elf_file_t ef)
+{
+ Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
+ const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return ENOEXEC;
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return ENOEXEC;
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return ENOEXEC;
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return ENOEXEC;
+ break;
+#ifdef DDB
+ case DT_DEBUG:
+ dp->d_un.d_ptr = (Elf_Addr) &r_debug;
+ break;
+#endif
+ }
+ }
+
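+	/*
+	 * DT_PLTREL tells us whether the DT_JMPREL table holds Elf_Rel
+	 * or Elf_Rela entries; if the latter, move the pointer and size
+	 * over to the rela-flavoured members.
+	 */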
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *) ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return 0;
+}
+
+static void
+link_elf_error(const char *s)
+{
+ printf("kldload: %s\n", s);
+}
+
+#ifdef DDB
+
+static void
+link_elf_add_gdb(struct link_map *l)
+{
+ struct link_map *prev;
+
+ /*
+ * Scan to the end of the list.
+ */
+ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next)
+ ;
+
+ /* Link in the new entry. */
+ l->l_prev = prev;
+ l->l_next = prev->l_next;
+ prev->l_next = l;
+}
+
+static void
+link_elf_delete_gdb(struct link_map *l)
+{
+ if (l->l_prev == NULL) {
+ if ((r_debug.r_map = l->l_next) != NULL)
+ l->l_next->l_prev = NULL;
+ return;
+ }
+
+ if ((l->l_prev->l_next = l->l_next) != NULL)
+ l->l_next->l_prev = l->l_prev;
+}
+
+#endif /* DDB */
+
+static int
+link_elf_link_preload(linker_class_t cls,
+ const char* filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return ENOENT;
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC);
+ if (type == NULL || strcmp(type, "elf module") != 0)
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL) {
+ return ENOMEM;
+ }
+
+ ef = (elf_file_t) lf;
+ ef->preloaded = 1;
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(ef);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+#ifdef DDB
+ char *newfilename;
+#endif
+
+ ef = (elf_file_t) lf;
+#if 0 /* this will be more trouble than it's worth for now */
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag != DT_NEEDED)
+ continue;
+ modname = ef->strtab + dp->d_un.d_val;
+ error = linker_load_module(modname, lf);
+ if (error)
+ goto out;
+ }
+#endif
+ error = relocate_file(ef);
+ if (error)
+ return error;
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef DDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, lf->filename);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+ return (0);
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct thread* td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[2];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ int resid, flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+#ifdef DDB
+ char *newfilename;
+#endif
+
+ GIANT_REQUIRED;
+
+ shdr = NULL;
+ lf = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ return error;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ if (firstpage == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ nbytes = PAGE_SIZE - resid;
+ if (error)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error("Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error("Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error("Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page. This is
+ * not strictly required by the ABI specification, but it seems to
+ * always be true in practice, and it simplifies things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error("Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+
+ case PT_LOAD:
+ if (nsegs == 2) {
+ link_elf_error("Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+
+ case PT_INTERP:
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error("Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake out our
+ * contiguous region, and to establish the base address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (!lf) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ ef = (elf_file_t) lf;
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ vm_object_reference(ef->object);
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address,
+ mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+ if (!ef->address) {
+ error = ENOMEM;
+ goto out;
+ }
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < 2; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error) {
+ goto out;
+ }
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ vm_map_pageable(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ FALSE);
+#endif
+ }
+
+#ifdef GPROF
+ /* Update profiling information with the new text segment. */
+ kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
+ segs[0]->p_memsz));
+#endif
+
+ ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(ef);
+ if (error)
+ goto out;
+ error = linker_load_dependencies(lf);
+ if (error)
+ goto out;
+#if 0 /* this will be more trouble than it's worth for now */
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag != DT_NEEDED)
+ continue;
+ modname = ef->strtab + dp->d_un.d_val;
+ error = linker_load_module(modname, lf);
+ if (error)
+ goto out;
+ }
+#endif
+ error = relocate_file(ef);
+ if (error)
+ goto out;
+
+	/* Try to load the symbol table if it's present.  (It can be stripped!) */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
+ if (shdr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ if (ef->symbase == NULL || ef->strbase == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td);
+ if (error)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+#ifdef DDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, filename);
+ ef->gdb.l_name = (const char *)newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+nosyms:
+
+ *result = lf;
+
+out:
+ if (error && lf)
+ linker_file_unload(lf);
+ if (shdr)
+ free(shdr, M_LINKER);
+ if (firstpage)
+ free(firstpage, M_LINKER);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+
+#ifdef DDB
+ if (ef->gdb.l_ld) {
+ GDB_STATE(RT_DELETE);
+ free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
+ link_elf_delete_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+ }
+#endif
+
+ if (ef->preloaded) {
+ link_elf_unload_preload(file);
+ return;
+ }
+#ifdef SPARSE_MAPPING
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+ }
+#else
+ if (ef->address)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase)
+ free(ef->strbase, M_LINKER);
+}
+
+static void
+link_elf_unload_preload(linker_file_t file)
+{
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
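+/*
+ * Return the name of the symbol referenced by a relocation's r_info,
+ * or NULL when the relocation does not reference a symbol.
+ */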
+static const char *
+symbol_name(elf_file_t ef, Elf_Word r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return ef->strtab + ref->st_name;
+ } else
+ return NULL;
+}
+
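+/*
+ * Apply every relocation recorded in the dynamic section: the plain
+ * REL and RELA tables as well as the PLT (JMPREL) table in either
+ * flavour.
+ */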
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel) {
+ rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela) {
+ relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel) {
+ rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+	/* Perform PLT relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela) {
+ relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
+int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
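+	/* Walk the hash chain until the name matches or the chain ends. */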
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+ }
+
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym* es = (const Elf_Sym*) sym;
+
+ if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+ if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
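+/*
+ * Find the defined symbol whose value is closest to, but not greater
+ * than, the given address; *diffp receives the distance from it.
+ */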
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t* sym, long* diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value + (uintptr_t) (void *) ef->address;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return 0;
+}
+
+/*
+ * Look up a linker set on an ELF system.
+ */
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ char *setsym;
+ void **start, **stop;
+ int len, error = 0, count;
+
+ len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
+ setsym = malloc(len, M_LINKER, M_WAITOK);
+ if (setsym == NULL)
+ return ENOMEM;
+
+ /* get address of first entry */
+ snprintf(setsym, len, "%s%s", "__start_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ start = (void **)symval.value;
+
+ /* get address of last entry */
+ snprintf(setsym, len, "%s%s", "__stop_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ stop = (void **)symval.value;
+
+ /* and the number of entries */
+ count = stop - start;
+
+ /* and copy out */
+ if (startp)
+ *startp = start;
+ if (stopp)
+ *stopp = stop;
+ if (countp)
+ *countp = count;
+
+out:
+ free(setsym, M_LINKER);
+ return error;
+}
+
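+/*
+ * Walk the ddb symbol table and invoke the callback for every function
+ * (STT_FUNC) symbol, stopping early if the callback returns an error.
+ */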
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque) {
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+#ifdef __ia64__
+/*
+ * Each KLD has its own GP. The GP value for each load module is given by
+ * DT_PLTGOT on ia64. We need GP to construct function descriptors, but
+ * don't have direct access to the ELF file structure. The link_elf_get_gp()
+ * function returns the GP given a pointer to a generic linker file struct.
+ */
+Elf_Addr
+link_elf_get_gp(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ return (Elf_Addr)ef->got;
+}
+#endif
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known
+ * (i.e. in relocations). It uses the symbol index directly instead of doing
+ * a fully fledged hash table based lookup when that is valid, for example
+ * for local symbols. This is not only more efficient, it is also more
+ * correct: it is not always the case that the symbol can be found through
+ * the hash table.
+ */
+Elf_Addr
+elf_lookup(linker_file_t lf, Elf_Word symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->nchains)
+ return (0);
+
+ sym = ef->symtab + symidx;
+
+ /*
+ * Don't do a full lookup when the symbol is local. It may even
+ * fail because it may not be found through the hash table.
+ */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
+ /* Force lookup failure when we have an insanity. */
+ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0)
+ return (0);
+ return ((Elf_Addr)ef->address + sym->st_value);
+ }
+
+ /*
+ * XXX we can avoid doing a hash table based lookup for global
+ * symbols as well. This however is not always valid, so we'll
+ * just do it the hard way for now. Performance tweaks can
+ * always be added.
+ */
+
+ symbol = ef->strtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+
+ return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+}
diff --git a/sys/kern/linker_if.m b/sys/kern/linker_if.m
new file mode 100644
index 0000000..9dafb57
--- /dev/null
+++ b/sys/kern/linker_if.m
@@ -0,0 +1,107 @@
+#
+# Copyright (c) 2000 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/linker.h>
+
+INTERFACE linker;
+
+#
+# Lookup a symbol in the file's symbol table. If the symbol is not
+# found then return ENOENT, otherwise zero.
+#
+METHOD int lookup_symbol {
+ linker_file_t file;
+ const char* name;
+ c_linker_sym_t* symp;
+};
+
+METHOD int symbol_values {
+ linker_file_t file;
+ c_linker_sym_t sym;
+ linker_symval_t* valp;
+};
+
+METHOD int search_symbol {
+ linker_file_t file;
+ caddr_t value;
+ c_linker_sym_t* symp;
+ long* diffp;
+};
+
+#
+# Call the callback with each specified function defined in the file.
+# Stop and return the error if the callback returns an error.
+#
+METHOD int each_function_name {
+ linker_file_t file;
+ linker_function_name_callback_t callback;
+ void* opaque;
+};
+
+#
+# Search for a linker set in a file. Return a pointer to the first
+# entry (which is itself a pointer), and the number of entries.
+# "stop" points to the entry beyond the last valid entry.
+# If count, start or stop are NULL, they are not returned.
+#
+METHOD int lookup_set {
+ linker_file_t file;
+ const char* name;
+ void*** start;
+ void*** stop;
+ int* count;
+};
+
+#
+# Unload a file, releasing dependencies and freeing storage.
+#
+METHOD void unload {
+ linker_file_t file;
+};
+
+#
+# Load a file, returning the new linker_file_t in *result. If
+# the class does not recognise the file type, zero should be
+# returned, without modifying *result. If the file is
+# recognised, the file should be loaded, *result set to the new
+# file and zero returned. If some other error is detected an
+# appropriate errno should be returned.
+#
+STATICMETHOD int load_file {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+STATICMETHOD int link_preload {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+STATICMETHOD int link_preload_finish {
+ linker_file_t file;
+};
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
new file mode 100644
index 0000000..f4a0212
--- /dev/null
+++ b/sys/kern/makesyscalls.sh
@@ -0,0 +1,446 @@
+#! /bin/sh -
+# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
+# $FreeBSD$
+
+set -e
+
+# name of compat option:
+compat=COMPAT_43
+
+# output files:
+sysnames="syscalls.c"
+sysproto="../sys/sysproto.h"
+sysproto_h=_SYS_SYSPROTO_H_
+syshdr="../sys/syscall.h"
+sysmk="../sys/syscall.mk"
+syssw="init_sysent.c"
+syscallprefix="SYS_"
+switchname="sysent"
+namesname="syscallnames"
+
+# tmp files:
+sysdcl="sysent.dcl.$$"
+syscompat="sysent.compat.$$"
+syscompatdcl="sysent.compatdcl.$$"
+sysent="sysent.switch.$$"
+sysinc="sysinc.switch.$$"
+sysarg="sysarg.switch.$$"
+
+trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0
+
+touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg
+
+case $# in
+ 0) echo "usage: $0 input-file <config-file>" 1>&2
+ exit 1
+ ;;
+esac
+
+if [ -n "$2" -a -f "$2" ]; then
+ . $2
+fi
+
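+# The sed stage strips '$' characters, joins backslash-continued lines,
+# and pads {}()*, with spaces so that awk sees them as separate fields.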
+sed -e '
+s/\$//g
+:join
+ /\\$/{a\
+
+ N
+ s/\\\n//
+ b join
+ }
+2,${
+ /^#/!s/\([{}()*,]\)/ \1 /g
+}
+' < $1 | awk "
+ BEGIN {
+ sysdcl = \"$sysdcl\"
+ sysproto = \"$sysproto\"
+ sysproto_h = \"$sysproto_h\"
+ syscompat = \"$syscompat\"
+ syscompatdcl = \"$syscompatdcl\"
+ sysent = \"$sysent\"
+ syssw = \"$syssw\"
+ sysinc = \"$sysinc\"
+ sysarg = \"$sysarg\"
+ sysnames = \"$sysnames\"
+ syshdr = \"$syshdr\"
+ sysmk = \"$sysmk\"
+ compat = \"$compat\"
+ syscallprefix = \"$syscallprefix\"
+ switchname = \"$switchname\"
+ namesname = \"$namesname\"
+ infile = \"$1\"
+ "'
+
+ printf "/*\n * System call switch table.\n *\n" > syssw
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
+ printf " * $%s$\n", "FreeBSD" > syssw
+
+ printf "/*\n * System call prototypes.\n *\n" > sysarg
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
+ printf " * $%s$\n", "FreeBSD" > sysarg
+
+ printf "\n#ifdef %s\n\n", compat > syscompat
+
+ printf "/*\n * System call names.\n *\n" > sysnames
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+ printf " * $%s$\n", "FreeBSD" > sysnames
+
+ printf "/*\n * System call numbers.\n *\n" > syshdr
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
+ printf " * $%s$\n", "FreeBSD" > syshdr
+ printf "# FreeBSD system call names.\n" > sysmk
+ printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
+ printf "# $%s$\n", "FreeBSD" > sysmk
+ }
+ NR == 1 {
+ gsub("[$]FreeBSD: ", "", $0)
+ gsub(" [$]", "", $0)
+
+ printf " * created from%s\n */\n\n", $0 > syssw
+
+ printf "\n/* The casts are bogus but will do for now. */\n" > sysent
+ printf "struct sysent %s[] = {\n",switchname > sysent
+
+ printf " * created from%s\n */\n\n", $0 > sysarg
+ printf "#ifndef %s\n", sysproto_h > sysarg
+ printf "#define\t%s\n\n", sysproto_h > sysarg
+ printf "#include <sys/signal.h>\n\n" > sysarg
+ printf "#include <sys/acl.h>\n\n" > sysarg
+ printf "struct proc;\n\n" > sysarg
+ printf "struct thread;\n\n" > sysarg
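+	# The generated PAD_() macro gives the padding needed to widen an
+	# argument to register_t size; PADL_/PADR_ place that padding on
+	# the correct side for the target byte order.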
+ printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
+ printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
+ printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg
+ printf "#define\tPADL_(t)\t0\n" > sysarg
+ printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg
+ printf "#else\n" > sysarg
+ printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg
+ printf "#define\tPADR_(t)\t0\n" > sysarg
+ printf "#endif\n\n" > sysarg
+
+ printf " * created from%s\n */\n\n", $0 > sysnames
+ printf "char *%s[] = {\n", namesname > sysnames
+
+ printf " * created from%s\n */\n\n", $0 > syshdr
+
+ printf "# created from%s\nMIASM = ", $0 > sysmk
+
+ next
+ }
+ NF == 0 || $1 ~ /^;/ {
+ next
+ }
+ $1 ~ /^#[ ]*include/ {
+ print > sysinc
+ next
+ }
+ $1 ~ /^#[ ]*if/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ savesyscall = syscall
+ next
+ }
+ $1 ~ /^#[ ]*else/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ syscall = savesyscall
+ next
+ }
+ $1 ~ /^#/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ next
+ }
+ syscall != $1 {
+ printf "%s: line %d: syscall number out of sync at %d\n",
+ infile, NR, syscall
+ printf "line is:\n"
+ print
+ exit 1
+ }
+ function align_sysent_comment(column) {
+ printf("\t") > sysent
+ column = column + 8 - column % 8
+ while (column < 56) {
+ printf("\t") > sysent
+ column = column + 8
+ }
+ }
+ function parserr(was, wanted) {
+ printf "%s: line %d: unexpected %s (expected %s)\n",
+ infile, NR, was, wanted
+ exit 1
+ }
+ function parseline() {
+ f=4 # toss number and type
+ argc= 0;
+ argssize = "0"
+ if ($NF != "}") {
+ funcalias=$(NF-2)
+ argalias=$(NF-1)
+ rettype=$NF
+ end=NF-3
+ } else {
+ funcalias=""
+ argalias=""
+ rettype="int"
+ end=NF
+ }
+ if ($2 == "NODEF") {
+ funcname=$4
+ argssize = "AS(" $6 ")"
+ return
+ }
+ if ($f != "{")
+ parserr($f, "{")
+ f++
+ if ($end != "}")
+ parserr($end, "}")
+ end--
+ if ($end != ";")
+ parserr($end, ";")
+ end--
+ if ($end != ")")
+ parserr($end, ")")
+ end--
+
+ f++ #function return type
+
+ funcname=$f
+ if (funcalias == "")
+ funcalias = funcname
+ if (argalias == "") {
+ argalias = funcname "_args"
+ if ($2 == "COMPAT")
+ argalias = "o" argalias
+ }
+ f++
+
+ if ($f != "(")
+ parserr($f, ")")
+ f++
+
+ if (f == end) {
+ if ($f != "void")
+ parserr($f, "argument definition")
+ return
+ }
+
+ while (f <= end) {
+ argc++
+ argtype[argc]=""
+ oldf=""
+ while (f < end && $(f+1) != ",") {
+ if (argtype[argc] != "" && oldf != "*")
+ argtype[argc] = argtype[argc]" ";
+ argtype[argc] = argtype[argc]$f;
+ oldf = $f;
+ f++
+ }
+ if (argtype[argc] == "")
+ parserr($f, "argument definition")
+ argname[argc]=$f;
+ f += 2; # skip name, and any comma
+ }
+ if (argc != 0)
+ argssize = "AS(" argalias ")"
+ }
+ { comment = $4
+ if (NF < 7)
+ for (i = 5; i <= NF; i++)
+ comment = comment " " $i
+ }
+
+ # The 'M' type prefix
+ #
+ {
+ mpsafe = "SYF_MPSAFE | ";
+ if ($2 == "MSTD") {
+ $2 = "STD";
+ } else if ($2 == "MNODEF") {
+ $2 = "NODEF";
+ } else if ($2 == "MNOARGS") {
+ $2 = "NOARGS";
+ } else if ($2 == "MNOPROTO") {
+ $2 = "NOPROTO";
+ } else if ($2 == "MNOIMPL") {
+ $2 = "NOIMPL";
+ } else if ($2 == "MNOSTD") {
+ $2 = "NOSTD";
+ } else if ($2 == "MCOMPAT") {
+ $2 = "COMPAT";
+ } else if ($2 == "MCPT_NOA") {
+ $2 = "CPT_NOA";
+ } else if ($2 == "MLIBCOMPAT") {
+ $2 = "LIBCOMPAT";
+ } else if ($2 == "MOBSOL") {
+ $2 = "OBSOL";
+ } else if ($2 == "MUNIMPL") {
+ $2 = "UNIMPL";
+ } else {
+ mpsafe = "";
+ }
+ }
+ $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" \
+ || $2 == "NOIMPL" || $2 == "NOSTD" {
+ parseline()
+ if ((!nosys || funcname != "nosys") && \
+ (funcname != "lkmnosys") && (funcname != "lkmressys")) {
+ if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") {
+ printf("struct %s {\n", argalias) > sysarg
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; " \
+ "%s %s; char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > sysarg
+ printf("};\n") > sysarg
+ }
+ else if ($2 != "NOARGS" && $2 != "NOPROTO" && \
+ $2 != "NODEF")
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ }
+ if (($2 != "NOPROTO" && $2 != "NODEF" && \
+ (funcname != "nosys" || !nosys)) || \
+ (funcname == "lkmnosys" && !lkmnosys) || \
+ funcname == "lkmressys") {
+ printf("%s\t%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ printf(";\n") > sysdcl
+ }
+ if (funcname == "nosys")
+ nosys = 1
+ if (funcname == "lkmnosys")
+ lkmnosys = 1
+ printf("\t{ %s%s, (sy_call_t *)", mpsafe, argssize) > sysent
+ column = 8 + 2 + length(mpsafe) + length(argssize) + 15
+ if ($2 == "NOIMPL") {
+ printf("%s },", "nosys") > sysent
+ column = column + length("nosys") + 3
+ } else if ($2 == "NOSTD") {
+ printf("%s },", "lkmressys") > sysent
+ column = column + length("lkmressys") + 3
+ } else {
+ printf("%s },", funcname) > sysent
+ column = column + length(funcname) + 3
+ }
+ align_sysent_comment(column)
+ printf("/* %d = %s */\n", syscall, funcalias) > sysent
+ printf("\t\"%s\",\t\t\t/* %d = %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ if ($2 != "NODEF") {
+ printf("#define\t%s%s\t%d\n", syscallprefix,
+ funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ }
+ syscall++
+ next
+ }
+ $2 == "COMPAT" || $2 == "CPT_NOA" {
+ ncompat++
+ parseline()
+ if (argc != 0 && $2 != "CPT_NOA") {
+ printf("struct %s {\n", argalias) > syscompat
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \
+ "char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > syscompat
+ printf("};\n") > syscompat
+ }
+ else if($2 != "CPT_NOA")
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ printf("%s\to%s(struct thread *, struct %s *);\n",
+ rettype, funcname, argalias) > syscompatdcl
+ printf("\t{ compat(%s%s,%s) },",
+ mpsafe, argssize, funcname) > sysent
+ align_sysent_comment(8 + 9 + length(mpsafe) + \
+ length(argssize) + 1 + length(funcname) + 4)
+ printf("/* %d = old %s */\n", syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ printf("\t\t\t\t/* %d is old %s */\n",
+ syscall, funcalias) > syshdr
+ syscall++
+ next
+ }
+ $2 == "LIBCOMPAT" {
+ ncompat++
+ parseline()
+ printf("%s\to%s();\n", rettype, funcname) > syscompatdcl
+ printf("\t{ compat(%s%s,%s) },",
+ mpsafe, argssize, funcname) > sysent
+ align_sysent_comment(8 + 9 + length(mpsafe) + \
+ length(argssize) + 1 + length(funcname) + 4)
+ printf("/* %d = old %s */\n", syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n",
+ syscallprefix, funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ syscall++
+ next
+ }
+ $2 == "OBSOL" {
+ printf("\t{ 0, (sy_call_t *)nosys },") > sysent
+ align_sysent_comment(34)
+ printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
+ printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
+ $4, syscall, comment) > sysnames
+ printf("\t\t\t\t/* %d is obsolete %s */\n",
+ syscall, comment) > syshdr
+ syscall++
+ next
+ }
+ $2 == "UNIMPL" {
+ printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n",
+ syscall, comment) > sysent
+ printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
+ syscall, syscall, comment) > sysnames
+ syscall++
+ next
+ }
+ {
+ printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2
+ exit 1
+ }
+ END {
+ printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
+ if (ncompat != 0) {
+ printf "#include \"opt_compat.h\"\n\n" > syssw
+ printf "\n#ifdef %s\n", compat > sysinc
+ printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+ printf("#undef PAD_\n") > syscompatdcl
+ printf("#undef PADL_\n") > syscompatdcl
+ printf("#undef PADR_\n") > syscompatdcl
+ printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl
+
+ printf("\n") > sysmk
+ printf("};\n") > sysent
+ printf("};\n") > sysnames
+ printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
+ > syshdr
+ } '
+
+cat $sysinc $sysent >> $syssw
+cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto
diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c
new file mode 100644
index 0000000..e3a0bfa
--- /dev/null
+++ b/sys/kern/md4c.c
@@ -0,0 +1,285 @@
+/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm
+ * $FreeBSD$
+ */
+
+/* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
+
+ License to copy and use this software is granted provided that it
+ is identified as the "RSA Data Security, Inc. MD4 Message-Digest
+ Algorithm" in all material mentioning or referencing this software
+ or this function.
+
+ License is also granted to make and use derivative works provided
+ that such works are identified as "derived from the RSA Data
+ Security, Inc. MD4 Message-Digest Algorithm" in all material
+ mentioning or referencing the derived work.
+
+ RSA Data Security, Inc. makes no representations concerning either
+ the merchantability of this software or the suitability of this
+ software for any particular purpose. It is provided "as is"
+ without express or implied warranty of any kind.
+
+ These notices must be retained in any copies of any part of this
+ documentation and/or software.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/md4.h>
+
+typedef unsigned char *POINTER;
+typedef u_int16_t UINT2;
+typedef u_int32_t UINT4;
+
+#define PROTO_LIST(list) list
+
+/* Constants for MD4Transform routine.
+ */
+#define S11 3
+#define S12 7
+#define S13 11
+#define S14 19
+#define S21 3
+#define S22 5
+#define S23 9
+#define S24 13
+#define S31 3
+#define S32 9
+#define S33 11
+#define S34 15
+
+static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
+static void Encode PROTO_LIST
+ ((unsigned char *, UINT4 *, unsigned int));
+static void Decode PROTO_LIST
+ ((UINT4 *, const unsigned char *, unsigned int));
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G and H are basic MD4 functions.
+ */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/* ROTATE_LEFT rotates x left n bits.
+ */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG and HH are transformations for rounds 1, 2 and 3 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s) { \
+ (a) += F ((b), (c), (d)) + (x); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define GG(a, b, c, d, x, s) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define HH(a, b, c, d, x, s) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+
+/* MD4 initialization. Begins an MD4 operation, writing a new context.
+ */
+void MD4Init (context)
+MD4_CTX *context; /* context */
+{
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants.
+ */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/* MD4 block update operation. Continues an MD4 message-digest
+ operation, processing another message block, and updating the
+ context.
+ */
+void MD4Update (context, input, inputLen)
+MD4_CTX *context; /* context */
+const unsigned char *input; /* input block */
+unsigned int inputLen; /* length of input block */
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+ /* Update number of bits */
+ if ((context->count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((UINT4)inputLen >> 29);
+
+ partLen = 64 - index;
+ /* Transform as many times as possible.
+ */
+ if (inputLen >= partLen) {
+ bcopy(input, &context->buffer[index], partLen);
+ MD4Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD4Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ bcopy(&input[i], &context->buffer[index], inputLen-i);
+}
+
+/* MD4 padding. */
+void MD4Pad (context)
+MD4_CTX *context; /* context */
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64.
+ */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD4Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD4Update (context, bits, 8);
+}
+
+/* MD4 finalization. Ends an MD4 message-digest operation, writing the
+   message digest and zeroizing the context.
+ */
+void MD4Final (digest, context)
+unsigned char digest[16]; /* message digest */
+MD4_CTX *context; /* context */
+{
+ /* Do padding */
+ MD4Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)context, sizeof (*context));
+}
+
+/* MD4 basic transformation. Transforms state based on block.
+ */
+static void MD4Transform (state, block)
+UINT4 state[4];
+const unsigned char block[64];
+{
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11); /* 1 */
+ FF (d, a, b, c, x[ 1], S12); /* 2 */
+ FF (c, d, a, b, x[ 2], S13); /* 3 */
+ FF (b, c, d, a, x[ 3], S14); /* 4 */
+ FF (a, b, c, d, x[ 4], S11); /* 5 */
+ FF (d, a, b, c, x[ 5], S12); /* 6 */
+ FF (c, d, a, b, x[ 6], S13); /* 7 */
+ FF (b, c, d, a, x[ 7], S14); /* 8 */
+ FF (a, b, c, d, x[ 8], S11); /* 9 */
+ FF (d, a, b, c, x[ 9], S12); /* 10 */
+ FF (c, d, a, b, x[10], S13); /* 11 */
+ FF (b, c, d, a, x[11], S14); /* 12 */
+ FF (a, b, c, d, x[12], S11); /* 13 */
+ FF (d, a, b, c, x[13], S12); /* 14 */
+ FF (c, d, a, b, x[14], S13); /* 15 */
+ FF (b, c, d, a, x[15], S14); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 0], S21); /* 17 */
+ GG (d, a, b, c, x[ 4], S22); /* 18 */
+ GG (c, d, a, b, x[ 8], S23); /* 19 */
+ GG (b, c, d, a, x[12], S24); /* 20 */
+ GG (a, b, c, d, x[ 1], S21); /* 21 */
+ GG (d, a, b, c, x[ 5], S22); /* 22 */
+ GG (c, d, a, b, x[ 9], S23); /* 23 */
+ GG (b, c, d, a, x[13], S24); /* 24 */
+ GG (a, b, c, d, x[ 2], S21); /* 25 */
+ GG (d, a, b, c, x[ 6], S22); /* 26 */
+ GG (c, d, a, b, x[10], S23); /* 27 */
+ GG (b, c, d, a, x[14], S24); /* 28 */
+ GG (a, b, c, d, x[ 3], S21); /* 29 */
+ GG (d, a, b, c, x[ 7], S22); /* 30 */
+ GG (c, d, a, b, x[11], S23); /* 31 */
+ GG (b, c, d, a, x[15], S24); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 0], S31); /* 33 */
+ HH (d, a, b, c, x[ 8], S32); /* 34 */
+ HH (c, d, a, b, x[ 4], S33); /* 35 */
+ HH (b, c, d, a, x[12], S34); /* 36 */
+ HH (a, b, c, d, x[ 2], S31); /* 37 */
+ HH (d, a, b, c, x[10], S32); /* 38 */
+ HH (c, d, a, b, x[ 6], S33); /* 39 */
+ HH (b, c, d, a, x[14], S34); /* 40 */
+ HH (a, b, c, d, x[ 1], S31); /* 41 */
+ HH (d, a, b, c, x[ 9], S32); /* 42 */
+ HH (c, d, a, b, x[ 5], S33); /* 43 */
+ HH (b, c, d, a, x[13], S34); /* 44 */
+ HH (a, b, c, d, x[ 3], S31); /* 45 */
+ HH (d, a, b, c, x[11], S32); /* 46 */
+ HH (c, d, a, b, x[ 7], S33); /* 47 */
+ HH (b, c, d, a, x[15], S34); /* 48 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)x, sizeof (x));
+}
+
+/* Encodes input (UINT4) into output (unsigned char). Assumes len is
+ a multiple of 4.
+ */
+static void Encode (output, input, len)
+unsigned char *output;
+UINT4 *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+}
+
+/* Decodes input (unsigned char) into output (UINT4). Assumes len is
+ a multiple of 4.
+ */
+static void Decode (output, input, len)
+
+UINT4 *output;
+const unsigned char *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
+ (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
+}
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
new file mode 100644
index 0000000..72c970b
--- /dev/null
+++ b/sys/kern/md5c.c
@@ -0,0 +1,339 @@
+/*
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * This code is the same as the code published by RSA Inc. It has been
+ * edited for clarity and style only.
+ */
+
+/*
+ * This file should be kept in sync with src/lib/libmd/md5c.c
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <machine/endian.h>
+#include <sys/endian.h>
+#include <sys/md5.h>
+
+static void MD5Transform(u_int32_t [4], const unsigned char [64]);
+
+#ifdef _KERNEL
+#define memset(x,y,z) bzero(x,z);
+#define memcpy(x,y,z) bcopy(y, x, z)
+#endif
+
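+/*
+ * MD5 is defined on little-endian 32-bit words, so on little-endian
+ * machines Encode and Decode reduce to plain copies.
+ */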
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+#define Encode memcpy
+#define Decode memcpy
+#else
+
+/*
+ * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Encode (unsigned char *output, u_int32_t *input, unsigned int len)
+{
+ unsigned int i;
+ u_int32_t *op = (u_int32_t *)output;
+
+ for (i = 0; i < len / 4; i++)
+ op[i] = htole32(input[i]);
+}
+
+/*
+ * Decodes input (unsigned char) into output (u_int32_t). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Decode (u_int32_t *output, const unsigned char *input, unsigned int len)
+{
+ unsigned int i;
+ const u_int32_t *ip = (const u_int32_t *)input;
+
+ for (i = 0; i < len / 4; i++)
+ output[i] = le32toh(ip[i]);
+}
+#endif
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits. */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context. */
+
+void
+MD5Init (context)
+ MD5_CTX *context;
+{
+
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants. */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/*
+ * MD5 block update operation. Continues an MD5 message-digest
+ * operation, processing another message block, and updating the
+ * context.
+ */
+
+void
+MD5Update (context, input, inputLen)
+ MD5_CTX *context;
+ const unsigned char *input;
+ unsigned int inputLen;
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits */
+ if ((context->count[0] += ((u_int32_t)inputLen << 3))
+ < ((u_int32_t)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((u_int32_t)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ /* Transform as many times as possible. */
+ if (inputLen >= partLen) {
+ memcpy((void *)&context->buffer[index], (const void *)input,
+ partLen);
+ MD5Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy ((void *)&context->buffer[index], (const void *)&input[i],
+ inputLen-i);
+}
+
+/*
+ * MD5 padding. Adds padding followed by original length.
+ */
+
+void
+MD5Pad (context)
+ MD5_CTX *context;
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64. */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD5Update (context, bits, 8);
+}
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation, writing the
+ * message digest and zeroizing the context.
+ */
+
+void
+MD5Final (digest, context)
+ unsigned char digest[16];
+ MD5_CTX *context;
+{
+ /* Do padding. */
+ MD5Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information. */
+ memset ((void *)context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block. */
+
+static void
+MD5Transform (state, block)
+ u_int32_t state[4];
+ const unsigned char block[64];
+{
+ u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information. */
+ memset ((void *)x, 0, sizeof (x));
+}
diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c
new file mode 100644
index 0000000..9e6fdca
--- /dev/null
+++ b/sys/kern/p1003_1b.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 1996, 1997, 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+#include <posix4/posix4.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* The system calls return ENOSYS if an entry is called that is
+ * not supported at run time. The call is also logged, since some
+ * programs use these interfaces when they shouldn't; the logging will
+ * be removed if it proves annoying.
+ */
+int
+syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ td->td_proc->p_comm, td->td_proc->p_pid, s);
+
+ /* a " return nosys(p, uap); " here causes a core dump.
+ */
+
+ return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via a module:
+ */
+
+static int sched_attach(void)
+{
+ return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int sched_attach(void)
+{
+ int ret = ksched_attach(&ksched);
+
+ if (ret == 0)
+ p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1);
+
+ return ret;
+}
+
+/*
+ * MPSAFE
+ */
+int sched_setparam(struct thread *td,
+ struct sched_setparam_args *uap)
+{
+ struct thread *targettd;
+ struct proc *targetp;
+ int e;
+ struct sched_param sched_param;
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ mtx_lock(&Giant);
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansched(td, targetp);
+ PROC_UNLOCK(targetp);
+ if (e == 0) {
+ e = ksched_setparam(&td->td_retval[0], ksched, targettd,
+ (const struct sched_param *)&sched_param);
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (e);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_getparam(struct thread *td,
+ struct sched_getparam_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ mtx_lock(&Giant);
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansee(td, targetp);
+ PROC_UNLOCK(targetp);
+ if (e)
+ goto done2;
+
+ e = ksched_getparam(&td->td_retval[0], ksched, targettd, &sched_param);
+ if (e == 0)
+ e = copyout(&sched_param, uap->param, sizeof(sched_param));
+done2:
+ mtx_unlock(&Giant);
+ return (e);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_setscheduler(struct thread *td,
+ struct sched_setscheduler_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ mtx_lock(&Giant);
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansched(td, targetp);
+ PROC_UNLOCK(targetp);
+ if (e == 0) {
+ e = ksched_setscheduler(&td->td_retval[0], ksched, targettd,
+ uap->policy, (const struct sched_param *)&sched_param);
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (e);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_getscheduler(struct thread *td,
+ struct sched_getscheduler_args *uap)
+{
+ int e;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ mtx_lock(&Giant);
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansee(td, targetp);
+ PROC_UNLOCK(targetp);
+ if (e == 0)
+ e = ksched_getscheduler(&td->td_retval[0], ksched, targettd);
+
+done2:
+ mtx_unlock(&Giant);
+ return (e);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_yield(struct thread *td,
+ struct sched_yield_args *uap)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = ksched_yield(&td->td_retval[0], ksched);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_get_priority_max(struct thread *td,
+ struct sched_get_priority_max_args *uap)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = ksched_get_priority_max(&td->td_retval[0], ksched, uap->policy);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_get_priority_min(struct thread *td,
+ struct sched_get_priority_min_args *uap)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = ksched_get_priority_min(&td->td_retval[0], ksched, uap->policy);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int sched_rr_get_interval(struct thread *td,
+ struct sched_rr_get_interval_args *uap)
+{
+ int e;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ mtx_lock(&Giant);
+ if (uap->pid == 0) {
+ targettd = td;
+ targetp = td->td_proc;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ e = ESRCH;
+ goto done2;
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+ }
+
+ e = p_cansee(td, targetp);
+ PROC_UNLOCK(targetp);
+ if (e == 0) {
+ e = ksched_rr_get_interval(&td->td_retval[0], ksched, targettd,
+ uap->interval);
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (e);
+}
+
+#endif
+
+static void p31binit(void *notused)
+{
+ (void) sched_attach();
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
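The SYSCALL_NOT_PRESENT_GEN() stubs above rely on a generator macro defined in posix4/posix4.h, which is not part of this diff. A minimal sketch of what such a generator presumably expands to, reusing the syscall_not_present() helper shown above (the exact macro in posix4.h may differ):

/*
 * Hypothetical sketch of SYSCALL_NOT_PRESENT_GEN(); the real definition
 * lives in posix4/posix4.h. Each expansion emits a syscall entry point
 * that reports ENOSYS via syscall_not_present(), passing the syscall
 * name so it shows up in the log message.
 */
#define SYSCALL_NOT_PRESENT_GEN(SC)					\
int SC(struct thread *td, struct SC##_args *uap)			\
{									\
	return (syscall_not_present(td, #SC, (struct nosys_args *)uap)); \
}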
diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c
new file mode 100644
index 0000000..09af27d
--- /dev/null
+++ b/sys/kern/posix4_mib.c
@@ -0,0 +1,115 @@
+/*-
+ * Copyright (c) 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <posix4/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+
+/* OID_AUTO does not work with sysconf(3); it would have to be modified
+ * to look the OID up by name from the index.
+ * For now these are left as top-level sysctls.
+ */
+
+#if 1
+
+SYSCTL_DECL(_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_p1003_1b, num, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+
+#else
+
+SYSCTL_DECL(_kern_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io);
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+/* p31b_setcfg: Set the configuration
+ */
+void p31b_setcfg(int num, int value)
+{
+ if (num >= 1 && num < CTL_P1003_1B_MAXID)
+ facility[num - 1] = value;
+}
+
+/*
+ * Turn on indications for standard (non-configurable) kernel features.
+ */
+static void
+p31b_set_standard(void *dummy)
+{
+ /* ??? p31b_setcfg(CTL_P1003_1B_FSYNC, 1); */
+ p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 1);
+ p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 1);
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard,
+ 0);
+
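Each P1B_SYSCTL() line above exports one slot of the facility[] array as a read-only integer under the top-level p1003_1b sysctl node, and p31b_setcfg() simply stores into facility[num - 1]. As an illustration (not part of the diff), a userland program could read one of these values with sysctlbyname(3):

/* Userland sketch: read the p1003_1b.pagesize value exported above. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pagesize;
	size_t len = sizeof(pagesize);

	if (sysctlbyname("p1003_1b.pagesize", &pagesize, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("POSIX.1b page size: %d\n", pagesize);
	return (0);
}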
diff --git a/sys/kern/subr_acl_posix1e.c b/sys/kern/subr_acl_posix1e.c
new file mode 100644
index 0000000..70be0ec
--- /dev/null
+++ b/sys/kern/subr_acl_posix1e.c
@@ -0,0 +1,830 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ * Support for POSIX.1e access control lists.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+MALLOC_DEFINE(M_ACL, "acl", "access control list");
+
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics.
+ * Return 0 on success, else an errno value. Should be merged into
+ * vaccess() eventually.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ mode_t dac_granted;
+ mode_t cap_granted;
+ mode_t acl_mask_granted;
+ int group_matched, i;
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt
+ * to use privileges granted via cap_granted. In some cases,
+ * which privileges to use may be ambiguous due to "best match",
+ * in which case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply until we've found
+ * a DAC entry that matches but has failed to allow access.
+ */
+#ifndef CAPABILITIES
+ if (suser_cred(cred, PRISON_ROOT) == 0)
+ cap_granted = (VEXEC | VREAD | VWRITE | VADMIN);
+ else
+ cap_granted = 0;
+#else
+ cap_granted = 0;
+
+ if (type == VDIR) {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_READ_SEARCH, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ } else {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_EXECUTE, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ }
+
+ if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH,
+ PRISON_ROOT))
+ cap_granted |= VREAD;
+
+ if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE,
+ PRISON_ROOT))
+ cap_granted |= VWRITE;
+
+ if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER,
+ PRISON_ROOT))
+ cap_granted |= VADMIN;
+#endif /* CAPABILITIES */
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK
+ * and ACL_OTHER entries, saving some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) ==
+ acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access
+ * ACL. If it doesn't, then generate a serious failure. For now,
+ * this means a debugging message and EPERM, but in the future
+ * should probably be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields
+ * are masked by an ACL_MASK entry, if any. As such, first identify
+ * the ACL_MASK field, then iterate through identifying potential
+ * user matches, then group matches. If there is no ACL_MASK,
+ * assume that the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= VWRITE;
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE;
+
+ /*
+ * Iterate through user ACL entries. Do checks twice, first
+ * without privilege, and then if a match is found but failed,
+ * a second time with privilege.
+ */
+
+ /*
+ * Check ACL_USER ACL entries.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a
+ * "best" match. Iterate across, testing each potential group
+ * match. Make sure we keep track of whether we found a match
+ * or not, so that we know if we should try again with any
+ * available privilege, or if we should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via
+ * pure DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an
+ * inode with a mode_t field, this routine converts a mode_t entry
+ * to an acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an
+ * implementing filesystem to determine if it should accept this and
+ * rely on the POSIX.1e ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ * Verify all ae_tag entries are understood by this implementation.
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * These calls wrap the real vnode operations, and are called by the
+ * syscall code once the syscall has converted the path or file
+ * descriptor to a vnode (unlocked). The aclp pointer is assumed
+ * still to point to userland, so this should not be consumed within
+ * the kernel except by syscall code. Other code should directly
+ * invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernacl;
+ struct mount *mp;
+ int error;
+
+ error = copyin(aclp, &inkernacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return(error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error == 0)
+ error = copyout(&inkernelacl, aclp, sizeof(struct acl));
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, NULL, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ error = copyin(aclp, &inkernelacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
+ return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.
+ * Don't need to lock, as the vacl_ code will get/release any locks
+ * required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, SCARG(uap, type));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
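To make the checking rules in acl_posix1e_check() concrete: the smallest valid access ACL has exactly one ACL_USER_OBJ, one ACL_GROUP_OBJ and one ACL_OTHER entry, and needs no ACL_MASK because it contains no ACL_USER or ACL_GROUP entries. A kernel-context sketch (not part of the diff) that builds such an ACL for a 0640-style mode using acl_posix1e_mode_to_entry() from above:

/*
 * Sketch: build the minimal valid access ACL for a file owned by
 * uid/gid with read/write for the owner and read for the group,
 * using the helpers defined in subr_acl_posix1e.c above.
 */
static void
build_minimal_acl(struct acl *acl, uid_t uid, gid_t gid)
{
	acl->acl_cnt = 3;
	acl->acl_entry[0] = acl_posix1e_mode_to_entry(ACL_USER_OBJ, uid, gid,
	    S_IRUSR | S_IWUSR);
	acl->acl_entry[1] = acl_posix1e_mode_to_entry(ACL_GROUP_OBJ, uid, gid,
	    S_IRGRP);
	acl->acl_entry[2] = acl_posix1e_mode_to_entry(ACL_OTHER, uid, gid, 0);
	/* acl_posix1e_check(acl) accepts this: one of each required tag. */
}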
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
new file mode 100644
index 0000000..5132e02
--- /dev/null
+++ b/sys/kern/subr_autoconf.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Lawrence Berkeley Laboratories.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+
+/*
+ * Autoconfiguration subroutines.
+ */
+
+/*
+ * "Interrupt driven config" functions.
+ */
+static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
+ TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
+
+
+/* ARGSUSED */
+static void run_interrupt_driven_config_hooks(void *dummy);
+static void
+run_interrupt_driven_config_hooks(dummy)
+ void *dummy;
+{
+ struct intr_config_hook *hook_entry, *next_entry;
+
+ for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
+ hook_entry != NULL;
+ hook_entry = next_entry) {
+ next_entry = TAILQ_NEXT(hook_entry, ich_links);
+ (*hook_entry->ich_func)(hook_entry->ich_arg);
+ }
+
+ while (!TAILQ_EMPTY(&intr_config_hook_list)) {
+ tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0);
+ }
+}
+SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
+ run_interrupt_driven_config_hooks, NULL)
+
+/*
+ * Register a hook that will be called after "cold"
+ * autoconfiguration is complete and interrupts can
+ * be used to complete initialization.
+ */
+int
+config_intrhook_establish(hook)
+ struct intr_config_hook *hook;
+{
+ struct intr_config_hook *hook_entry;
+
+ for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
+ hook_entry != NULL;
+ hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+ if (hook_entry == hook)
+ break;
+ if (hook_entry != NULL) {
+ printf("config_intrhook_establish: establishing an "
+ "already established hook.\n");
+ return (1);
+ }
+ TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+ if (cold == 0)
+ /* XXX Sufficient for modules loaded after initial config??? */
+ run_interrupt_driven_config_hooks(NULL);
+ return (0);
+}
+
+void
+config_intrhook_disestablish(hook)
+ struct intr_config_hook *hook;
+{
+ struct intr_config_hook *hook_entry;
+
+ for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
+ hook_entry != NULL;
+ hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+ if (hook_entry == hook)
+ break;
+ if (hook_entry == NULL)
+ panic("config_intrhook_disestablish: disestablishing an "
+ "unestablished hook");
+
+ TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+ /* Wakeup anyone watching the list */
+ wakeup(&intr_config_hook_list);
+}
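A typical consumer of this interface is a driver whose attach routine cannot finish until interrupts work. A hypothetical sketch (not part of the diff, field names taken from the code above): fill in ich_func and ich_arg, register the hook with config_intrhook_establish(), and have the hook remove itself with config_intrhook_disestablish() once configuration completes so run_interrupt_driven_config_hooks() can return:

/* Hypothetical driver-side use of the config_intrhook interface above. */
static struct intr_config_hook foo_config_hook;

static void
foo_finish_attach(void *arg)
{
	struct foo_softc *sc = arg;		/* hypothetical softc */

	/* ... interrupt-driven probing and setup of sc ... */

	/* Done: unhook so the boot-time hook list can drain. */
	config_intrhook_disestablish(&foo_config_hook);
}

static int
foo_attach(struct foo_softc *sc)
{
	foo_config_hook.ich_func = foo_finish_attach;
	foo_config_hook.ich_arg = sc;
	if (config_intrhook_establish(&foo_config_hook) != 0)
		return (ENXIO);		/* hook already established */
	return (0);
}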
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
new file mode 100644
index 0000000..eeeb7d9
--- /dev/null
+++ b/sys/kern/subr_blist.c
@@ -0,0 +1,929 @@
+
+/*
+ * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
+ *
+ * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution
+ * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT.
+ *
+ * This module implements a general bitmap allocator/deallocator. The
+ * allocator eats around 2 bits per 'block'. The module does not
+ * try to interpret the meaning of a 'block' other then to return
+ * SWAPBLK_NONE on an allocation failure.
+ *
+ * A radix tree is used to maintain the bitmap. Two radix constants are
+ * involved: One for the bitmaps contained in the leaf nodes (typically
+ * 32), and one for the meta nodes (typically 16). Both meta and leaf
+ * nodes have a hint field. This field gives us a hint as to the largest
+ * free contiguous range of blocks under the node. It may contain a
+ * value that is too high, but will never contain a value that is too
+ * low. When the radix tree is searched, allocation failures in subtrees
+ * update the hint.
+ *
+ * The radix tree also implements two collapsed states for meta nodes:
+ * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
+ * in either of these two states, all information contained underneath
+ * the node is considered stale. These states are used to optimize
+ * allocation and freeing operations.
+ *
+ * The hinting greatly increases code efficiency for allocations while
+ * the general radix structure optimizes both allocations and frees. The
+ * radix tree should be able to operate well no matter how much
+ * fragmentation there is and no matter how large a bitmap is used.
+ *
+ * Unlike the rlist code, the blist code wires all necessary memory at
+ * creation time. Neither allocations nor frees require interaction with
+ * the memory subsystem. In contrast, the rlist code may allocate memory
+ * on an rlist_free() call. The non-blocking features of the blist code
+ * are used to great advantage in the swap code (vm/nswap_pager.c). The
+ * rlist code uses a little less overall memory than the blist code (but
+ * due to swap interleaving not all that much less), but the blist code
+ * scales much, much better.
+ *
+ * LAYOUT: The radix tree is laid out recursively using a
+ * linear array. Each meta node is immediately followed (laid out
+ * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
+ * is a recursive structure but one that can be easily scanned through
+ * a very simple 'skip' calculation. In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation. We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan. The memory allocation is only large enough
+ * to cover the number of blocks requested at creation time even if it
+ * must be encompassed in a larger root-node radix.
+ *
+ * NOTE: the allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
+ * large' if you try. This is an area that could use improvement. The
+ * radix is large enough that this restriction does not affect the swap
+ * system, though. Currently only the allocation code is affected by
+ * this algorithmic limitation. The freeing code can handle arbitrary
+ * ranges.
+ *
+ * This code can be compiled stand-alone for debugging.
+ *
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/blist.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+
+#else
+
+#ifndef BLIST_NO_DEBUG
+#define BLIST_DEBUG
+#endif
+
+#define SWAPBLK_NONE ((daddr_t)-1)
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#define malloc(a,b,c) malloc(a)
+#define free(a,b) free(a)
+
+typedef unsigned int u_daddr_t;
+
+#include <sys/blist.h>
+
+void panic(const char *ctl, ...);
+
+#endif
+
+/*
+ * static support functions
+ */
+
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
+ daddr_t count, daddr_t radix, int skip);
+static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, blist_t dest, daddr_t count);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
+ int skip, daddr_t count);
+#ifndef _KERNEL
+static void blst_radix_print(blmeta_t *scan, daddr_t blk,
+ daddr_t radix, int skip, int tab);
+#endif
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
+#endif
+
+/*
+ * blist_create() - create a blist capable of handling up to the specified
+ * number of blocks
+ *
+ * blocks must be greater than 0
+ *
+ * The smallest blist consists of a single leaf node capable of
+ * managing BLIST_BMAP_RADIX blocks.
+ */
+
+blist_t
+blist_create(daddr_t blocks)
+{
+ blist_t bl;
+ int radix;
+ int skip = 0;
+
+ /*
+ * Calculate radix and skip field used for scanning.
+ */
+ radix = BLIST_BMAP_RADIX;
+
+ while (radix < blocks) {
+ radix <<= BLIST_META_RADIX_SHIFT;
+ skip = (skip + 1) << BLIST_META_RADIX_SHIFT;
+ }
+
+ bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO);
+
+ bl->bl_blocks = blocks;
+ bl->bl_radix = radix;
+ bl->bl_skip = skip;
+ bl->bl_rootblks = 1 +
+ blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
+ bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK);
+
+#if defined(BLIST_DEBUG)
+ printf(
+ "BLIST representing %d blocks (%d MB of swap)"
+ ", requiring %dK of ram\n",
+ bl->bl_blocks,
+ bl->bl_blocks * 4 / 1024,
+ (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ );
+ printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
+#endif
+ blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
+
+ return(bl);
+}
+
+void
+blist_destroy(blist_t bl)
+{
+ free(bl->bl_root, M_SWAP);
+ free(bl, M_SWAP);
+}
+
+/*
+ * blist_alloc() - reserve space in the block bitmap. Return the base
+ * of a contiguous region or SWAPBLK_NONE if space could
+ * not be allocated.
+ */
+
+daddr_t
+blist_alloc(blist_t bl, daddr_t count)
+{
+ daddr_t blk = SWAPBLK_NONE;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blk = blst_leaf_alloc(bl->bl_root, 0, count);
+ else
+ blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
+ if (blk != SWAPBLK_NONE)
+ bl->bl_free -= count;
+ }
+ return(blk);
+}
+
+/*
+ * blist_free() - free up space in the block bitmap. Panics if an
+ * inconsistency is found.
+ */
+
+void
+blist_free(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blst_leaf_free(bl->bl_root, blkno, count);
+ else
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free += count;
+ }
+}
+
+/*
+ * blist_resize() - resize an existing radix tree to handle the
+ * specified number of blocks. This will reallocate
+ * the tree and transfer the previous bitmap to the new
+ * one. When extending the tree you can specify whether
+ * the new blocks are to be left allocated or freed.
+ */
+
+void
+blist_resize(blist_t *pbl, daddr_t count, int freenew)
+{
+ blist_t newbl = blist_create(count);
+ blist_t save = *pbl;
+
+ *pbl = newbl;
+ if (count > save->bl_blocks)
+ count = save->bl_blocks;
+ blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+
+ /*
+ * If resizing upwards, should we free the new space or not?
+ */
+ if (freenew && count < newbl->bl_blocks) {
+ blist_free(newbl, count, newbl->bl_blocks - count);
+ }
+ blist_destroy(save);
+}
+
+#ifdef BLIST_DEBUG
+
+/*
+ * blist_print() - dump radix tree
+ */
+
+void
+blist_print(blist_t bl)
+{
+ printf("BLIST {\n");
+ blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+ printf("}\n");
+}
+
+#endif
+
+/************************************************************************
+ * ALLOCATION SUPPORT FUNCTIONS *
+ ************************************************************************
+ *
+ * These support functions do all the actual work. They may seem
+ * rather longish, but that's because I've commented them up. The
+ * actual code is straightforward.
+ *
+ */
+
+/*
+ * blst_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
+ *
+ * This is the core of the allocator and is optimized for the 1 block
+ * and the BLIST_BMAP_RADIX block allocation cases. Other cases are
+ * somewhat slower. The 1 block allocation case is log2 and extremely
+ * quick.
+ */
+
+static daddr_t
+blst_leaf_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ u_daddr_t orig = scan->u.bmu_bitmap;
+
+ if (orig == 0) {
+ /*
+ * Optimize bitmap all-allocated case. Also, count = 1
+ * case assumes at least 1 bit is free in the bitmap, so
+ * we have to take care of this case here.
+ */
+ scan->bm_bighint = 0;
+ return(SWAPBLK_NONE);
+ }
+ if (count == 1) {
+ /*
+ * Optimized code to allocate one bit out of the bitmap
+ */
+ u_daddr_t mask;
+ int j = BLIST_BMAP_RADIX/2;
+ int r = 0;
+
+ mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
+
+ while (j) {
+ if ((orig & mask) == 0) {
+ r += j;
+ orig >>= j;
+ }
+ j >>= 1;
+ mask >>= j;
+ }
+ scan->u.bmu_bitmap &= ~(1 << r);
+ return(blk + r);
+ }
+ if (count <= BLIST_BMAP_RADIX) {
+ /*
+ * non-optimized code to allocate N bits out of the bitmap.
+ * The more bits, the faster the code runs. It will run
+ * the slowest allocating 2 bits, but since there aren't any
+ * memory ops in the core loop (or shouldn't be, anyway),
+ * you probably won't notice the difference.
+ */
+ int j;
+ int n = BLIST_BMAP_RADIX - count;
+ u_daddr_t mask;
+
+ mask = (u_daddr_t)-1 >> n;
+
+ for (j = 0; j <= n; ++j) {
+ if ((orig & mask) == mask) {
+ scan->u.bmu_bitmap &= ~mask;
+ return(blk + j);
+ }
+ mask = (mask << 1);
+ }
+ }
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * blst_meta_alloc() - allocate at a meta node in the radix tree.
+ *
+ * Attempt to allocate at a meta node. If we can't, we update
+ * bighint and return a failure. Updating bighint optimizes future
+ * calls that hit this node. We have to check for our collapse cases
+ * and we have a few optimizations strewn in as well.
+ */
+
+static daddr_t
+blst_meta_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t count,
+ daddr_t radix,
+ int skip
+) {
+ int i;
+ int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ scan->bm_bighint = count;
+ return(SWAPBLK_NONE);
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix >>= BLIST_META_RADIX_SHIFT;
+
+ /*
+ * ALL-FREE special case, initialize the uninitialized
+ * sublevel.
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix >>= BLIST_META_RADIX_SHIFT;
+ }
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count <= scan[i].bm_bighint) {
+ /*
+ * count fits in object
+ */
+ daddr_t r;
+ if (next_skip == 1) {
+ r = blst_leaf_alloc(&scan[i], blk, count);
+ } else {
+ r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
+ }
+ if (r != SWAPBLK_NONE) {
+ scan->u.bmu_avail -= count;
+ if (scan->bm_bighint > scan->u.bmu_avail)
+ scan->bm_bighint = scan->u.bmu_avail;
+ return(r);
+ }
+ } else if (scan[i].bm_bighint == (daddr_t)-1) {
+ /*
+ * Terminator
+ */
+ break;
+ } else if (count > radix) {
+ /*
+ * count does not fit in object even if it were
+ * completely free.
+ */
+ panic("blist_meta_alloc: allocation too large");
+ }
+ blk += radix;
+ }
+
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ if (scan->bm_bighint >= count)
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * BLST_LEAF_FREE() - free allocated block from leaf bitmap
+ *
+ */
+
+static void
+blst_leaf_free(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ /*
+ * free some data in this bitmap
+ *
+ * e.g.
+ * 0000111111111110000
+ * \_________/\__/
+ * v n
+ */
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ u_daddr_t mask;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ if (scan->u.bmu_bitmap & mask)
+ panic("blst_radix_free: freeing free block");
+ scan->u.bmu_bitmap |= mask;
+
+ /*
+ * We could probably do a better job here. We are required to make
+ * bighint at least as large as the biggest contiguous block of
+ * data. If we just shoehorn it, a little extra overhead will
+ * be incurred on the next allocation (but only that one typically).
+ */
+ scan->bm_bighint = BLIST_BMAP_RADIX;
+}
+
+/*
+ * BLST_META_FREE() - free allocated blocks from radix tree meta info
+ *
+ * This support routine frees a range of blocks from the bitmap.
+ * The range must be entirely enclosed by this radix node. If a
+ * meta node, we break the range down recursively to free blocks
+ * in subnodes (which means that this code can free an arbitrary
+ * range whereas the allocation code cannot allocate an arbitrary
+ * range).
+ */
+
+static void
+blst_meta_free(
+ blmeta_t *scan,
+ daddr_t freeBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+#if 0
+ printf("FREE (%x,%d) FROM (%x,%d)\n",
+ freeBlk, count,
+ blk, radix
+ );
+#endif
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case, with possible
+ * shortcut to ALL-FREE special case.
+ */
+ scan->u.bmu_avail = count;
+ scan->bm_bighint = count;
+
+ if (count != radix) {
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ scan[i].bm_bighint = 0;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = 0;
+ } else {
+ scan[i].u.bmu_avail = 0;
+ }
+ }
+ /* fall through */
+ }
+ } else {
+ scan->u.bmu_avail += count;
+ /* scan->bm_bighint = radix; */
+ }
+
+ /*
+ * ALL-FREE special case.
+ */
+
+ if (scan->u.bmu_avail == radix)
+ return;
+ if (scan->u.bmu_avail > radix)
+ panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
+ (long long)count, (long long)scan->u.bmu_avail,
+ (long long)radix);
+
+ /*
+ * Break the free down into its components
+ */
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+
+ i = (freeBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < freeBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - freeBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_free: freeing unexpected range");
+
+ if (next_skip == 1) {
+ blst_leaf_free(&scan[i], freeBlk, v);
+ } else {
+ blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
+ }
+ if (scan->bm_bighint < scan[i].bm_bighint)
+ scan->bm_bighint = scan[i].bm_bighint;
+ count -= v;
+ freeBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+}
+
+/*
+ * BLST_COPY() - copy one radix tree to another
+ *
+ * Locates free space in the source tree and frees it in the destination
+ * tree. The space must not already be free in the destination.
+ */
+
+static void blst_copy(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t radix,
+ daddr_t skip,
+ blist_t dest,
+ daddr_t count
+) {
+ int next_skip;
+ int i;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ u_daddr_t v = scan->u.bmu_bitmap;
+
+ if (v == (u_daddr_t)-1) {
+ blist_free(dest, blk, count);
+ } else if (v != 0) {
+ int i;
+
+ for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
+ if (v & (1 << i))
+ blist_free(dest, blk + i, 1);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Meta node
+ */
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * Source all allocated, leave dest allocated
+ */
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ /*
+ * Source all free, free entire dest
+ */
+ if (count < radix)
+ blist_free(dest, blk, count);
+ else
+ blist_free(dest, blk, radix);
+ return;
+ }
+
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ for (i = 1; count && i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+
+ if (count >= radix) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ radix
+ );
+ count -= radix;
+ } else {
+ if (count) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ count
+ );
+ }
+ count = 0;
+ }
+ blk += radix;
+ }
+}
+
+/*
+ * BLST_RADIX_INIT() - initialize radix tree
+ *
+ * Initialize our meta structures and bitmaps and calculate the exact
+ * amount of space required to manage 'count' blocks - this space may
+ * be considerably less than the calculated radix due to the large
+ * RADIX values we use.
+ */
+
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+{
+ int i;
+ int next_skip;
+ daddr_t memindex = 0;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_bitmap = 0;
+ }
+ return(memindex);
+ }
+
+ /*
+ * Meta node. If allocating the entire object we can special
+ * case it. However, we need to figure out how much memory
+ * is required to manage 'count' blocks, so we continue on anyway.
+ */
+
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_avail = 0;
+ }
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count >= radix) {
+ /*
+ * Allocate the entire object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ radix
+ );
+ count -= radix;
+ } else if (count > 0) {
+ /*
+ * Allocate a partial object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ count
+ );
+ count = 0;
+ } else {
+ /*
+ * Add terminator and break out
+ */
+ if (scan)
+ scan[i].bm_bighint = (daddr_t)-1;
+ break;
+ }
+ }
+ if (memindex < i)
+ memindex = i;
+ return(memindex);
+}
+
+#ifdef BLIST_DEBUG
+
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+{
+ int i;
+ int next_skip;
+ int lastState = 0;
+
+ if (radix == BLIST_BMAP_RADIX) {
+ printf(
+ "%*.*s(%04x,%d): bitmap %08x big=%d\n",
+ tab, tab, "",
+ blk, radix,
+ scan->u.bmu_bitmap,
+ scan->bm_bighint
+ );
+ return;
+ }
+
+ if (scan->u.bmu_avail == 0) {
+ printf(
+ "%*.*s(%04x,%d) ALL ALLOCATED\n",
+ tab, tab, "",
+ blk,
+ radix
+ );
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ printf(
+ "%*.*s(%04x,%d) ALL FREE\n",
+ tab, tab, "",
+ blk,
+ radix
+ );
+ return;
+ }
+
+ printf(
+ "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+ tab, tab, "",
+ blk, radix,
+ scan->u.bmu_avail,
+ radix,
+ scan->bm_bighint
+ );
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+ tab += 4;
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1) {
+ printf(
+ "%*.*s(%04x,%d): Terminator\n",
+ tab, tab, "",
+ blk, radix
+ );
+ lastState = 0;
+ break;
+ }
+ blst_radix_print(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ tab
+ );
+ blk += radix;
+ }
+ tab -= 4;
+
+ printf(
+ "%*.*s}\n",
+ tab, tab, ""
+ );
+}
+
+#endif
+
+#ifdef BLIST_DEBUG
+
+int
+main(int ac, char **av)
+{
+ int size = 1024;
+ int i;
+ blist_t bl;
+
+ for (i = 1; i < ac; ++i) {
+ const char *ptr = av[i];
+ if (*ptr != '-') {
+ size = strtol(ptr, NULL, 0);
+ continue;
+ }
+ ptr += 2;
+ fprintf(stderr, "Bad option: %s\n", ptr - 2);
+ exit(1);
+ }
+ bl = blist_create(size);
+ blist_free(bl, 0, size);
+
+ for (;;) {
+ char buf[1024];
+ daddr_t da = 0;
+ daddr_t count = 0;
+
+
+ printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+ fflush(stdout);
+ if (fgets(buf, sizeof(buf), stdin) == NULL)
+ break;
+ switch(buf[0]) {
+ case 'r':
+ if (sscanf(buf + 1, "%d", &count) == 1) {
+ blist_resize(&bl, count, 1);
+ } else {
+ printf("?\n");
+ }
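+ /* FALLTHROUGH - print the tree after a resize */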
+ case 'p':
+ blist_print(bl);
+ break;
+ case 'a':
+ if (sscanf(buf + 1, "%d", &count) == 1) {
+ daddr_t blk = blist_alloc(bl, count);
+ printf(" R=%04x\n", blk);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'f':
+ if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
+ blist_free(bl, da, count);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case '?':
+ case 'h':
+ puts(
+ "p -print\n"
+ "a %d -allocate\n"
+ "f %x %d -free\n"
+ "r %d -resize\n"
+ "h/? -help"
+ );
+ break;
+ default:
+ printf("?\n");
+ break;
+ }
+ }
+ return(0);
+}
+
+void
+panic(const char *ctl, ...)
+{
+ va_list va;
+
+ va_start(va, ctl);
+ vfprintf(stderr, ctl, va);
+ fprintf(stderr, "\n");
+ va_end(va);
+ exit(1);
+}
+
+#endif
+
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
new file mode 100644
index 0000000..7281051
--- /dev/null
+++ b/sys/kern/subr_bus.c
@@ -0,0 +1,2179 @@
+/*-
+ * Copyright (c) 1997,1998 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/kobj.h>
+#include <sys/bus_private.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <machine/stdarg.h> /* for device_printf() */
+
+static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
+
+#ifdef BUS_DEBUG
+
+static int bus_debug = 1;
+SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
+ "Debug bus code");
+
+#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a, printf("\n");}
+#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
+#define DRIVERNAME(d) ((d)? d->name : "no driver")
+#define DEVCLANAME(d) ((d)? d->name : "no devclass")
+
+/* Produce the indenting, indent*2 spaces plus a '.' ahead of that to
+ * prevent syslog from deleting initial spaces
+ */
+#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0)
+
+static void print_device_short(device_t dev, int indent);
+static void print_device(device_t dev, int indent);
+void print_device_tree_short(device_t dev, int indent);
+void print_device_tree(device_t dev, int indent);
+static void print_driver_short(driver_t *driver, int indent);
+static void print_driver(driver_t *driver, int indent);
+static void print_driver_list(driver_list_t drivers, int indent);
+static void print_devclass_short(devclass_t dc, int indent);
+static void print_devclass(devclass_t dc, int indent);
+void print_devclass_list_short(void);
+void print_devclass_list(void);
+
+#else
+/* Make the compiler ignore the function calls */
+#define PDEBUG(a) /* nop */
+#define DEVICENAME(d) /* nop */
+#define DRIVERNAME(d) /* nop */
+#define DEVCLANAME(d) /* nop */
+
+#define print_device_short(d,i) /* nop */
+#define print_device(d,i) /* nop */
+#define print_device_tree_short(d,i) /* nop */
+#define print_device_tree(d,i) /* nop */
+#define print_driver_short(d,i) /* nop */
+#define print_driver(d,i) /* nop */
+#define print_driver_list(d,i) /* nop */
+#define print_devclass_short(d,i) /* nop */
+#define print_devclass(d,i) /* nop */
+#define print_devclass_list_short() /* nop */
+#define print_devclass_list() /* nop */
+#endif
+
+TAILQ_HEAD(,device) bus_data_devices;
+static int bus_data_generation = 1;
+
+kobj_method_t null_methods[] = {
+ { 0, 0 }
+};
+
+DEFINE_CLASS(null, null_methods, 0);
+
+/*
+ * Devclass implementation
+ */
+
+static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
+
+static devclass_t
+devclass_find_internal(const char *classname, int create)
+{
+ devclass_t dc;
+
+ PDEBUG(("looking for %s", classname));
+ if (!classname)
+ return (NULL);
+
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ if (!strcmp(dc->name, classname))
+ return (dc);
+ }
+
+ PDEBUG(("%s not found%s", classname, (create? ", creating": "")));
+ if (create) {
+ dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
+ M_BUS, M_NOWAIT|M_ZERO);
+ if (!dc)
+ return (NULL);
+ dc->name = (char*) (dc + 1);
+ strcpy(dc->name, classname);
+ TAILQ_INIT(&dc->drivers);
+ TAILQ_INSERT_TAIL(&devclasses, dc, link);
+
+ bus_data_generation_update();
+ }
+
+ return (dc);
+}
+
+devclass_t
+devclass_create(const char *classname)
+{
+ return (devclass_find_internal(classname, TRUE));
+}
+
+devclass_t
+devclass_find(const char *classname)
+{
+ return (devclass_find_internal(classname, FALSE));
+}
+
+int
+devclass_add_driver(devclass_t dc, driver_t *driver)
+{
+ driverlink_t dl;
+ int i;
+
+ PDEBUG(("%s", DRIVERNAME(driver)));
+
+ dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dl)
+ return (ENOMEM);
+
+ /*
+ * Compile the driver's methods. Also increase the reference count
+ * so that the class doesn't get freed when the last instance
+ * goes. This means we can safely use static methods and avoid a
+ * double-free in devclass_delete_driver.
+ */
+ kobj_class_compile((kobj_class_t) driver);
+
+ /*
+ * Make sure the devclass which the driver is implementing exists.
+ */
+ devclass_find_internal(driver->name, TRUE);
+
+ dl->driver = driver;
+ TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
+ driver->refs++;
+
+ /*
+ * Call BUS_DRIVER_ADDED for any existing busses in this class.
+ */
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ BUS_DRIVER_ADDED(dc->devices[i], driver);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+int
+devclass_delete_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ driverlink_t dl;
+ device_t dev;
+ int i;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return (0);
+
+ /*
+ * Find the link structure in the bus' list of drivers.
+ */
+ TAILQ_FOREACH(dl, &busclass->drivers, link) {
+ if (dl->driver == driver)
+ break;
+ }
+
+ if (!dl) {
+ PDEBUG(("%s not found in %s list", driver->name,
+ busclass->name));
+ return (ENOENT);
+ }
+
+ /*
+ * Disassociate from any devices. We iterate through all the
+ * devices in the devclass of the driver and detach any which are
+ * using the driver and which have a parent in the devclass which
+ * we are deleting from.
+ *
+ * Note that since a driver can be in multiple devclasses, we
+ * should not detach devices which are not children of devices in
+ * the affected devclass.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver && dev->parent &&
+ dev->parent->devclass == busclass) {
+ if ((error = device_detach(dev)) != 0)
+ return (error);
+ device_set_driver(dev, NULL);
+ }
+ }
+ }
+
+ TAILQ_REMOVE(&busclass->drivers, dl, link);
+ free(dl, M_BUS);
+
+ driver->refs--;
+ if (driver->refs == 0)
+ kobj_class_free((kobj_class_t) driver);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+static driverlink_t
+devclass_find_driver_internal(devclass_t dc, const char *classname)
+{
+ driverlink_t dl;
+
+ PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ if (!strcmp(dl->driver->name, classname))
+ return (dl);
+ }
+
+ PDEBUG(("not found"));
+ return (NULL);
+}
+
+driver_t *
+devclass_find_driver(devclass_t dc, const char *classname)
+{
+ driverlink_t dl;
+
+ dl = devclass_find_driver_internal(dc, classname);
+ if (dl)
+ return (dl->driver);
+ return (NULL);
+}
+
+const char *
+devclass_get_name(devclass_t dc)
+{
+ return (dc->name);
+}
+
+device_t
+devclass_get_device(devclass_t dc, int unit)
+{
+ if (dc == NULL || unit < 0 || unit >= dc->maxunit)
+ return (NULL);
+ return (dc->devices[unit]);
+}
+
+void *
+devclass_get_softc(devclass_t dc, int unit)
+{
+ device_t dev;
+
+ dev = devclass_get_device(dc, unit);
+ if (!dev)
+ return (NULL);
+
+ return (device_get_softc(dev));
+}
+
+int
+devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
+{
+ int i;
+ int count;
+ device_t *list;
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ count++;
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ list[count] = dc->devices[i];
+ count++;
+ }
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
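+
+/*
+ * Example (illustrative sketch, not part of this interface): a caller
+ * that walks every device in a class.  The array returned by
+ * devclass_get_devices() is allocated from M_TEMP and must be freed by
+ * the caller.  The class name "foo" is hypothetical.
+ *
+ *	devclass_t dc = devclass_find("foo");
+ *	device_t *devs;
+ *	int ndevs, i;
+ *
+ *	if (dc != NULL && devclass_get_devices(dc, &devs, &ndevs) == 0) {
+ *		for (i = 0; i < ndevs; i++)
+ *			device_printf(devs[i], "present\n");
+ *		free(devs, M_TEMP);
+ *	}
+ */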
+
+int
+devclass_get_maxunit(devclass_t dc)
+{
+ return (dc->maxunit);
+}
+
+int
+devclass_find_free_unit(devclass_t dc, int unit)
+{
+ if (dc == NULL)
+ return (unit);
+ while (unit < dc->maxunit && dc->devices[unit] != NULL)
+ unit++;
+ return (unit);
+}
+
+static int
+devclass_alloc_unit(devclass_t dc, int *unitp)
+{
+ int unit = *unitp;
+
+ PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ /* If we were given a wired unit number, check for existing device */
+ /* XXX imp XXX */
+ if (unit != -1) {
+ if (unit >= 0 && unit < dc->maxunit &&
+ dc->devices[unit] != NULL) {
+ if (bootverbose)
+ printf("%s: %s%d already exists; skipping it\n",
+ dc->name, dc->name, *unitp);
+ return (EEXIST);
+ }
+ } else {
+ /* Unwired device, find the next available slot for it */
+ unit = 0;
+ while (unit < dc->maxunit && dc->devices[unit] != NULL)
+ unit++;
+ }
+
+ /*
+ * We've selected a unit beyond the length of the table, so let's
+ * extend the table to make room for all units up to and including
+ * this one.
+ */
+ if (unit >= dc->maxunit) {
+ device_t *newlist;
+ int newsize;
+
+ newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
+ newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
+ if (!newlist)
+ return (ENOMEM);
+ bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit);
+ bzero(newlist + dc->maxunit,
+ sizeof(device_t) * (newsize - dc->maxunit));
+ if (dc->devices)
+ free(dc->devices, M_BUS);
+ dc->devices = newlist;
+ dc->maxunit = newsize;
+ }
+ PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ *unitp = unit;
+ return (0);
+}
+
+static int
+devclass_add_device(devclass_t dc, device_t dev)
+{
+ int buflen, error;
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ buflen = snprintf(NULL, 0, "%s%d$", dc->name, dev->unit);
+ if (buflen < 0)
+ return (ENOMEM);
+ dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev->nameunit)
+ return (ENOMEM);
+
+ if ((error = devclass_alloc_unit(dc, &dev->unit)) != 0) {
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+ return (error);
+ }
+ dc->devices[dev->unit] = dev;
+ dev->devclass = dc;
+ snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit);
+
+ return (0);
+}
+
+static int
+devclass_delete_device(devclass_t dc, device_t dev)
+{
+ if (!dc || !dev)
+ return (0);
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ if (dev->devclass != dc || dc->devices[dev->unit] != dev)
+ panic("devclass_delete_device: inconsistent device class");
+ dc->devices[dev->unit] = NULL;
+ if (dev->flags & DF_WILDCARD)
+ dev->unit = -1;
+ dev->devclass = NULL;
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+
+ return (0);
+}
+
+static device_t
+make_device(device_t parent, const char *name, int unit)
+{
+ device_t dev;
+ devclass_t dc;
+
+ PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
+
+ if (name) {
+ dc = devclass_find_internal(name, TRUE);
+ if (!dc) {
+ printf("make_device: can't find device class %s\n",
+ name);
+ return (NULL);
+ }
+ } else {
+ dc = NULL;
+ }
+
+ dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev)
+ return (NULL);
+
+ dev->parent = parent;
+ TAILQ_INIT(&dev->children);
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ dev->devclass = NULL;
+ dev->unit = unit;
+ dev->nameunit = NULL;
+ dev->desc = NULL;
+ dev->busy = 0;
+ dev->devflags = 0;
+ dev->flags = DF_ENABLED;
+ dev->order = 0;
+ if (unit == -1)
+ dev->flags |= DF_WILDCARD;
+ if (name) {
+ dev->flags |= DF_FIXEDCLASS;
+ if (devclass_add_device(dc, dev)) {
+ kobj_delete((kobj_t) dev, M_BUS);
+ return (NULL);
+ }
+ }
+ dev->ivars = NULL;
+ dev->softc = NULL;
+
+ dev->state = DS_NOTPRESENT;
+
+ TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
+ bus_data_generation_update();
+
+ return (dev);
+}
+
+static int
+device_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_is_alive(child))
+ retval += BUS_PRINT_CHILD(dev, child);
+ else
+ retval += device_printf(child, " not found\n");
+
+ return (retval);
+}
+
+device_t
+device_add_child(device_t dev, const char *name, int unit)
+{
+ return (device_add_child_ordered(dev, 0, name, unit));
+}
+
+device_t
+device_add_child_ordered(device_t dev, int order, const char *name, int unit)
+{
+ device_t child;
+ device_t place;
+
+ PDEBUG(("%s at %s with order %d as unit %d",
+ name, DEVICENAME(dev), order, unit));
+
+ child = make_device(dev, name, unit);
+ if (child == NULL)
+ return (child);
+ child->order = order;
+
+ TAILQ_FOREACH(place, &dev->children, link) {
+ if (place->order > order)
+ break;
+ }
+
+ if (place) {
+ /*
+ * The device 'place' is the first device whose order is
+ * greater than the new child.
+ */
+ TAILQ_INSERT_BEFORE(place, child, link);
+ } else {
+ /*
+ * The new child's order is greater than or equal to the order of
+ * any existing device. Add the child to the tail of the list.
+ */
+ TAILQ_INSERT_TAIL(&dev->children, child, link);
+ }
+
+ bus_data_generation_update();
+ return (child);
+}
+
+int
+device_delete_child(device_t dev, device_t child)
+{
+ int error;
+ device_t grandchild;
+
+ PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
+
+ /* remove children first */
+ while ( (grandchild = TAILQ_FIRST(&child->children)) ) {
+ error = device_delete_child(child, grandchild);
+ if (error)
+ return (error);
+ }
+
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ if (child->devclass)
+ devclass_delete_device(child->devclass, child);
+ TAILQ_REMOVE(&dev->children, child, link);
+ TAILQ_REMOVE(&bus_data_devices, child, devlink);
+ device_set_desc(child, NULL);
+ free(child, M_BUS);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/*
+ * Find only devices attached to this bus.
+ */
+device_t
+device_find_child(device_t dev, const char *classname, int unit)
+{
+ devclass_t dc;
+ device_t child;
+
+ dc = devclass_find(classname);
+ if (!dc)
+ return (NULL);
+
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return (child);
+ return (NULL);
+}
+
+static driverlink_t
+first_matching_driver(devclass_t dc, device_t dev)
+{
+ if (dev->devclass)
+ return (devclass_find_driver_internal(dc, dev->devclass->name));
+ return (TAILQ_FIRST(&dc->drivers));
+}
+
+static driverlink_t
+next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
+{
+ if (dev->devclass) {
+ driverlink_t dl;
+ for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link))
+ if (!strcmp(dev->devclass->name, dl->driver->name))
+ return (dl);
+ return (NULL);
+ }
+ return (TAILQ_NEXT(last, link));
+}
+
+static int
+device_probe_child(device_t dev, device_t child)
+{
+ devclass_t dc;
+ driverlink_t best = 0;
+ driverlink_t dl;
+ int result, pri = 0;
+ int hasclass = (child->devclass != 0);
+
+ dc = dev->devclass;
+ if (!dc)
+ panic("device_probe_child: parent device has no devclass");
+
+ if (child->state == DS_ALIVE)
+ return (0);
+
+ for (dl = first_matching_driver(dc, child);
+ dl;
+ dl = next_matching_driver(dc, child, dl)) {
+ PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
+ device_set_driver(child, dl->driver);
+ if (!hasclass)
+ device_set_devclass(child, dl->driver->name);
+ result = DEVICE_PROBE(child);
+ if (!hasclass)
+ device_set_devclass(child, 0);
+
+ /*
+ * If the driver returns SUCCESS, there can be no higher match
+ * for this device.
+ */
+ if (result == 0) {
+ best = dl;
+ pri = 0;
+ break;
+ }
+
+ /*
+ * The driver returned an error so it certainly doesn't match.
+ */
+ if (result > 0) {
+ device_set_driver(child, 0);
+ continue;
+ }
+
+ /*
+ * The driver returned a priority lower than SUCCESS; remember the
+ * best matching driver so far. Initialise pri on the first match.
+ */
+ if (best == 0 || result > pri) {
+ best = dl;
+ pri = result;
+ continue;
+ }
+ }
+
+ /*
+ * If we found a driver, change state and initialise the devclass.
+ */
+ if (best) {
+ if (!child->devclass)
+ device_set_devclass(child, best->driver->name);
+ device_set_driver(child, best->driver);
+ if (pri < 0) {
+ /*
+ * A bit bogus. Call the probe method again to make
+ * sure that we have the right description.
+ */
+ DEVICE_PROBE(child);
+ }
+ child->state = DS_ALIVE;
+
+ bus_data_generation_update();
+ return (0);
+ }
+
+ return (ENXIO);
+}
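+
+/*
+ * Example (sketch only) of the probe return convention that
+ * device_probe_child() relies on: a DEVICE_PROBE method returns 0 for
+ * a definite match, a negative value for a usable but lower-priority
+ * match, and a positive errno (typically ENXIO) when the driver does
+ * not match.  The driver name "foo", the identification test and the
+ * particular negative priority are hypothetical.
+ *
+ *	static int
+ *	foo_probe(device_t dev)
+ *	{
+ *		if (!foo_id_matches(dev))
+ *			return (ENXIO);
+ *		device_set_desc(dev, "Foo controller");
+ *		return (-10);
+ *	}
+ */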
+
+device_t
+device_get_parent(device_t dev)
+{
+ return (dev->parent);
+}
+
+int
+device_get_children(device_t dev, device_t **devlistp, int *devcountp)
+{
+ int count;
+ device_t child;
+ device_t *list;
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ count++;
+ }
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ list[count] = child;
+ count++;
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
+
+driver_t *
+device_get_driver(device_t dev)
+{
+ return (dev->driver);
+}
+
+devclass_t
+device_get_devclass(device_t dev)
+{
+ return (dev->devclass);
+}
+
+const char *
+device_get_name(device_t dev)
+{
+ if (dev->devclass)
+ return (devclass_get_name(dev->devclass));
+ return (NULL);
+}
+
+const char *
+device_get_nameunit(device_t dev)
+{
+ return (dev->nameunit);
+}
+
+int
+device_get_unit(device_t dev)
+{
+ return (dev->unit);
+}
+
+const char *
+device_get_desc(device_t dev)
+{
+ return (dev->desc);
+}
+
+u_int32_t
+device_get_flags(device_t dev)
+{
+ return (dev->devflags);
+}
+
+int
+device_print_prettyname(device_t dev)
+{
+ const char *name = device_get_name(dev);
+
+ if (name == 0)
+ return (printf("unknown: "));
+ return (printf("%s%d: ", name, device_get_unit(dev)));
+}
+
+int
+device_printf(device_t dev, const char * fmt, ...)
+{
+ va_list ap;
+ int retval;
+
+ retval = device_print_prettyname(dev);
+ va_start(ap, fmt);
+ retval += vprintf(fmt, ap);
+ va_end(ap);
+ return (retval);
+}
+
+static void
+device_set_desc_internal(device_t dev, const char* desc, int copy)
+{
+ if (dev->desc && (dev->flags & DF_DESCMALLOCED)) {
+ free(dev->desc, M_BUS);
+ dev->flags &= ~DF_DESCMALLOCED;
+ dev->desc = NULL;
+ }
+
+ if (copy && desc) {
+ dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
+ if (dev->desc) {
+ strcpy(dev->desc, desc);
+ dev->flags |= DF_DESCMALLOCED;
+ }
+ } else {
+ /* Avoid a -Wcast-qual warning */
+ dev->desc = (char *)(uintptr_t) desc;
+ }
+
+ bus_data_generation_update();
+}
+
+void
+device_set_desc(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, FALSE);
+}
+
+void
+device_set_desc_copy(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, TRUE);
+}
+
+void
+device_set_flags(device_t dev, u_int32_t flags)
+{
+ dev->devflags = flags;
+}
+
+void *
+device_get_softc(device_t dev)
+{
+ return (dev->softc);
+}
+
+void
+device_set_softc(device_t dev, void *softc)
+{
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC))
+ free(dev->softc, M_BUS);
+ dev->softc = softc;
+ if (dev->softc)
+ dev->flags |= DF_EXTERNALSOFTC;
+ else
+ dev->flags &= ~DF_EXTERNALSOFTC;
+}
+
+void *
+device_get_ivars(device_t dev)
+{
+ return (dev->ivars);
+}
+
+void
+device_set_ivars(device_t dev, void * ivars)
+{
+ if (!dev)
+ return;
+
+ dev->ivars = ivars;
+
+ return;
+}
+
+device_state_t
+device_get_state(device_t dev)
+{
+ return (dev->state);
+}
+
+void
+device_enable(device_t dev)
+{
+ dev->flags |= DF_ENABLED;
+}
+
+void
+device_disable(device_t dev)
+{
+ dev->flags &= ~DF_ENABLED;
+}
+
+void
+device_busy(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ panic("device_busy: called for unattached device");
+ if (dev->busy == 0 && dev->parent)
+ device_busy(dev->parent);
+ dev->busy++;
+ dev->state = DS_BUSY;
+}
+
+void
+device_unbusy(device_t dev)
+{
+ if (dev->state != DS_BUSY)
+ panic("device_unbusy: called for non-busy device");
+ dev->busy--;
+ if (dev->busy == 0) {
+ if (dev->parent)
+ device_unbusy(dev->parent);
+ dev->state = DS_ATTACHED;
+ }
+}
+
+void
+device_quiet(device_t dev)
+{
+ dev->flags |= DF_QUIET;
+}
+
+void
+device_verbose(device_t dev)
+{
+ dev->flags &= ~DF_QUIET;
+}
+
+int
+device_is_quiet(device_t dev)
+{
+ return ((dev->flags & DF_QUIET) != 0);
+}
+
+int
+device_is_enabled(device_t dev)
+{
+ return ((dev->flags & DF_ENABLED) != 0);
+}
+
+int
+device_is_alive(device_t dev)
+{
+ return (dev->state >= DS_ALIVE);
+}
+
+int
+device_set_devclass(device_t dev, const char *classname)
+{
+ devclass_t dc;
+ int error;
+
+ if (!classname) {
+ if (dev->devclass)
+ devclass_delete_device(dev->devclass, dev);
+ return (0);
+ }
+
+ if (dev->devclass) {
+ printf("device_set_devclass: device class already set\n");
+ return (EINVAL);
+ }
+
+ dc = devclass_find_internal(classname, TRUE);
+ if (!dc)
+ return (ENOMEM);
+
+ error = devclass_add_device(dc, dev);
+
+ bus_data_generation_update();
+ return (error);
+}
+
+int
+device_set_driver(device_t dev, driver_t *driver)
+{
+ if (dev->state >= DS_ATTACHED)
+ return (EBUSY);
+
+ if (dev->driver == driver)
+ return (0);
+
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) {
+ free(dev->softc, M_BUS);
+ dev->softc = NULL;
+ }
+ kobj_delete((kobj_t) dev, 0);
+ dev->driver = driver;
+ if (driver) {
+ kobj_init((kobj_t) dev, (kobj_class_t) driver);
+ if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) {
+ dev->softc = malloc(driver->size, M_BUS,
+ M_NOWAIT | M_ZERO);
+ if (!dev->softc) {
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ return (ENOMEM);
+ }
+ }
+ } else {
+ kobj_init((kobj_t) dev, &null_class);
+ }
+
+ bus_data_generation_update();
+ return (0);
+}
+
+int
+device_probe_and_attach(device_t dev)
+{
+ device_t bus = dev->parent;
+ int error = 0;
+ int hasclass = (dev->devclass != 0);
+
+ if (dev->state >= DS_ALIVE)
+ return (0);
+
+ if (dev->flags & DF_ENABLED) {
+ error = device_probe_child(bus, dev);
+ if (!error) {
+ if (!device_is_quiet(dev))
+ device_print_child(bus, dev);
+ error = DEVICE_ATTACH(dev);
+ if (!error)
+ dev->state = DS_ATTACHED;
+ else {
+ printf("device_probe_and_attach: %s%d attach returned %d\n",
+ dev->driver->name, dev->unit, error);
+ /* Unset the class; set in device_probe_child */
+ if (!hasclass)
+ device_set_devclass(dev, 0);
+ device_set_driver(dev, NULL);
+ dev->state = DS_NOTPRESENT;
+ }
+ } else {
+ if (!(dev->flags & DF_DONENOMATCH)) {
+ BUS_PROBE_NOMATCH(bus, dev);
+ dev->flags |= DF_DONENOMATCH;
+ }
+ }
+ } else {
+ if (bootverbose) {
+ device_print_prettyname(dev);
+ printf("not probed (disabled)\n");
+ }
+ }
+
+ return (error);
+}
+
+int
+device_detach(device_t dev)
+{
+ int error;
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return (EBUSY);
+ if (dev->state != DS_ATTACHED)
+ return (0);
+
+ if ((error = DEVICE_DETACH(dev)) != 0)
+ return (error);
+ device_printf(dev, "detached\n");
+ if (dev->parent)
+ BUS_CHILD_DETACHED(dev->parent, dev);
+
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+
+ dev->state = DS_NOTPRESENT;
+ device_set_driver(dev, NULL);
+
+ return (0);
+}
+
+int
+device_shutdown(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ return (0);
+ return (DEVICE_SHUTDOWN(dev));
+}
+
+int
+device_set_unit(device_t dev, int unit)
+{
+ devclass_t dc;
+ int err;
+
+ dc = device_get_devclass(dev);
+ if (unit < dc->maxunit && dc->devices[unit])
+ return (EBUSY);
+ err = devclass_delete_device(dc, dev);
+ if (err)
+ return (err);
+ dev->unit = unit;
+ err = devclass_add_device(dc, dev);
+ if (err)
+ return (err);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/*======================================*/
+/*
+ * Some useful method implementations to make life easier for bus drivers.
+ */
+
+void
+resource_list_init(struct resource_list *rl)
+{
+ SLIST_INIT(rl);
+}
+
+void
+resource_list_free(struct resource_list *rl)
+{
+ struct resource_list_entry *rle;
+
+ while ((rle = SLIST_FIRST(rl)) != NULL) {
+ if (rle->res)
+ panic("resource_list_free: resource entry is busy");
+ SLIST_REMOVE_HEAD(rl, link);
+ free(rle, M_BUS);
+ }
+}
+
+int
+resource_list_add_next(struct resource_list *rl, int type,
+ u_long start, u_long end, u_long count)
+{
+ int rid;
+
+ rid = 0;
+ while (resource_list_find(rl, type, rid)) rid++;
+ resource_list_add(rl, type, rid, start, end, count);
+
+ return (rid);
+}
+
+void
+resource_list_add(struct resource_list *rl, int type, int rid,
+ u_long start, u_long end, u_long count)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle) {
+ rle = malloc(sizeof(struct resource_list_entry), M_BUS,
+ M_NOWAIT);
+ if (!rle)
+ panic("resource_list_add: can't record entry");
+ SLIST_INSERT_HEAD(rl, rle, link);
+ rle->type = type;
+ rle->rid = rid;
+ rle->res = NULL;
+ }
+
+ if (rle->res)
+ panic("resource_list_add: resource entry is busy");
+
+ rle->start = start;
+ rle->end = end;
+ rle->count = count;
+}
+
+struct resource_list_entry *
+resource_list_find(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ SLIST_FOREACH(rle, rl, link) {
+ if (rle->type == type && rle->rid == rid)
+ return (rle);
+ }
+ return (NULL);
+}
+
+void
+resource_list_delete(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle = resource_list_find(rl, type, rid);
+
+ if (rle) {
+ if (rle->res != NULL)
+ panic("resource_list_delete: resource has not been released");
+ SLIST_REMOVE(rl, rle, resource_list_entry, link);
+ free(rle, M_BUS);
+ }
+}
+
+struct resource *
+resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
+ int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list_entry *rle = 0;
+ int passthrough = (device_get_parent(child) != bus);
+ int isdefault = (start == 0UL && end == ~0UL);
+
+ if (passthrough) {
+ return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags));
+ }
+
+ rle = resource_list_find(rl, type, *rid);
+
+ if (!rle)
+ return (NULL); /* no resource of that type/rid */
+
+ if (rle->res)
+ panic("resource_list_alloc: resource entry is busy");
+
+ if (isdefault) {
+ start = rle->start;
+ count = ulmax(count, rle->count);
+ end = ulmax(rle->end, start + count - 1);
+ }
+
+ rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags);
+
+ /*
+ * Record the new range.
+ */
+ if (rle->res) {
+ rle->start = rman_get_start(rle->res);
+ rle->end = rman_get_end(rle->res);
+ rle->count = count;
+ }
+
+ return (rle->res);
+}
+
+int
+resource_list_release(struct resource_list *rl, device_t bus, device_t child,
+ int type, int rid, struct resource *res)
+{
+ struct resource_list_entry *rle = 0;
+ int passthrough = (device_get_parent(child) != bus);
+ int error;
+
+ if (passthrough) {
+ return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res));
+ }
+
+ rle = resource_list_find(rl, type, rid);
+
+ if (!rle)
+ panic("resource_list_release: can't find resource");
+ if (!rle->res)
+ panic("resource_list_release: resource entry is not busy");
+
+ error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res);
+ if (error)
+ return (error);
+
+ rle->res = NULL;
+ return (0);
+}
+
+int
+resource_list_print_type(struct resource_list *rl, const char *name, int type,
+ const char *format)
+{
+ struct resource_list_entry *rle;
+ int printed, retval;
+
+ printed = 0;
+ retval = 0;
+ /* Yes, this is kinda cheating */
+ SLIST_FOREACH(rle, rl, link) {
+ if (rle->type == type) {
+ if (printed == 0)
+ retval += printf(" %s ", name);
+ else
+ retval += printf(",");
+ printed++;
+ retval += printf(format, rle->start);
+ if (rle->count > 1) {
+ retval += printf("-");
+ retval += printf(format, rle->start +
+ rle->count - 1);
+ }
+ }
+ }
+ return (retval);
+}
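+
+/*
+ * Example (sketch only): a bus driver will typically keep a
+ * resource_list in its per-child ivars and pre-load it with the ranges
+ * the child decodes, so that the bus_generic_rl_* methods below can
+ * allocate from it.  The ivars structure, addresses and the child
+ * device are purely illustrative; the malloc() NULL check is omitted.
+ *
+ *	struct foo_ivars {
+ *		struct resource_list fi_resources;
+ *	};
+ *
+ *	struct foo_ivars *ivi = malloc(sizeof(*ivi), M_DEVBUF,
+ *	    M_NOWAIT | M_ZERO);
+ *	resource_list_init(&ivi->fi_resources);
+ *	resource_list_add(&ivi->fi_resources, SYS_RES_IOPORT, 0,
+ *	    0x300, 0x30f, 16);
+ *	resource_list_add(&ivi->fi_resources, SYS_RES_IRQ, 0, 5, 5, 1);
+ *	device_set_ivars(child, ivi);
+ *
+ * The bus then returns &ivi->fi_resources from its
+ * BUS_GET_RESOURCE_LIST method.
+ */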
+
+/*
+ * Call DEVICE_IDENTIFY for each driver.
+ */
+int
+bus_generic_probe(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+ driverlink_t dl;
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ DEVICE_IDENTIFY(dl->driver, dev);
+ }
+
+ return (0);
+}
+
+int
+bus_generic_attach(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_probe_and_attach(child);
+ }
+
+ return (0);
+}
+
+int
+bus_generic_detach(device_t dev)
+{
+ device_t child;
+ int error;
+
+ if (dev->state != DS_ATTACHED)
+ return (EBUSY);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+bus_generic_shutdown(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_shutdown(child);
+ }
+
+ return (0);
+}
+
+int
+bus_generic_suspend(device_t dev)
+{
+ int error;
+ device_t child, child2;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ error = DEVICE_SUSPEND(child);
+ if (error) {
+ for (child2 = TAILQ_FIRST(&dev->children);
+ child2 && child2 != child;
+ child2 = TAILQ_NEXT(child2, link))
+ DEVICE_RESUME(child2);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+int
+bus_generic_resume(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ DEVICE_RESUME(child);
+ /* if resume fails, there's nothing we can usefully do... */
+ }
+ return (0);
+}
+
+int
+bus_print_child_header (device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_get_desc(child)) {
+ retval += device_printf(child, "<%s>", device_get_desc(child));
+ } else {
+ retval += printf("%s", device_get_nameunit(child));
+ }
+
+ return (retval);
+}
+
+int
+bus_print_child_footer (device_t dev, device_t child)
+{
+ return (printf(" on %s\n", device_get_nameunit(dev)));
+}
+
+int
+bus_generic_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += bus_print_child_footer(dev, child);
+
+ return (retval);
+}
+
+int
+bus_generic_read_ivar(device_t dev, device_t child, int index,
+ uintptr_t * result)
+{
+ return (ENOENT);
+}
+
+int
+bus_generic_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value)
+{
+ return (ENOENT);
+}
+
+struct resource_list *
+bus_generic_get_resource_list (device_t dev, device_t child)
+{
+ return (NULL);
+}
+
+void
+bus_generic_driver_added(device_t dev, driver_t *driver)
+{
+ device_t child;
+
+ DEVICE_IDENTIFY(driver, dev);
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if (child->state == DS_NOTPRESENT)
+ device_probe_and_attach(child);
+ }
+}
+
+int
+bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
+ int flags, driver_intr_t *intr, void *arg, void **cookiep)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
+ intr, arg, cookiep));
+ return (EINVAL);
+}
+
+int
+bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
+ return (EINVAL);
+}
+
+struct resource *
+bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ u_long start, u_long end, u_long count, u_int flags)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
+ start, end, count, flags));
+ return (NULL);
+}
+
+int
+bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+int
+bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+int
+bus_generic_deactivate_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+int
+bus_generic_rl_get_resource (device_t dev, device_t child, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ struct resource_list * rl = NULL;
+ struct resource_list_entry * rle = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle)
+ return (ENOENT);
+
+ if (startp)
+ *startp = rle->start;
+ if (countp)
+ *countp = rle->count;
+
+ return (0);
+}
+
+int
+bus_generic_rl_set_resource (device_t dev, device_t child, int type, int rid,
+ u_long start, u_long count)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ resource_list_add(rl, type, rid, start, (start + count - 1), count);
+
+ return (0);
+}
+
+void
+bus_generic_rl_delete_resource (device_t dev, device_t child, int type, int rid)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return;
+
+ resource_list_delete(rl, type, rid);
+
+ return;
+}
+
+int
+bus_generic_rl_release_resource (device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ return (resource_list_release(rl, dev, child, type, rid, r));
+}
+
+struct resource *
+bus_generic_rl_alloc_resource (device_t dev, device_t child, int type,
+ int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (NULL);
+
+ return (resource_list_alloc(rl, dev, child, type, rid,
+ start, end, count, flags));
+}
+
+/*
+ * Some convenience functions to make it easier for drivers to use the
+ * resource-management functions. All these really do is hide the
+ * indirection through the parent's method table, making for slightly
+ * less-wordy code. In the future, it might make sense for this code
+ * to maintain some sort of a list of resources allocated by each device.
+ */
+struct resource *
+bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+{
+ if (dev->parent == 0)
+ return (0);
+ return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
+ count, flags));
+}
+
+int
+bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+int
+bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+int
+bus_release_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+int
+bus_setup_intr(device_t dev, struct resource *r, int flags,
+ driver_intr_t handler, void *arg, void **cookiep)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_SETUP_INTR(dev->parent, dev, r, flags,
+ handler, arg, cookiep));
+}
+
+int
+bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
+}
+
+int
+bus_set_resource(device_t dev, int type, int rid,
+ u_long start, u_long count)
+{
+ return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ start, count));
+}
+
+int
+bus_get_resource(device_t dev, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ startp, countp));
+}
+
+u_long
+bus_get_resource_start(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (start);
+}
+
+u_long
+bus_get_resource_count(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (count);
+}
+
+void
+bus_delete_resource(device_t dev, int type, int rid)
+{
+ BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
+}
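+
+/*
+ * Example (sketch only): a leaf driver's attach routine built on the
+ * wrappers above.  The softc layout, rid values, interrupt handler and
+ * flags are hypothetical and error unwinding is abbreviated.
+ *
+ *	static int
+ *	foo_attach(device_t dev)
+ *	{
+ *		struct foo_softc *sc = device_get_softc(dev);
+ *		int rid;
+ *
+ *		rid = 0;
+ *		sc->sc_mem = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
+ *		    0, ~0, 1, RF_ACTIVE);
+ *		rid = 0;
+ *		sc->sc_irq = bus_alloc_resource(dev, SYS_RES_IRQ, &rid,
+ *		    0, ~0, 1, RF_ACTIVE | RF_SHAREABLE);
+ *		if (sc->sc_mem == NULL || sc->sc_irq == NULL)
+ *			return (ENXIO);
+ *		return (bus_setup_intr(dev, sc->sc_irq, INTR_TYPE_MISC,
+ *		    foo_intr, sc, &sc->sc_ih));
+ *	}
+ */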
+
+static int
+root_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += printf("\n");
+
+ return (retval);
+}
+
+static int
+root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg,
+ void **cookiep)
+{
+ /*
+ * If an interrupt mapping gets to here something bad has happened.
+ */
+ panic("root_setup_intr");
+}
+
+static kobj_method_t root_methods[] = {
+ /* Device interface */
+ KOBJMETHOD(device_shutdown, bus_generic_shutdown),
+ KOBJMETHOD(device_suspend, bus_generic_suspend),
+ KOBJMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ KOBJMETHOD(bus_print_child, root_print_child),
+ KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar),
+ KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
+ KOBJMETHOD(bus_setup_intr, root_setup_intr),
+
+ { 0, 0 }
+};
+
+static driver_t root_driver = {
+ "root",
+ root_methods,
+ 1, /* no softc */
+};
+
+device_t root_bus;
+devclass_t root_devclass;
+
+static int
+root_bus_module_handler(module_t mod, int what, void* arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ TAILQ_INIT(&bus_data_devices);
+ kobj_class_compile((kobj_class_t) &root_driver);
+ root_bus = make_device(NULL, "root", 0);
+ root_bus->desc = "System root bus";
+ kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
+ root_bus->driver = &root_driver;
+ root_bus->state = DS_ATTACHED;
+ root_devclass = devclass_find_internal("root", FALSE);
+ return (0);
+
+ case MOD_SHUTDOWN:
+ device_shutdown(root_bus);
+ return (0);
+ }
+
+ return (0);
+}
+
+static moduledata_t root_bus_mod = {
+ "rootbus",
+ root_bus_module_handler,
+ 0
+};
+DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+
+void
+root_bus_configure(void)
+{
+ device_t dev;
+
+ PDEBUG(("."));
+
+ TAILQ_FOREACH(dev, &root_bus->children, link) {
+ device_probe_and_attach(dev);
+ }
+}
+
+int
+driver_module_handler(module_t mod, int what, void *arg)
+{
+ int error, i;
+ struct driver_module_data *dmd;
+ devclass_t bus_devclass;
+
+ dmd = (struct driver_module_data *)arg;
+ bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE);
+ error = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ if (dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+
+ for (i = 0; !error && i < dmd->dmd_ndrivers; i++) {
+ PDEBUG(("Loading module: driver %s on bus %s",
+ DRIVERNAME(dmd->dmd_drivers[i]), dmd->dmd_busname));
+ error = devclass_add_driver(bus_devclass,
+ dmd->dmd_drivers[i]);
+ }
+ if (error)
+ break;
+
+ /*
+ * The drivers loaded in this way are assumed to all
+ * implement the same devclass.
+ */
+ *dmd->dmd_devclass =
+ devclass_find_internal(dmd->dmd_drivers[0]->name, TRUE);
+ break;
+
+ case MOD_UNLOAD:
+ for (i = 0; !error && i < dmd->dmd_ndrivers; i++) {
+ PDEBUG(("Unloading module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_drivers[i]),
+ dmd->dmd_busname));
+ error = devclass_delete_driver(bus_devclass,
+ dmd->dmd_drivers[i]);
+ }
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+ break;
+ }
+
+ return (error);
+}
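+
+/*
+ * Example (sketch only): drivers do not call driver_module_handler()
+ * directly; the DRIVER_MODULE() macro in <sys/bus.h> generates the
+ * driver_module_data and the module declaration that end up here.  The
+ * driver, method table, devclass and parent bus name are hypothetical.
+ *
+ *	static driver_t foo_driver = {
+ *		"foo",
+ *		foo_methods,
+ *		sizeof(struct foo_softc)
+ *	};
+ *	static devclass_t foo_devclass;
+ *
+ *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, 0, 0);
+ */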
+
+#ifdef BUS_DEBUG
+
+/* the _short versions avoid iteration by not calling anything that prints
+ * more than oneliners. I love oneliners.
+ */
+
+static void
+print_device_short(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
+ dev->unit, dev->desc,
+ (dev->parent? "":"no "),
+ (TAILQ_EMPTY(&dev->children)? "no ":""),
+ (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
+ (dev->flags&DF_FIXEDCLASS? "fixed,":""),
+ (dev->flags&DF_WILDCARD? "wildcard,":""),
+ (dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
+ (dev->ivars? "":"no "),
+ (dev->softc? "":"no "),
+ dev->busy));
+}
+
+static void
+print_device(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ indentprintf(("Parent:\n"));
+ print_device_short(dev->parent, indent+1);
+ indentprintf(("Driver:\n"));
+ print_driver_short(dev->driver, indent+1);
+ indentprintf(("Devclass:\n"));
+ print_devclass_short(dev->devclass, indent+1);
+}
+
+void
+print_device_tree_short(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree_short(child, indent+1);
+ }
+}
+
+void
+print_device_tree(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree(child, indent+1);
+ }
+}
+
+static void
+print_driver_short(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ indentprintf(("driver %s: softc size = %d\n",
+ driver->name, driver->size));
+}
+
+static void
+print_driver(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ print_driver_short(driver, indent);
+}
+
+
+static void
+print_driver_list(driver_list_t drivers, int indent)
+{
+ driverlink_t driver;
+
+ TAILQ_FOREACH(driver, &drivers, link) {
+ print_driver(driver->driver, indent);
+ }
+}
+
+static void
+print_devclass_short(devclass_t dc, int indent)
+{
+ if ( !dc )
+ return;
+
+ indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
+}
+
+static void
+print_devclass(devclass_t dc, int indent)
+{
+ int i;
+
+ if ( !dc )
+ return;
+
+ print_devclass_short(dc, indent);
+ indentprintf(("Drivers:\n"));
+ print_driver_list(dc->drivers, indent+1);
+
+ indentprintf(("Devices:\n"));
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ print_device(dc->devices[i], indent+1);
+}
+
+void
+print_devclass_list_short(void)
+{
+ devclass_t dc;
+
+ printf("Short listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass_short(dc, 0);
+ }
+}
+
+void
+print_devclass_list(void)
+{
+ devclass_t dc;
+
+ printf("Full listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass(dc, 0);
+ }
+}
+
+#endif
+
+/*
+ * User-space access to the device tree.
+ *
+ * We implement a small set of nodes:
+ *
+ * hw.bus Single integer read method to obtain the
+ * current generation count.
+ * hw.bus.devices Reads the entire device tree in flat space.
+ * hw.bus.rman Resource manager interface
+ *
+ * We might like to add the ability to scan devclasses and/or drivers to
+ * determine what else is currently loaded/available.
+ */
+SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
+
+static int
+sysctl_bus(SYSCTL_HANDLER_ARGS)
+{
+ struct u_businfo ubus;
+
+ ubus.ub_version = BUS_USER_VERSION;
+ ubus.ub_generation = bus_data_generation;
+
+ return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
+}
+SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
+ "bus-related data");
+
+static int
+sysctl_devices(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int index;
+ struct device *dev;
+ struct u_device udev; /* XXX this is a bit big */
+ int error;
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+
+ index = name[1];
+
+ /*
+ * Scan the list of devices, looking for the requested index.
+ */
+ TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
+ if (index-- == 0)
+ break;
+ }
+ if (dev == NULL)
+ return (ENOENT);
+
+ /*
+ * Populate the return array.
+ */
+ udev.dv_handle = (uintptr_t)dev;
+ udev.dv_parent = (uintptr_t)dev->parent;
+ if (dev->nameunit == NULL) {
+ udev.dv_name[0] = 0;
+ } else {
+ snprintf(udev.dv_name, 32, "%s", dev->nameunit);
+ }
+ if (dev->desc == NULL) {
+ udev.dv_desc[0] = 0;
+ } else {
+ snprintf(udev.dv_desc, 32, "%s", dev->desc);
+ }
+ if ((dev->driver == NULL) || (dev->driver->name == NULL)) {
+ udev.dv_drivername[0] = 0;
+ } else {
+ snprintf(udev.dv_drivername, 32, "%s", dev->driver->name);
+ }
+ error = SYSCTL_OUT(req, &udev, sizeof(udev));
+ return (error);
+}
+
+SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
+ "system device tree");
+
+/*
+ * Sysctl interface for scanning the resource lists.
+ *
+ * We take two input parameters; the index into the list of resource
+ * managers, and the resource offset into the list.
+ */
+static int
+sysctl_rman(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int rman_idx, res_idx;
+ struct rman *rm;
+ struct resource *res;
+ struct u_rman urm;
+ struct u_resource ures;
+ int error;
+
+ if (namelen != 3)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+ rman_idx = name[1];
+ res_idx = name[2];
+
+ /*
+ * Find the indexed resource manager
+ */
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ if (rman_idx-- == 0)
+ break;
+ }
+ if (rm == NULL)
+ return (ENOENT);
+
+ /*
+ * If the resource index is -1, we want details on the
+ * resource manager.
+ */
+ if (res_idx == -1) {
+ urm.rm_handle = (uintptr_t)rm;
+ snprintf(urm.rm_descr, RM_TEXTLEN, "%s", rm->rm_descr);
+ urm.rm_descr[RM_TEXTLEN - 1] = '\0';
+ urm.rm_start = rm->rm_start;
+ urm.rm_size = rm->rm_end - rm->rm_start + 1;
+ urm.rm_type = rm->rm_type;
+
+ error = SYSCTL_OUT(req, &urm, sizeof(urm));
+ return (error);
+ }
+
+ /*
+ * Find the indexed resource and return it.
+ */
+ TAILQ_FOREACH(res, &rm->rm_list, r_link) {
+ if (res_idx-- == 0) {
+ ures.r_handle = (uintptr_t)res;
+ ures.r_parent = (uintptr_t)res->r_rm;
+ ures.r_device = (uintptr_t)res->r_dev;
+ if (res->r_dev != NULL) {
+ if (device_get_name(res->r_dev) != NULL) {
+ snprintf(ures.r_devname, RM_TEXTLEN,
+ "%s%d",
+ device_get_name(res->r_dev),
+ device_get_unit(res->r_dev));
+ } else {
+ snprintf(ures.r_devname, RM_TEXTLEN,
+ "nomatch");
+ }
+ } else {
+ ures.r_devname[0] = 0;
+ }
+ ures.r_start = res->r_start;
+ ures.r_size = res->r_end - res->r_start + 1;
+ ures.r_flags = res->r_flags;
+
+ error = SYSCTL_OUT(req, &ures, sizeof(ures));
+ return (error);
+ }
+ }
+ return (ENOENT);
+}
+
+SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
+ "kernel resource manager");
+
+int
+bus_data_generation_check(int generation)
+{
+ if (generation != bus_data_generation)
+ return (1);
+
+ /* XXX generate optimised lists here? */
+ return (0);
+}
+
+void
+bus_data_generation_update(void)
+{
+ bus_data_generation++;
+}
diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c
new file mode 100644
index 0000000..78bb231
--- /dev/null
+++ b/sys/kern/subr_clist.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * clist support routines
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/tty.h>
+#include <sys/clist.h>
+
+static void clist_init(void *);
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
+
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc(void);
+static void cblock_alloc_cblocks(int number);
+static void cblock_free(struct cblock *cblockp);
+static void cblock_free_cblocks(int number);
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
+{
+ int cbsize = CBSIZE;
+
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount,
+ cfreecount - cslushcount * cbsize, cslushcount * cbsize);
+}
+#endif /* DDB */
+
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
+ /*
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
+ */
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
+}
+
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static __inline struct cblock *
+cblock_alloc()
+{
+ struct cblock *cblockp;
+
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
+}
+
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static __inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
+{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
+
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
+}
+
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
+{
+ int dcbr;
+
+ /*
+ * Allow for wasted space at the head.
+ */
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
+}
+
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
+void
+clist_free_cblocks(clistp)
+ struct clist *clistp;
+{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
+
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
+}
+
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist. Return -1 if no
+ * more cblocks are available, or 0 for success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
+
+ /*
+ * If this character is quoted, set the quote bit, if not, clear it.
+ */
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
+ return (0);
+}
+
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
+
+ /*
+ * Avoid allocating an initial cblock and then not using it.
+	 * c_cc == 0 must imply c_cbcount == 0.
+ */
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+ * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
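+			/*
+			 * Worked example (illustrative): with NBBY == 8, a
+			 * startbit of 3 gives startmask == 0xff >> 5 == 0x07
+			 * (bits 0-2 preserved) and an endbit of 5 gives
+			 * endmask == 0xff << 6 == 0xc0 (bits 6-7 preserved);
+			 * the quote bits covered by this copy are cleared
+			 * below.
+			 */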
+
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so by adding one (cblock),
+ * (which makes the pointer 1 beyond the end of this
+ * cblock) we prepare for the assignment of 'prev'
+ * above.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
+}
+
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
+char *
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
+{
+ struct cblock *cblockp;
+
+ ++cp;
+ /*
+ * See if the next character is beyond the end of
+ * the clist.
+ */
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((intptr_t)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((intptr_t)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+ *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
+{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
+
+ /*
+ * If there are no more characters on the list, then
+ * free the last cblock.
+ */
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
+void
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
+{
+ int chr, s;
+
+ s = spltty();
+ /*
+	 * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
+
+ /*
+ * XXX This should probably be optimized to more than one
+ * character at a time.
+ */
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
+}
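+
+/*
+ * Usage sketch (illustrative only; the names tp, TTYHOG and buf below are
+ * hypothetical): a tty-style driver typically reserves storage with
+ * clist_alloc_cblocks(), moves data with putc()/b_to_q() and
+ * getc()/q_to_b(), and releases the reservation once the queue has been
+ * drained:
+ *
+ *	s = spltty();
+ *	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
+ *	(void)putc('x' | TTY_QUOTE, &tp->t_rawq);
+ *	n = q_to_b(&tp->t_rawq, buf, sizeof(buf));
+ *	clist_free_cblocks(&tp->t_rawq);
+ *	splx(s);
+ */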
diff --git a/sys/kern/subr_clock.c b/sys/kern/subr_clock.c
new file mode 100644
index 0000000..a79e331
--- /dev/null
+++ b/sys/kern/subr_clock.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures.
+ */
+
+/*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static __inline int leapyear(int year);
+static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS);
+
+#define FEBRUARY 2
+#define days_in_year(y) (leapyear(y) ? 366 : 365)
+#define days_in_month(y, m) \
+ (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define day_of_week(days) (((days) + 4) % 7)
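+/*
+ * Example: day 0 is 1/1/1970, so day_of_week(0) == 4, i.e. Thursday under
+ * the usual 0 == Sunday numbering.
+ */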
+
+static const int month_days[12] = {
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+static device_t clock_dev = NULL;
+static long clock_res;
+
+int adjkerntz; /* local offset from GMT in seconds */
+int disable_rtc_set; /* disable resettodr() if != 0 */
+int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */
+
+/*
+ * These have traditionally been in machdep, but should probably be moved to
+ * kern.
+ */
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+ &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
+
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set,
+ CTLFLAG_RW, &disable_rtc_set, 0, "");
+
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock,
+ CTLFLAG_RW, &wall_cmos_clock, 0, "");
+
+static int
+sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
+ req);
+ if (!error && req->newptr)
+ resettodr();
+ return (error);
+}
+
+/*
+ * This inline avoids some unnecessary modulo operations
+ * as compared with the usual macro:
+ * ( ((year % 4) == 0 &&
+ * (year % 100) != 0) ||
+ * ((year % 400) == 0) )
+ * It is otherwise equivalent.
+ */
+static __inline int
+leapyear(int year)
+{
+ int rv = 0;
+
+ if ((year & 3) == 0) {
+ rv = 1;
+ if ((year % 100) == 0) {
+ rv = 0;
+ if ((year % 400) == 0)
+ rv = 1;
+ }
+ }
+ return (rv);
+}
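+
+/*
+ * Quick check of the logic above: 2004 is a leap year (divisible by 4 but
+ * not by 100), 1900 is not (divisible by 100 but not by 400), and 2000 is
+ * (divisible by 400).
+ */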
+
+int
+clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
+{
+ time_t secs;
+ int i, year, days;
+
+ year = ct->year;
+
+ /* Sanity checks. */
+ if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
+ ct->day > days_in_month(year, ct->mon) ||
+ ct->hour > 23 || ct->min > 59 || ct->sec > 59 ||
+ ct->year > 2037) /* time_t overflow */
+ return (EINVAL);
+
+ /*
+ * Compute days since start of time
+ * First from years, then from months.
+ */
+ days = 0;
+ for (i = POSIX_BASE_YEAR; i < year; i++)
+ days += days_in_year(i);
+
+ /* Months */
+ for (i = 1; i < ct->mon; i++)
+ days += days_in_month(year, i);
+ days += (ct->day - 1);
+
+ /* Another sanity check. */
+ if (ct->dow != -1 && ct->dow != day_of_week(days))
+ return (EINVAL);
+
+ /* Add hours, minutes, seconds. */
+ secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec;
+
+ ts->tv_sec = secs;
+ ts->tv_nsec = ct->nsec;
+ return (0);
+}
+
+void
+clock_ts_to_ct(struct timespec *ts, struct clocktime *ct)
+{
+ int i, year, days;
+ time_t rsec; /* remainder seconds */
+ time_t secs;
+
+ secs = ts->tv_sec;
+ days = secs / SECDAY;
+ rsec = secs % SECDAY;
+
+ ct->dow = day_of_week(days);
+
+	/* Subtract out whole years, counting them in year. */
+ for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++)
+ days -= days_in_year(year);
+ ct->year = year;
+
+ /* Subtract out whole months, counting them in i. */
+ for (i = 1; days >= days_in_month(year, i); i++)
+ days -= days_in_month(year, i);
+ ct->mon = i;
+
+ /* Days are what is left over (+1) from all that. */
+ ct->day = days + 1;
+
+ /* Hours, minutes, seconds are easy */
+ ct->hour = rsec / 3600;
+ rsec = rsec % 3600;
+ ct->min = rsec / 60;
+ rsec = rsec % 60;
+ ct->sec = rsec;
+ ct->nsec = ts->tv_nsec;
+}
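+
+/*
+ * Worked example for the conversions above: 2000-01-01 00:00:00 UTC lies
+ * 10957 days after 1970-01-01 (7 leap years in between), which is
+ * 10957 * 86400 == 946684800 seconds; day_of_week(10957) == 6, a Saturday.
+ */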
+
+void
+clock_register(device_t dev, long res)
+{
+
+ if (clock_dev != NULL) {
+ if (clock_res > res) {
+ if (bootverbose) {
+ device_printf(dev, "not installed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(clock_dev));
+ }
+ return;
+ } else {
+ if (bootverbose) {
+ device_printf(clock_dev, "removed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(dev));
+ }
+ }
+ }
+ clock_dev = dev;
+ clock_res = res;
+ if (bootverbose) {
+ device_printf(dev, "registered as a time-of-day clock "
+ "(resolution %ldus)\n", res);
+ }
+}
+
+/*
+ * inittodr and resettodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and
+ * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the time of day register, based on the time base which is, e.g.
+ * from a filesystem.
+ */
+void
+inittodr(time_t base)
+{
+ struct timespec diff, ref, ts;
+ int error;
+
+ if (base) {
+ ref.tv_sec = base;
+ ref.tv_nsec = 0;
+ tc_setclock(&ref);
+ }
+
+ if (clock_dev == NULL) {
+ printf("warning: no time-of-day clock registered, system time "
+ "will not be set accurately\n");
+ return;
+ }
+ error = CLOCK_GETTIME(clock_dev, &ts);
+ if (error != 0 && error != EINVAL) {
+ printf("warning: clock_gettime failed (%d), the system time "
+ "will not be set accurately\n", error);
+ return;
+ }
+ if (error == EINVAL || ts.tv_sec < 0) {
+ printf("Invalid time in real time clock.\n");
+ printf("Check and reset the date immediately!\n");
+ }
+
+ ts.tv_sec += tz.tz_minuteswest * 60 +
+ (wall_cmos_clock ? adjkerntz : 0);
+
+ if (timespeccmp(&ref, &ts, >)) {
+ diff = ref;
+		timespecsub(&diff, &ts);
+ } else {
+ diff = ts;
+ timespecsub(&diff, &ref);
+ }
+	if (diff.tv_sec >= 2) {
+ /* badly off, adjust it */
+ tc_setclock(&ts);
+ }
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr()
+{
+ struct timespec ts;
+ int error;
+
+ if (disable_rtc_set || clock_dev == NULL)
+ return;
+
+ getnanotime(&ts);
+ ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
+ if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
+ printf("warning: clock_settime failed (%d), time-of-day clock "
+ "not adjusted to system time\n", error);
+ return;
+ }
+}
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
new file mode 100644
index 0000000..dabdf9d
--- /dev/null
+++ b/sys/kern/subr_devstat.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+
+#include <sys/devicestat.h>
+
+static int devstat_num_devs;
+static long devstat_generation;
+static int devstat_version = DEVSTAT_VERSION;
+static int devstat_current_devnumber;
+
+static struct devstatlist device_statq;
+
+/*
+ * Take a malloced and zeroed devstat structure given to us, fill it in
+ * and add it to the queue of devices.
+ */
+void
+devstat_add_entry(struct devstat *ds, const char *dev_name,
+ int unit_number, u_int32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority)
+{
+ struct devstatlist *devstat_head;
+ struct devstat *ds_tmp;
+
+ if (ds == NULL)
+ return;
+
+ if (devstat_num_devs == 0)
+ STAILQ_INIT(&device_statq);
+
+ devstat_generation++;
+ devstat_num_devs++;
+
+ devstat_head = &device_statq;
+
+ /*
+ * Priority sort. Each driver passes in its priority when it adds
+ * its devstat entry. Drivers are sorted first by priority, and
+ * then by probe order.
+ *
+ * For the first device, we just insert it, since the priority
+ * doesn't really matter yet. Subsequent devices are inserted into
+ * the list using the order outlined above.
+ */
+ if (devstat_num_devs == 1)
+ STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
+ else {
+ STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
+ struct devstat *ds_next;
+
+ ds_next = STAILQ_NEXT(ds_tmp, dev_links);
+
+ /*
+ * If we find a break between higher and lower
+ * priority items, and if this item fits in the
+ * break, insert it. This also applies if the
+ * "lower priority item" is the end of the list.
+ */
+ if ((priority <= ds_tmp->priority)
+ && ((ds_next == NULL)
+ || (priority > ds_next->priority))) {
+ STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
+ dev_links);
+ break;
+ } else if (priority > ds_tmp->priority) {
+ /*
+ * If this is the case, we should be able
+ * to insert ourselves at the head of the
+ * list. If we can't, something is wrong.
+ */
+ if (ds_tmp == STAILQ_FIRST(devstat_head)) {
+ STAILQ_INSERT_HEAD(devstat_head,
+ ds, dev_links);
+ break;
+ } else {
+ STAILQ_INSERT_TAIL(devstat_head,
+ ds, dev_links);
+ printf("devstat_add_entry: HELP! "
+ "sorting problem detected "
+ "for %s%d\n", dev_name,
+ unit_number);
+ break;
+ }
+ }
+ }
+ }
+
+ ds->device_number = devstat_current_devnumber++;
+ ds->unit_number = unit_number;
+ strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
+ ds->device_name[DEVSTAT_NAME_LEN - 1] = '\0';
+ ds->block_size = block_size;
+ ds->flags = flags;
+ ds->device_type = device_type;
+ ds->priority = priority;
+ getmicrotime(&ds->dev_creation_time);
+}
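+
+/*
+ * Example of the resulting order (illustrative device names): if da0 and
+ * da1 register with DEVSTAT_PRIORITY_DISK and md0 later registers with the
+ * numerically lower DEVSTAT_PRIORITY_OTHER, the list reads da0, da1, md0:
+ * higher priority first, probe order within equal priorities.
+ */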
+
+/*
+ * Remove a devstat structure from the list of devices.
+ */
+void
+devstat_remove_entry(struct devstat *ds)
+{
+ struct devstatlist *devstat_head;
+
+ if (ds == NULL)
+ return;
+
+ devstat_generation++;
+ devstat_num_devs--;
+
+ devstat_head = &device_statq;
+
+ /* Remove this entry from the devstat queue */
+ STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
+}
+
+/*
+ * Record a transaction start.
+ */
+void
+devstat_start_transaction(struct devstat *ds)
+{
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ /*
+ * We only want to set the start time when we are going from idle
+ * to busy. The start time is really the start of the latest busy
+ * period.
+ */
+ if (ds->busy_count == 0)
+ getmicrouptime(&ds->start_time);
+ ds->busy_count++;
+}
+
+/*
+ * Record the ending of a transaction, and increment the various counters.
+ */
+void
+devstat_end_transaction(struct devstat *ds, u_int32_t bytes,
+ devstat_tag_type tag_type, devstat_trans_flags flags)
+{
+ struct timeval busy_time;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ getmicrouptime(&ds->last_comp_time);
+ ds->busy_count--;
+
+ /*
+ * There might be some transactions (DEVSTAT_NO_DATA) that don't
+ * transfer any data.
+ */
+ if (flags == DEVSTAT_READ) {
+ ds->bytes_read += bytes;
+ ds->num_reads++;
+ } else if (flags == DEVSTAT_WRITE) {
+ ds->bytes_written += bytes;
+ ds->num_writes++;
+ } else if (flags == DEVSTAT_FREE) {
+ ds->bytes_freed += bytes;
+ ds->num_frees++;
+ } else
+ ds->num_other++;
+
+ /*
+ * Keep a count of the various tag types sent.
+ */
+ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
+ tag_type != DEVSTAT_TAG_NONE)
+ ds->tag_types[tag_type]++;
+
+ /*
+ * We only update the busy time when we go idle. Otherwise, this
+ * calculation would require many more clock cycles.
+ */
+ if (ds->busy_count == 0) {
+ /* Calculate how long we were busy */
+ busy_time = ds->last_comp_time;
+ timevalsub(&busy_time, &ds->start_time);
+
+ /* Add our busy time to the total busy time. */
+ timevaladd(&ds->busy_time, &busy_time);
+ } else if (ds->busy_count < 0)
+ printf("devstat_end_transaction: HELP!! busy_count "
+ "for %s%d is < 0 (%d)!\n", ds->device_name,
+ ds->unit_number, ds->busy_count);
+}
+
+void
+devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
+{
+ devstat_trans_flags flg;
+
+ if (bp->bio_cmd == BIO_DELETE)
+ flg = DEVSTAT_FREE;
+ else if (bp->bio_cmd == BIO_READ)
+ flg = DEVSTAT_READ;
+ else
+ flg = DEVSTAT_WRITE;
+
+ devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
+ DEVSTAT_TAG_SIMPLE, flg);
+}
+
+/*
+ * This is the sysctl handler for the devstat package. The data pushed out
+ * on the kern.devstat.all sysctl variable consists of the current devstat
+ * generation number, and then an array of devstat structures, one for each
+ * device in the system.
+ *
+ * I'm really not too fond of this method of doing things, but there really
+ * aren't that many alternatives. We must have some method of making sure
+ * that the generation number the user gets corresponds with the data the
+ * user gets. If the user makes a separate sysctl call to get the
+ * generation, and then a sysctl call to get the device statistics, the
+ * device list could have changed in that brief period of time. By
+ * supplying the generation number along with the statistics output, we can
+ * guarantee that the generation number and the statistics match up.
+ */
+static int
+sysctl_devstat(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ struct devstat *nds;
+ struct devstatlist *devstat_head;
+
+ if (devstat_num_devs == 0)
+ return(EINVAL);
+
+ error = 0;
+ devstat_head = &device_statq;
+
+ /*
+ * First push out the generation number.
+ */
+ error = SYSCTL_OUT(req, &devstat_generation, sizeof(long));
+
+ /*
+ * Now push out all the devices.
+ */
+ for (i = 0, nds = STAILQ_FIRST(devstat_head);
+ (nds != NULL) && (i < devstat_num_devs) && (error == 0);
+ nds = STAILQ_NEXT(nds, dev_links), i++)
+ error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
+
+ return(error);
+}
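+
+/*
+ * Userland sketch (illustrative): a consumer sizes its buffer from
+ * kern.devstat.numdevs, reads kern.devstat.all in a single sysctl call,
+ * and takes the generation from the first sizeof(long) bytes of that
+ * buffer, so the statistics and the generation number always match.
+ */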
+
+/*
+ * Sysctl entries for devstat. The first one is a node that all the rest
+ * hang off of.
+ */
+SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics");
+
+SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
+ 0, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
+/*
+ * Export the number of devices in the system so that userland utilities
+ * can determine how much memory to allocate to hold all the devices.
+ */
+SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
+ &devstat_num_devs, 0, "Number of devices in the devstat list");
+SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
+ &devstat_generation, 0, "Devstat list generation");
+SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
+ &devstat_version, 0, "Devstat list version number");
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
new file mode 100644
index 0000000..1982e7f
--- /dev/null
+++ b/sys/kern/subr_disk.c
@@ -0,0 +1,434 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ */
+
+#include "opt_geom.h"
+#ifndef GEOM
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <machine/md_var.h>
+#include <sys/ctype.h>
+
+static MALLOC_DEFINE(M_DISK, "disk", "disk data");
+
+static d_strategy_t diskstrategy;
+static d_open_t diskopen;
+static d_close_t diskclose;
+static d_ioctl_t diskioctl;
+static d_psize_t diskpsize;
+
+static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
+
+void disk_dev_synth(dev_t dev);
+
+void
+disk_dev_synth(dev_t dev)
+{
+ struct disk *dp;
+ int u, s, p;
+ dev_t pdev;
+
+ if (dksparebits(dev))
+ return;
+ LIST_FOREACH(dp, &disklist, d_list) {
+ if (major(dev) != dp->d_devsw->d_maj)
+ continue;
+ u = dkunit(dev);
+ p = RAW_PART;
+ s = WHOLE_DISK_SLICE;
+ pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
+ if (pdev->si_devsw == NULL)
+ return; /* Probably a unit we don't have */
+ s = dkslice(dev);
+ p = dkpart(dev);
+ if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
+ /* XXX: actually should not happen */
+ dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%d",
+ dp->d_devsw->d_name, u);
+ dev_depends(pdev, dev);
+ return;
+ }
+ if (s == COMPATIBILITY_SLICE) {
+ dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
+ dp->d_devsw->d_name, u, 'a' + p);
+ dev_depends(pdev, dev);
+ return;
+ }
+ if (p != RAW_PART) {
+ dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
+ dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
+ 'a' + p);
+ } else {
+ dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
+ dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
+ make_dev_alias(dev, "%s%ds%dc",
+ dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
+ }
+ dev_depends(pdev, dev);
+ return;
+ }
+}
+
+static void
+disk_clone(void *arg, char *name, int namelen, dev_t *dev)
+{
+ struct disk *dp;
+ char const *d;
+ char *e;
+ int j, u, s, p;
+ dev_t pdev;
+
+ if (*dev != NODEV)
+ return;
+
+ LIST_FOREACH(dp, &disklist, d_list) {
+ d = dp->d_devsw->d_name;
+ j = dev_stdclone(name, &e, d, &u);
+ if (j == 0)
+ continue;
+ if (u > DKMAXUNIT)
+ continue;
+ p = RAW_PART;
+ s = WHOLE_DISK_SLICE;
+ pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
+ if (pdev->si_disk == NULL)
+ continue;
+ if (*e != '\0') {
+ j = dev_stdclone(e, &e, "s", &s);
+ if (j == 0)
+ s = COMPATIBILITY_SLICE;
+ else if (j == 1 || j == 2)
+ s += BASE_SLICE - 1;
+ if (!*e)
+ ; /* ad0s1 case */
+ else if (e[1] != '\0')
+ return; /* can never be a disk name */
+ else if (*e < 'a' || *e > 'h')
+ return; /* can never be a disk name */
+ else
+ p = *e - 'a';
+ }
+ if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
+ return;
+ } else if (s >= BASE_SLICE && p != RAW_PART) {
+ *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
+ pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
+ p + 'a');
+ } else if (s >= BASE_SLICE) {
+ *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
+ pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
+ make_dev_alias(*dev, "%s%ds%dc",
+ pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
+ } else {
+ *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
+ pdev->si_devsw->d_name, u, p + 'a');
+ }
+ dev_depends(pdev, *dev);
+ return;
+ }
+}
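+
+/*
+ * Examples of the names parsed above ("ad" is only an illustrative driver
+ * name): "ad0" is the pre-existing whole-disk node and is left alone,
+ * "ad0a" names a compatibility-slice partition, "ad0s1" a whole slice, and
+ * "ad0s1a" partition 'a' within slice 1.
+ */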
+
+static void
+inherit_raw(dev_t pdev, dev_t dev)
+{
+ dev->si_disk = pdev->si_disk;
+ dev->si_drv1 = pdev->si_drv1;
+ dev->si_drv2 = pdev->si_drv2;
+ dev->si_iosize_max = pdev->si_iosize_max;
+ dev->si_bsize_phys = pdev->si_bsize_phys;
+ dev->si_bsize_best = pdev->si_bsize_best;
+}
+
+dev_t
+disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
+{
+ static int once;
+ dev_t dev;
+
+ if (!once) {
+ EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
+ once++;
+ }
+
+ bzero(dp, sizeof(*dp));
+
+ if (proto->d_open != diskopen) {
+ *proto = *cdevsw;
+ proto->d_open = diskopen;
+ proto->d_close = diskclose;
+ proto->d_ioctl = diskioctl;
+ proto->d_strategy = diskstrategy;
+ proto->d_psize = diskpsize;
+ }
+
+ if (bootverbose)
+ printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
+ dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
+ UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);
+
+ dev->si_disk = dp;
+ dp->d_dev = dev;
+ dp->d_dsflags = flags;
+ dp->d_devsw = cdevsw;
+ LIST_INSERT_HEAD(&disklist, dp, d_list);
+
+ return (dev);
+}
+
+static int
+diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
+{
+ struct dumperinfo di;
+ struct disklabel *dl;
+
+ if (!onoff)
+ return(set_dumper(NULL));
+ dl = dsgetlabel(dev, dp->d_slice);
+ if (!dl)
+ return (ENXIO);
+ bzero(&di, sizeof di);
+ di.dumper = (dumper_t *)dp->d_devsw->d_dump;
+ di.priv = dp->d_dev;
+ di.blocksize = dl->d_secsize;
+ di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
+ dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
+ di.mediasize =
+ (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
+ return(set_dumper(&di));
+}
+
+void
+disk_invalidate (struct disk *disk)
+{
+ if (disk->d_slice)
+ dsgone(&disk->d_slice);
+}
+
+void
+disk_destroy(dev_t dev)
+{
+ LIST_REMOVE(dev->si_disk, d_list);
+ bzero(dev->si_disk, sizeof(*dev->si_disk));
+ dev->si_disk = NULL;
+ destroy_dev(dev);
+ return;
+}
+
+struct disk *
+disk_enumerate(struct disk *disk)
+{
+ if (!disk)
+ return (LIST_FIRST(&disklist));
+ else
+ return (LIST_NEXT(disk, d_list));
+}
+
+static int
+sysctl_disks(SYSCTL_HANDLER_ARGS)
+{
+ struct disk *disk;
+ int error, first;
+
+ disk = NULL;
+ first = 1;
+
+ while ((disk = disk_enumerate(disk))) {
+ if (!first) {
+ error = SYSCTL_OUT(req, " ", 1);
+ if (error)
+ return error;
+ } else {
+ first = 0;
+ }
+ error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
+ if (error)
+ return error;
+ }
+ error = SYSCTL_OUT(req, "", 1);
+ return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, NULL,
+ sysctl_disks, "A", "names of available disks");
+
+/*
+ * The cdevsw functions
+ */
+
+static int
+diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
+{
+ dev_t pdev;
+ struct disk *dp;
+ int error;
+
+ error = 0;
+ pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+
+ dp = pdev->si_disk;
+ if (!dp)
+ return (ENXIO);
+
+ while (dp->d_flags & DISKFLAG_LOCK) {
+ dp->d_flags |= DISKFLAG_WANTED;
+ error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
+ if (error)
+ return (error);
+ }
+ dp->d_flags |= DISKFLAG_LOCK;
+
+ if (!dsisopen(dp->d_slice)) {
+ if (!pdev->si_iosize_max)
+ pdev->si_iosize_max = dev->si_iosize_max;
+ error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
+ }
+
+ /* Inherit properties from the whole/raw dev_t */
+ inherit_raw(pdev, dev);
+
+ if (error)
+ goto out;
+
+ error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, &dp->d_label);
+
+ if (!dsisopen(dp->d_slice))
+ dp->d_devsw->d_close(pdev, oflags, devtype, td);
+out:
+ dp->d_flags &= ~DISKFLAG_LOCK;
+ if (dp->d_flags & DISKFLAG_WANTED) {
+ dp->d_flags &= ~DISKFLAG_WANTED;
+ wakeup(dp);
+ }
+
+ return(error);
+}
+
+static int
+diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
+{
+ struct disk *dp;
+ int error;
+ dev_t pdev;
+
+ error = 0;
+ pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ dp = pdev->si_disk;
+ if (!dp)
+ return (ENXIO);
+ dsclose(dev, devtype, dp->d_slice);
+ if (!dsisopen(dp->d_slice))
+ error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
+ return (error);
+}
+
+static void
+diskstrategy(struct bio *bp)
+{
+ dev_t pdev;
+ struct disk *dp;
+
+ pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
+ dp = pdev->si_disk;
+ bp->bio_resid = bp->bio_bcount;
+ if (dp != bp->bio_dev->si_disk)
+ inherit_raw(pdev, bp->bio_dev);
+
+ if (!dp) {
+ biofinish(bp, NULL, ENXIO);
+ return;
+ }
+
+ if (dscheck(bp, dp->d_slice) <= 0) {
+ biodone(bp);
+ return;
+ }
+
+ if (bp->bio_bcount == 0) {
+ biodone(bp);
+ return;
+ }
+
+ KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
+ KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
+ dp->d_devsw->d_strategy(bp);
+ return;
+
+}
+
+static int
+diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct disk *dp;
+ int error;
+ u_int u;
+ dev_t pdev;
+
+ pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ dp = pdev->si_disk;
+ if (!dp)
+ return (ENXIO);
+ if (cmd == DIOCSKERNELDUMP) {
+ u = *(u_int *)data;
+ return (diskdumpconf(u, dev, dp));
+ }
+ if (cmd == DIOCGFRONTSTUFF) {
+		*(off_t *)data = 8192;	/* XXX: crude but enough */
+ return (0);
+ }
+ error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
+ if (error == ENOIOCTL)
+ error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
+ return (error);
+}
+
+static int
+diskpsize(dev_t dev)
+{
+ struct disk *dp;
+ dev_t pdev;
+
+ pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ dp = pdev->si_disk;
+ if (!dp)
+ return (-1);
+ if (dp != dev->si_disk) {
+ dev->si_drv1 = pdev->si_drv1;
+ dev->si_drv2 = pdev->si_drv2;
+ /* XXX: don't set bp->b_dev->si_disk (?) */
+ }
+ return (dssize(dev, &dp->d_slice));
+}
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
+ 0, sizeof(struct disklabel), "sizeof(struct disklabel)");
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
+ 0, sizeof(struct diskslices), "sizeof(struct diskslices)");
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
+ 0, sizeof(struct disk), "sizeof(struct disk)");
+
+#endif
diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c
new file mode 100644
index 0000000..e149687
--- /dev/null
+++ b/sys/kern/subr_disklabel.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/stdint.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/syslog.h>
+#include <machine/atomic.h>
+
+#ifdef notquite
+/*
+ * Mutex to use when delaying niced I/O bound processes in bioqdisksort().
+ */
+static struct mtx dksort_mtx;
+static void
+dksort_init(void)
+{
+
+ mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF);
+}
+SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL)
+#endif
+
+/*
+ * Seek sort for disks.
+ *
+ * The buf_queue keeps two queues, sorted in ascending block order. The first
+ * queue holds those requests which are positioned after the current block
+ * (in the first request); the second, which starts at queue->switch_point,
+ * holds requests which came in after their block number was passed. Thus
+ * we implement a one way scan, retracting after reaching the end of the drive
+ * to the first request on the second queue, at which time it becomes the
+ * first queue.
+ *
+ * A one-way scan is natural because of the way UNIX read-ahead blocks are
+ * allocated.
+ */
+
+void
+bioqdisksort(bioq, bp)
+ struct bio_queue_head *bioq;
+ struct bio *bp;
+{
+ struct bio *bq;
+ struct bio *bn;
+ struct bio *be;
+
+#ifdef notquite
+ struct thread *td = curthread;
+
+ if (td && td->td_ksegrp->kg_nice > 0) {
+ TAILQ_FOREACH(bn, &bioq->queue, bio_queue)
+ if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp)
+ break;
+ if (bn != NULL) {
+ mtx_lock(&dksort_mtx);
+ msleep((caddr_t)&dksort_mtx, &dksort_mtx,
+ PPAUSE | PCATCH | PDROP, "ioslow",
+ td->td_ksegrp->kg_nice);
+ }
+ }
+#endif
+ if (!atomic_cmpset_int(&bioq->busy, 0, 1))
+ panic("Recursing in bioqdisksort()");
+ be = TAILQ_LAST(&bioq->queue, bio_queue);
+ /*
+ * If the queue is empty or we are an
+ * ordered transaction, then it's easy.
+ */
+ if ((bq = bioq_first(bioq)) == NULL) {
+ bioq_insert_tail(bioq, bp);
+ bioq->busy = 0;
+ return;
+ } else if (bioq->insert_point != NULL) {
+
+ /*
+ * A certain portion of the list is
+ * "locked" to preserve ordering, so
+ * we can only insert after the insert
+ * point.
+ */
+ bq = bioq->insert_point;
+ } else {
+
+ /*
+ * If we lie before the last removed (currently active)
+ * request, and are not inserting ourselves into the
+ * "locked" portion of the list, then we must add ourselves
+ * to the second request list.
+ */
+ if (bp->bio_pblkno < bioq->last_pblkno) {
+
+ bq = bioq->switch_point;
+ /*
+ * If we are starting a new secondary list,
+ * then it's easy.
+ */
+ if (bq == NULL) {
+ bioq->switch_point = bp;
+ bioq_insert_tail(bioq, bp);
+ bioq->busy = 0;
+ return;
+ }
+ /*
+ * If we lie ahead of the current switch point,
+ * insert us before the switch point and move
+ * the switch point.
+ */
+ if (bp->bio_pblkno < bq->bio_pblkno) {
+ bioq->switch_point = bp;
+ TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
+ bioq->busy = 0;
+ return;
+ }
+ } else {
+ if (bioq->switch_point != NULL)
+ be = TAILQ_PREV(bioq->switch_point,
+ bio_queue, bio_queue);
+ /*
+ * If we lie between last_pblkno and bq,
+ * insert before bq.
+ */
+ if (bp->bio_pblkno < bq->bio_pblkno) {
+ TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
+ bioq->busy = 0;
+ return;
+ }
+ }
+ }
+
+ /*
+ * Request is at/after our current position in the list.
+ * Optimize for sequential I/O by seeing if we go at the tail.
+ */
+ if (bp->bio_pblkno > be->bio_pblkno) {
+ TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
+ bioq->busy = 0;
+ return;
+ }
+
+ /* Otherwise, insertion sort */
+ while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {
+
+ /*
+ * We want to go after the current request if it is the end
+ * of the first request list, or if the next request is a
+ * larger cylinder than our request.
+ */
+ if (bn == bioq->switch_point
+ || bp->bio_pblkno < bn->bio_pblkno)
+ break;
+ bq = bn;
+ }
+ TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
+ bioq->busy = 0;
+}
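+
+/*
+ * Example (illustrative block numbers): with the head last positioned at
+ * block 100 and requests for blocks 120 and 150 queued, a new request for
+ * block 130 is insertion-sorted between them, while a request for block 50
+ * has already been passed and starts (or joins) the second queue at
+ * switch_point, to be served on the next sweep.
+ */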
+
+
+/*
+ * Attempt to read a disk label from a device using the indicated strategy
+ * routine. The label must be partly set up before this: secpercyl, secsize
+ * and anything required in the strategy routine (e.g., dummy bounds for the
+ * partition containing the label) must be filled in before calling us.
+ * Returns NULL on success and an error string on failure.
+ */
+char *
+readdisklabel(dev, lp)
+ dev_t dev;
+ register struct disklabel *lp;
+{
+ register struct buf *bp;
+ struct disklabel *dlp;
+ char *msg = NULL;
+
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_iocmd = BIO_READ;
+ DEV_STRATEGY(bp, 1);
+ if (bufwait(bp))
+ msg = "I/O error";
+ else if (bp->b_resid != 0)
+ msg = "disk too small for a label";
+ else for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)((char *)bp->b_data +
+ lp->d_secsize - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) {
+ if (msg == NULL)
+ msg = "no disk label";
+ } else if (dlp->d_npartitions > MAXPARTITIONS ||
+ dkcksum(dlp) != 0)
+ msg = "disk label corrupted";
+ else {
+ *lp = *dlp;
+ msg = NULL;
+ break;
+ }
+ }
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (msg);
+}
+
+/*
+ * Check new disk label for sensibility before setting it.
+ */
+int
+setdisklabel(olp, nlp, openmask)
+ register struct disklabel *olp, *nlp;
+ u_long openmask;
+{
+ register int i;
+ register struct partition *opp, *npp;
+
+ /*
+ * Check it is actually a disklabel we are looking at.
+ */
+ if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
+ dkcksum(nlp) != 0)
+ return (EINVAL);
+ /*
+ * For each partition that we think is open,
+ */
+ while ((i = ffs((long)openmask)) != 0) {
+ i--;
+ /*
+ * Check it is not changing....
+ */
+ openmask &= ~(1 << i);
+ if (nlp->d_npartitions <= i)
+ return (EBUSY);
+ opp = &olp->d_partitions[i];
+ npp = &nlp->d_partitions[i];
+ if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size)
+ return (EBUSY);
+ /*
+ * Copy internally-set partition information
+ * if new label doesn't include it. XXX
+ * (If we are using it then we had better stay the same type)
+ * This is possibly dubious, as someone else noted (XXX)
+ */
+ if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
+ npp->p_fstype = opp->p_fstype;
+ npp->p_fsize = opp->p_fsize;
+ npp->p_frag = opp->p_frag;
+ npp->p_cpg = opp->p_cpg;
+ }
+ }
+ nlp->d_checksum = 0;
+ nlp->d_checksum = dkcksum(nlp);
+ *olp = *nlp;
+ return (0);
+}
+
+/*
+ * Write disk label back to device after modification.
+ */
+int
+writedisklabel(dev, lp)
+ dev_t dev;
+ register struct disklabel *lp;
+{
+ struct buf *bp;
+ struct disklabel *dlp;
+ int error = 0;
+
+ if (lp->d_partitions[RAW_PART].p_offset != 0)
+ return (EXDEV); /* not quite right */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dev, RAW_PART);
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+#if 1
+ /*
+ * We read the label first to see if it's there,
+ * in which case we will put ours at the same offset into the block..
+ * (I think this is stupid [Julian])
+ * Note that you can't write a label out over a corrupted label!
+ * (also stupid.. how do you write the first one? by raw writes?)
+ */
+ bp->b_flags &= ~B_INVAL;
+ bp->b_iocmd = BIO_READ;
+ DEV_STRATEGY(bp, 1);
+ error = bufwait(bp);
+ if (error)
+ goto done;
+ if (bp->b_resid != 0) {
+ error = ENOSPC;
+ goto done;
+ }
+ for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)
+ ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC &&
+ dkcksum(dlp) == 0) {
+ *dlp = *lp;
+ bp->b_flags &= ~B_DONE;
+ bp->b_iocmd = BIO_WRITE;
+#ifdef __alpha__
+ alpha_fix_srm_checksum(bp);
+#endif
+ DEV_STRATEGY(bp, 1);
+ error = bufwait(bp);
+ goto done;
+ }
+ }
+ error = ESRCH;
+done:
+#else
+ bzero(bp->b_data, lp->d_secsize);
+ dlp = (struct disklabel *)bp->b_data;
+ *dlp = *lp;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_iocmd = BIO_WRITE;
+ DEV_STRATEGY(bp, 1);
+ error = bufwait(bp);
+#endif
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (error);
+}
+
+/*
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+
+hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
+
+ * if the offset of the error in the transfer and a disk label
+ * are both available. blkdone should be -1 if the position of the error
+ * is unknown; the disklabel pointer may be null from drivers that have not
+ * been converted to use them. The message is printed with printf.
+ * The message should be completed with at least a newline. There is no
+ * trailing space.
+ */
+void
+diskerr(bp, what, blkdone, lp)
+ struct bio *bp;
+ char *what;
+ int blkdone;
+ register struct disklabel *lp;
+{
+ int part = dkpart(bp->bio_dev);
+ char partname[2];
+ char *sname;
+ daddr_t sn;
+
+ *partname = '\0';
+ sname = bp->bio_dev->si_name;
+ printf("%s%s: %s %sing fsbn ", sname, partname, what,
+ bp->bio_cmd == BIO_READ ? "read" : "writ");
+ sn = bp->bio_blkno;
+ if (bp->bio_bcount <= DEV_BSIZE)
+ printf("%jd", (intmax_t)sn);
+ else {
+ if (blkdone >= 0) {
+ sn += blkdone;
+ printf("%jd of ", (intmax_t)sn);
+ }
+ printf("%ld-%ld", (long)bp->bio_blkno,
+ (long)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
+ }
+ if (lp && (blkdone >= 0 || bp->bio_bcount <= lp->d_secsize)) {
+ sn += lp->d_partitions[part].p_offset;
+ /*
+ * XXX should add slice offset and not print the slice,
+ * but we don't know the slice pointer.
+ * XXX should print bp->b_pblkno so that this will work
+ * independent of slices, labels and bad sector remapping,
+ * but some drivers don't set bp->b_pblkno.
+ */
+ printf(" (%s bn %jd; cn %jd", sname, (intmax_t)sn,
+ (intmax_t)(sn / lp->d_secpercyl));
+ sn %= lp->d_secpercyl;
+ printf(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors),
+ (long)(sn % lp->d_nsectors));
+ }
+}
diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c
new file mode 100644
index 0000000..40d5b2d
--- /dev/null
+++ b/sys/kern/subr_diskmbr.c
@@ -0,0 +1,544 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#ifdef PC98
+#define PC98_ATCOMPAT
+#define dsinit atcompat_dsinit
+#endif
+#include <sys/disklabel.h>
+#define DOSPTYP_EXTENDED 5
+#define DOSPTYP_EXTENDEDX 15
+#define DOSPTYP_ONTRACK 84
+#include <sys/diskslice.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+
+#define TRACE(str) do { if (dsi_debug) printf str; } while (0)
+
+static volatile u_char dsi_debug;
+
+/*
+ * This is what we have embedded in every boot1 for supporting the bogus
+ * "Dangerously Dedicated" mode. However, the old table is broken because
+ * it has an illegal geometry in it - it specifies 256 heads (heads = end
+ * head + 1) which causes nasty stuff when that wraps to zero in bios code.
+ * eg: divide by zero etc. This caused the dead-thinkpad problem, numerous
+ * SCSI bios crashes, EFI to crash, etc.
+ *
+ * We still have to recognize the old table though, even though we stopped
+ * inflicting it upon the world.
+ */
+static struct dos_partition historical_bogus_partition_table[NDOSPART] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, },
+};
+static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, },
+};
+
+static int check_part(char *sname, struct dos_partition *dp,
+ u_long offset, int nsectors, int ntracks,
+ u_long mbr_offset);
+static void mbr_extended(dev_t dev, struct disklabel *lp,
+ struct diskslices *ssp, u_long ext_offset,
+ u_long ext_size, u_long base_ext_offset,
+ int nsectors, int ntracks, u_long mbr_offset,
+ int level);
+static int mbr_setslice(char *sname, struct disklabel *lp,
+ struct diskslice *sp, struct dos_partition *dp,
+ u_long br_offset);
+
+static int
+check_part(sname, dp, offset, nsectors, ntracks, mbr_offset )
+ char *sname;
+ struct dos_partition *dp;
+ u_long offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+{
+ int chs_ecyl;
+ int chs_esect;
+ int chs_scyl;
+ int chs_ssect;
+ int error;
+ u_long esector;
+ u_long esector1;
+ u_long secpercyl;
+ u_long ssector;
+ u_long ssector1;
+
+ secpercyl = (u_long)nsectors * ntracks;
+ chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect);
+ chs_ssect = DPSECT(dp->dp_ssect);
+ ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl
+ + mbr_offset;
+ ssector1 = offset + dp->dp_start;
+
+ /*
+ * If ssector1 is on a cylinder >= 1024, then ssector can't be right.
+ * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct
+ * apart from the cylinder being reduced modulo 1024. Always allow
+ * 1023/255/63, because this is the official way to represent
+ * pure-LBA for the starting position.
+ */
+ if ((ssector < ssector1
+ && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1
+ && chs_scyl == 1023)
+ || (secpercyl != 0
+ && (ssector1 - ssector) % (1024 * secpercyl) == 0)))
+ || (dp->dp_scyl == 255 && dp->dp_shd == 255
+ && dp->dp_ssect == 255)) {
+ TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1));
+ ssector = ssector1;
+ }
+
+ chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect);
+ chs_esect = DPSECT(dp->dp_esect);
+ esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl
+ + mbr_offset;
+ esector1 = ssector1 + dp->dp_size - 1;
+
+ /*
+ * Allow certain bogus C/H/S values for esector, as above. However,
+ * heads == 255 isn't really legal and causes some BIOS crashes. The
+ * correct value to indicate a pure-LBA end is 1023/heads-1/sectors -
+ * usually 1023/254/63. "heads" is base 0, "sectors" is base 1.
+ */
+ if ((esector < esector1
+ && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1
+ && chs_ecyl == 1023)
+ || (secpercyl != 0
+ && (esector1 - esector) % (1024 * secpercyl) == 0)))
+ || (dp->dp_ecyl == 255 && dp->dp_ehd == 255
+ && dp->dp_esect == 255)) {
+ TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1));
+ esector = esector1;
+ }
+
+ error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL;
+ if (bootverbose)
+ printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n",
+ sname, dp->dp_typ, ssector1, esector1,
+ (u_long)dp->dp_size, error ? "" : ": OK");
+ if (ssector != ssector1 && bootverbose)
+ printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect,
+ ssector, ssector1);
+ if (esector != esector1 && bootverbose)
+ printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect,
+ esector, esector1);
+ return (error);
+}
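
The checks above hinge on converting a C/H/S triple to an absolute sector number with secpercyl = nsectors * ntracks, exactly as check_part() computes ssector and esector. A minimal userland sketch of that arithmetic (the 63-sector, 16-head geometry below is hypothetical, not taken from the code):

    #include <stdio.h>

    /*
     * Convert a C/H/S triple to an absolute sector number the way
     * check_part() does: sectors are 1-based, heads and cylinders 0-based.
     */
    static unsigned long
    chs_to_lba(unsigned cyl, unsigned head, unsigned sect,
        unsigned nsectors, unsigned ntracks)
    {
        unsigned long secpercyl = (unsigned long)nsectors * ntracks;

        return (sect - 1 + head * nsectors + cyl * secpercyl);
    }

    int
    main(void)
    {
        /* Hypothetical geometry: 63 sectors/track, 16 tracks (heads). */
        printf("%lu\n", chs_to_lba(2, 3, 4, 63, 16));    /* prints 2208 */
        return (0);
    }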
+
+int
+dsinit(dev, lp, sspp)
+ dev_t dev;
+ struct disklabel *lp;
+ struct diskslices **sspp;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ struct dos_partition *dp0;
+ struct dos_partition dpcopy[NDOSPART];
+ int error;
+ int max_ncyls;
+ int max_nsectors;
+ int max_ntracks;
+ u_long mbr_offset;
+ char partname[2];
+ u_long secpercyl;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ mbr_offset = DOSBBSECTOR;
+reread_mbr:
+ /* Read master boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ bp->b_blkno = mbr_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_iocmd = BIO_READ;
+ DEV_STRATEGY(bp, 1);
+ if (bufwait(bp) != 0) {
+ diskerr(&bp->b_io, "reading primary partition table: error",
+ 0, (struct disklabel *)NULL);
+ printf("\n");
+ error = EIO;
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_data;
+ sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, partname);
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ if (bootverbose)
+ printf("%s: invalid primary partition table: no magic\n",
+ sname);
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Make a copy of the partition table to avoid alignment problems. */
+ memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy));
+
+ dp0 = &dpcopy[0];
+
+	/* Check for "Ontrack Disk Manager". */
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_typ == DOSPTYP_ONTRACK) {
+ if (bootverbose)
+ printf(
+ "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname);
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ mbr_offset = 63;
+ goto reread_mbr;
+ }
+ }
+
+ if (bcmp(dp0, historical_bogus_partition_table,
+ sizeof historical_bogus_partition_table) == 0 ||
+ bcmp(dp0, historical_bogus_partition_table_fixed,
+ sizeof historical_bogus_partition_table_fixed) == 0) {
+ if (bootverbose)
+ printf(
+ "%s: invalid primary partition table: Dangerously Dedicated (ignored)\n",
+ sname);
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Guess the geometry. */
+ /*
+ * TODO:
+ * Perhaps skip entries with 0 size.
+ * Perhaps only look at entries of type DOSPTYP_386BSD.
+ */
+ max_ncyls = 0;
+ max_nsectors = 0;
+ max_ntracks = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ int ncyls;
+ int nsectors;
+ int ntracks;
+
+ ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1;
+ if (max_ncyls < ncyls)
+ max_ncyls = ncyls;
+ nsectors = DPSECT(dp->dp_esect);
+ if (max_nsectors < nsectors)
+ max_nsectors = nsectors;
+ ntracks = dp->dp_ehd + 1;
+ if (max_ntracks < ntracks)
+ max_ntracks = ntracks;
+ }
+
+ /*
+ * Check that we have guessed the geometry right by checking the
+ * partition entries.
+ */
+ /*
+ * TODO:
+ * As above.
+ * Check for overlaps.
+ * Check against d_secperunit if the latter is reliable.
+ */
+ error = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart,
+ RAW_PART, partname);
+
+ /*
+ * Temporarily ignore errors from this check. We could
+		 * simplify things by accepting the table earlier if we
+ * always ignore errors here. Perhaps we should always
+ * accept the table if the magic is right but not let
+ * bad entries affect the geometry.
+ */
+ check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks,
+ mbr_offset);
+ }
+ if (error != 0)
+ goto done;
+
+ /*
+ * Accept the DOS partition table.
+ * First adjust the label (we have been careful not to change it
+ * before we can guarantee success).
+ */
+ secpercyl = (u_long)max_nsectors * max_ntracks;
+ if (secpercyl != 0) {
+ lp->d_nsectors = max_nsectors;
+ lp->d_ntracks = max_ntracks;
+ lp->d_secpercyl = secpercyl;
+ lp->d_ncylinders = lp->d_secperunit / secpercyl;
+ }
+
+ /*
+ * We are passed a pointer to a suitably initialized minimal
+ * slices "struct" with no dangling pointers in it. Replace it
+ * by a maximal one. This usually oversizes the "struct", but
+ * enlarging it while searching for logical drives would be
+ * inconvenient.
+ */
+ free(*sspp, M_DEVBUF);
+ ssp = dsmakeslicestruct(MAX_SLICES, lp);
+ *sspp = ssp;
+
+ /* Initialize normal slices. */
+ sp = &ssp->dss_slices[BASE_SLICE];
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) {
+ sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart,
+ RAW_PART, partname);
+ (void)mbr_setslice(sname, lp, sp, dp, mbr_offset);
+ }
+ ssp->dss_nslices = BASE_SLICE + NDOSPART;
+
+ /* Handle extended partitions. */
+ sp -= NDOSPART;
+ for (dospart = 0; dospart < NDOSPART; dospart++, sp++)
+ if (sp->ds_type == DOSPTYP_EXTENDED ||
+ sp->ds_type == DOSPTYP_EXTENDEDX)
+ mbr_extended(bp->b_dev, lp, ssp,
+ sp->ds_offset, sp->ds_size, sp->ds_offset,
+ max_nsectors, max_ntracks, mbr_offset, 1);
+
+ /*
+ * mbr_extended() abuses ssp->dss_nslices for the number of slices
+ * that would be found if there were no limit on the number of slices
+ * in *ssp. Cut it back now.
+ */
+ if (ssp->dss_nslices > MAX_SLICES)
+ ssp->dss_nslices = MAX_SLICES;
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ if (error == EINVAL)
+ error = 0;
+ return (error);
+}
+
+void
+mbr_extended(dev, lp, ssp, ext_offset, ext_size, base_ext_offset, nsectors,
+ ntracks, mbr_offset, level)
+ dev_t dev;
+ struct disklabel *lp;
+ struct diskslices *ssp;
+ u_long ext_offset;
+ u_long ext_size;
+ u_long base_ext_offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+ int level;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ struct dos_partition dpcopy[NDOSPART];
+ u_long ext_offsets[NDOSPART];
+ u_long ext_sizes[NDOSPART];
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+
+ if (level >= 16) {
+ printf(
+ "%s: excessive recursion in search for slices; aborting search\n",
+ devtoname(dev));
+ return;
+ }
+
+ /* Read extended boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = ext_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_iocmd = BIO_READ;
+ DEV_STRATEGY(bp, 1);
+ if (bufwait(bp) != 0) {
+ diskerr(&bp->b_io, "reading extended partition table: error",
+ 0, (struct disklabel *)NULL);
+ printf("\n");
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_data;
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART,
+ partname);
+ if (bootverbose)
+ printf("%s: invalid extended partition table: no magic\n",
+ sname);
+ goto done;
+ }
+
+ /* Make a copy of the partition table to avoid alignment problems. */
+ memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy));
+
+ slice = ssp->dss_nslices;
+ for (dospart = 0, dp = &dpcopy[0]; dospart < NDOSPART;
+ dospart++, dp++) {
+ ext_sizes[dospart] = 0;
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ if (dp->dp_typ == DOSPTYP_EXTENDED ||
+ dp->dp_typ == DOSPTYP_EXTENDEDX) {
+ static char buf[32];
+
+ sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE,
+ RAW_PART, partname);
+ snprintf(buf, sizeof(buf), "%s", sname);
+ if (strlen(buf) < sizeof buf - 11)
+ strcat(buf, "<extended>");
+ check_part(buf, dp, base_ext_offset, nsectors,
+ ntracks, mbr_offset);
+ ext_offsets[dospart] = base_ext_offset + dp->dp_start;
+ ext_sizes[dospart] = dp->dp_size;
+ } else {
+ sname = dsname(dev, dkunit(dev), slice, RAW_PART,
+ partname);
+ check_part(sname, dp, ext_offset, nsectors, ntracks,
+ mbr_offset);
+ if (slice >= MAX_SLICES) {
+ printf("%s: too many slices\n", sname);
+ slice++;
+ continue;
+ }
+ sp = &ssp->dss_slices[slice];
+ if (mbr_setslice(sname, lp, sp, dp, ext_offset) != 0)
+ continue;
+ slice++;
+ }
+ }
+ ssp->dss_nslices = slice;
+
+ /* If we found any more slices, recursively find all the subslices. */
+ for (dospart = 0; dospart < NDOSPART; dospart++)
+ if (ext_sizes[dospart] != 0)
+ mbr_extended(dev, lp, ssp, ext_offsets[dospart],
+ ext_sizes[dospart], base_ext_offset,
+ nsectors, ntracks, mbr_offset, ++level);
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+}
+
+static int
+mbr_setslice(sname, lp, sp, dp, br_offset)
+ char *sname;
+ struct disklabel *lp;
+ struct diskslice *sp;
+ struct dos_partition *dp;
+ u_long br_offset;
+{
+ u_long offset;
+ u_long size;
+
+ offset = br_offset + dp->dp_start;
+ if (offset > lp->d_secperunit || offset < br_offset) {
+ printf(
+ "%s: slice starts beyond end of the disk: rejecting it\n",
+ sname);
+ return (1);
+ }
+ size = lp->d_secperunit - offset;
+ if (size >= dp->dp_size)
+ size = dp->dp_size;
+ else
+ printf(
+"%s: slice extends beyond end of disk: truncating from %lu to %lu sectors\n",
+ sname, (u_long)dp->dp_size, size);
+ sp->ds_offset = offset;
+ sp->ds_size = size;
+ sp->ds_type = dp->dp_typ;
+#ifdef PC98_ATCOMPAT
+ /* Fake FreeBSD(98). */
+ if (sp->ds_type == DOSPTYP_386BSD)
+ sp->ds_type = 0x94;
+#endif
+#if 0
+ lp->d_subtype |= (lp->d_subtype & 3) | dospart | DSTYPE_INDOSPART;
+#endif
+ return (0);
+}
+
+#ifdef __alpha__
+void
+alpha_fix_srm_checksum(bp)
+ struct buf *bp;
+{
+ u_int64_t *p;
+ u_int64_t sum;
+ int i;
+
+ p = (u_int64_t *) bp->b_data;
+ sum = 0;
+ for (i = 0; i < 63; i++)
+ sum += p[i];
+ p[63] = sum;
+}
+#endif
diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c
new file mode 100644
index 0000000..ec6099e
--- /dev/null
+++ b/sys/kern/subr_diskslice.c
@@ -0,0 +1,997 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)wd.c 7.2 (Berkeley) 5/9/91
+ * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/stat.h>
+#include <sys/stdint.h>
+#include <sys/syslog.h>
+#include <sys/vnode.h>
+
+#define TRACE(str) do { if (ds_debug) printf str; } while (0)
+
+typedef u_char bool_t;
+
+static volatile bool_t ds_debug;
+
+static struct disklabel *clone_label(struct disklabel *lp);
+static void dsiodone(struct bio *bp);
+static char *fixlabel(char *sname, struct diskslice *sp,
+ struct disklabel *lp, int writeflag);
+static void free_ds_label(struct diskslices *ssp, int slice);
+static void partition_info(char *sname, int part, struct partition *pp);
+static void slice_info(char *sname, struct diskslice *sp);
+static void set_ds_label(struct diskslices *ssp, int slice,
+ struct disklabel *lp);
+static void set_ds_labeldevs(dev_t dev, struct diskslices *ssp);
+static void set_ds_wlabel(struct diskslices *ssp, int slice,
+ int wlabel);
+
+/*
+ * Duplicate a label for the whole disk, and initialize defaults in the
+ * copy for fields that are not already initialized. The caller only
+ * needs to initialize d_secsize and d_secperunit, and zero the fields
+ * that are to be defaulted.
+ */
+static struct disklabel *
+clone_label(lp)
+ struct disklabel *lp;
+{
+ struct disklabel *lp1;
+
+ lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
+ *lp1 = *lp;
+ lp = NULL;
+ if (lp1->d_typename[0] == '\0')
+ strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename));
+ if (lp1->d_packname[0] == '\0')
+ strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname));
+ if (lp1->d_nsectors == 0)
+ lp1->d_nsectors = 32;
+ if (lp1->d_ntracks == 0)
+ lp1->d_ntracks = 64;
+ lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks;
+ lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl;
+ if (lp1->d_rpm == 0)
+ lp1->d_rpm = 3600;
+ if (lp1->d_interleave == 0)
+ lp1->d_interleave = 1;
+ if (lp1->d_npartitions < RAW_PART + 1)
+ lp1->d_npartitions = MAXPARTITIONS;
+ if (lp1->d_bbsize == 0)
+ lp1->d_bbsize = BBSIZE;
+ lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit;
+ lp1->d_magic = DISKMAGIC;
+ lp1->d_magic2 = DISKMAGIC;
+ lp1->d_checksum = dkcksum(lp1);
+ return (lp1);
+}
+
+dev_t
+dkmodpart(dev_t dev, int part)
+{
+ return (makedev(major(dev), (minor(dev) & ~7) | part));
+}
+
+dev_t
+dkmodslice(dev_t dev, int slice)
+{
+ return (makedev(major(dev), (minor(dev) & ~0x1f0000) | (slice << 16)));
+}
+
+u_int
+dkunit(dev_t dev)
+{
+ return (((minor(dev) >> 16) & 0x1e0) | ((minor(dev) >> 3) & 0x1f));
+}
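
dkmodpart(), dkmodslice() and dkunit() carve the device minor number into bit fields: the partition lives in bits 0-2, the unit in bits 3-7 (with overflow bits at 21-24), and the slice in bits 16-20. A purely illustrative userland sketch of the same packing, using a plain int in place of a dev_t:

    #include <stdio.h>

    /* Mirror the bit manipulation of dkmodpart()/dkmodslice()/dkunit(). */
    #define SET_PART(m, p)    (((m) & ~7) | (p))
    #define SET_SLICE(m, s)   (((m) & ~0x1f0000) | ((s) << 16))
    #define GET_UNIT(m)       ((((m) >> 16) & 0x1e0) | (((m) >> 3) & 0x1f))

    int
    main(void)
    {
        int m = 0;

        m = SET_PART(m, 2);       /* partition index 2 ('c') */
        m = SET_SLICE(m, 3);      /* slice 3 */
        m |= 5 << 3;              /* unit 5 in the low unit field */
        printf("minor 0x%x, unit %d\n", m, GET_UNIT(m));   /* unit 5 */
        return (0);
    }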
+
+/*
+ * Determine the size of the transfer, and make sure it is
+ * within the boundaries of the partition. Adjust transfer
+ * if needed, and signal errors or early completion.
+ *
+ * XXX TODO:
+ * o Split buffers that are too big for the device.
+ * o Check for overflow.
+ * o Finish cleaning this up.
+ */
+int
+dscheck(bp, ssp)
+ struct bio *bp;
+ struct diskslices *ssp;
+{
+ daddr_t blkno;
+ daddr_t endsecno;
+ daddr_t labelsect;
+ struct disklabel *lp;
+ char *msg;
+ long nsec;
+ struct partition *pp;
+ daddr_t secno;
+ daddr_t slicerel_secno;
+ struct diskslice *sp;
+
+ blkno = bp->bio_blkno;
+ if (blkno < 0) {
+ printf("dscheck(%s): negative bio_blkno %ld\n",
+ devtoname(bp->bio_dev), (long)blkno);
+ bp->bio_error = EINVAL;
+ goto bad;
+ }
+ sp = &ssp->dss_slices[dkslice(bp->bio_dev)];
+ lp = sp->ds_label;
+ if (ssp->dss_secmult == 1) {
+ if (bp->bio_bcount % (u_long)DEV_BSIZE)
+ goto bad_bcount;
+ secno = blkno;
+ nsec = bp->bio_bcount >> DEV_BSHIFT;
+ } else if (ssp->dss_secshift != -1) {
+ if (bp->bio_bcount & (ssp->dss_secsize - 1))
+ goto bad_bcount;
+ if (blkno & (ssp->dss_secmult - 1))
+ goto bad_blkno;
+ secno = blkno >> ssp->dss_secshift;
+ nsec = bp->bio_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
+ } else {
+ if (bp->bio_bcount % ssp->dss_secsize)
+ goto bad_bcount;
+ if (blkno % ssp->dss_secmult)
+ goto bad_blkno;
+ secno = blkno / ssp->dss_secmult;
+ nsec = bp->bio_bcount / ssp->dss_secsize;
+ }
+ if (lp == NULL) {
+ labelsect = -LABELSECTOR - 1;
+ endsecno = sp->ds_size;
+ slicerel_secno = secno;
+ } else {
+ labelsect = lp->d_partitions[LABEL_PART].p_offset;
+if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
+ pp = &lp->d_partitions[dkpart(bp->bio_dev)];
+ endsecno = pp->p_size;
+ slicerel_secno = pp->p_offset + secno;
+ }
+
+ /* overwriting disk label ? */
+ /* XXX should also protect bootstrap in first 8K */
+ if (slicerel_secno <= LABELSECTOR + labelsect &&
+#if LABELSECTOR != 0
+ slicerel_secno + nsec > LABELSECTOR + labelsect &&
+#endif
+ (bp->bio_cmd == BIO_WRITE) && sp->ds_wlabel == 0) {
+ bp->bio_error = EROFS;
+ goto bad;
+ }
+
+#if defined(DOSBBSECTOR) && defined(notyet)
+ /* overwriting master boot record? */
+ if (slicerel_secno <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) &&
+ sp->ds_wlabel == 0) {
+ bp->bio_error = EROFS;
+ goto bad;
+ }
+#endif
+
+ /* beyond partition? */
+ if ((uintmax_t)secno + nsec > endsecno) {
+ /* if exactly at end of disk, return an EOF */
+ if (secno == endsecno) {
+ bp->bio_resid = bp->bio_bcount;
+ return (0);
+ }
+ /* or truncate if part of it fits */
+ if (secno > endsecno) {
+ bp->bio_error = EINVAL;
+ goto bad;
+ }
+ bp->bio_bcount = (endsecno - secno) * ssp->dss_secsize;
+ }
+
+ bp->bio_pblkno = sp->ds_offset + slicerel_secno;
+
+ /*
+ * Snoop on label accesses if the slice offset is nonzero. Fudge
+ * offsets in the label to keep the in-core label coherent with
+ * the on-disk one.
+ */
+ if (slicerel_secno <= LABELSECTOR + labelsect
+#if LABELSECTOR != 0
+ && slicerel_secno + nsec > LABELSECTOR + labelsect
+#endif
+ && sp->ds_offset != 0) {
+ struct iodone_chain *ic;
+
+		ic = malloc(sizeof *ic, M_DEVBUF, M_WAITOK);
+ ic->ic_prev_flags = bp->bio_flags;
+ ic->ic_prev_iodone = bp->bio_done;
+ ic->ic_prev_iodone_chain = bp->bio_done_chain;
+ ic->ic_args[0].ia_long = (LABELSECTOR + labelsect -
+ slicerel_secno) * ssp->dss_secsize;
+ ic->ic_args[1].ia_ptr = sp;
+ bp->bio_done = dsiodone;
+ bp->bio_done_chain = ic;
+ if (!(bp->bio_cmd == BIO_READ)) {
+ /*
+ * XXX even disklabel(8) writes directly so we need
+ * to adjust writes. Perhaps we should drop support
+ * for DIOCWLABEL (always write protect labels) and
+ * require the use of DIOCWDINFO.
+ *
+ * XXX probably need to copy the data to avoid even
+ * temporarily corrupting the in-core copy.
+ */
+ /* XXX need name here. */
+ msg = fixlabel((char *)NULL, sp,
+ (struct disklabel *)
+ (bp->bio_data + ic->ic_args[0].ia_long),
+ TRUE);
+ if (msg != NULL) {
+ printf("dscheck(%s): %s\n",
+ devtoname(bp->bio_dev), msg);
+ bp->bio_error = EROFS;
+ goto bad;
+ }
+ }
+ }
+ return (1);
+
+bad_bcount:
+ printf(
+ "dscheck(%s): bio_bcount %ld is not on a sector boundary (ssize %d)\n",
+ devtoname(bp->bio_dev), bp->bio_bcount, ssp->dss_secsize);
+ bp->bio_error = EINVAL;
+ goto bad;
+
+bad_blkno:
+ printf(
+ "dscheck(%s): bio_blkno %ld is not on a sector boundary (ssize %d)\n",
+ devtoname(bp->bio_dev), (long)blkno, ssp->dss_secsize);
+ bp->bio_error = EINVAL;
+ goto bad;
+
+bad:
+ bp->bio_resid = bp->bio_bcount;
+ bp->bio_flags |= BIO_ERROR;
+ return (-1);
+}
+
+void
+dsclose(dev, mode, ssp)
+ dev_t dev;
+ int mode;
+ struct diskslices *ssp;
+{
+ u_char mask;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[dkslice(dev)];
+ mask = 1 << dkpart(dev);
+ sp->ds_openmask &= ~mask;
+}
+
+void
+dsgone(sspp)
+ struct diskslices **sspp;
+{
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ free_ds_label(ssp, slice);
+ }
+ free(ssp, M_DEVBUF);
+ *sspp = NULL;
+}
+
+/*
+ * For the "write" commands (DIOCSDINFO and DIOCWDINFO), this
+ * is subject to the same restriction as dsopen().
+ */
+int
+dsioctl(dev, cmd, data, flags, sspp)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flags;
+ struct diskslices **sspp;
+{
+ int error;
+ struct disklabel *lp;
+ int old_wlabel;
+ u_char openmask;
+ int part;
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+ struct partition *pp;
+
+ slice = dkslice(dev);
+ ssp = *sspp;
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ switch (cmd) {
+
+ case DIOCGDVIRGIN:
+ lp = (struct disklabel *)data;
+ if (ssp->dss_slices[WHOLE_DISK_SLICE].ds_label) {
+ *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
+ } else {
+ bzero(lp, sizeof(struct disklabel));
+ }
+
+ lp->d_magic = DISKMAGIC;
+ lp->d_magic2 = DISKMAGIC;
+ pp = &lp->d_partitions[RAW_PART];
+ pp->p_offset = 0;
+ pp->p_size = sp->ds_size;
+
+ lp->d_npartitions = MAXPARTITIONS;
+ if (lp->d_interleave == 0)
+ lp->d_interleave = 1;
+ if (lp->d_rpm == 0)
+ lp->d_rpm = 3600;
+ if (lp->d_nsectors == 0)
+ lp->d_nsectors = 32;
+ if (lp->d_ntracks == 0)
+ lp->d_ntracks = 64;
+
+ lp->d_bbsize = BBSIZE;
+ lp->d_sbsize = 0;
+ lp->d_secpercyl = lp->d_nsectors * lp->d_ntracks;
+ lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
+ lp->d_secperunit = sp->ds_size;
+ lp->d_checksum = 0;
+ lp->d_checksum = dkcksum(lp);
+ return (0);
+
+ case DIOCGDINFO:
+ if (lp == NULL)
+ return (EINVAL);
+ *(struct disklabel *)data = *lp;
+ return (0);
+
+ case DIOCGSECTORSIZE:
+ if (lp == NULL)
+ return (EINVAL);
+ *(u_int *)data = lp->d_secsize;
+ return (0);
+
+ case DIOCGMEDIASIZE:
+ if (lp == NULL)
+ return (EINVAL);
+ *(off_t *)data = (off_t)lp->d_partitions[dkpart(dev)].p_size *
+ lp->d_secsize;
+ return (0);
+
+ case DIOCGSLICEINFO:
+ bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] -
+ (char *)ssp);
+ return (0);
+
+ case DIOCSDINFO:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ if (sp->ds_label == NULL)
+ bzero(lp, sizeof *lp);
+ else
+ bcopy(sp->ds_label, lp, sizeof *lp);
+ if (sp->ds_label == NULL)
+ openmask = 0;
+ else {
+ openmask = sp->ds_openmask;
+ if (slice == COMPATIBILITY_SLICE)
+ openmask |= ssp->dss_slices[
+ ssp->dss_first_bsd_slice].ds_openmask;
+ else if (slice == ssp->dss_first_bsd_slice)
+ openmask |= ssp->dss_slices[
+ COMPATIBILITY_SLICE].ds_openmask;
+ }
+ error = setdisklabel(lp, (struct disklabel *)data,
+ (u_long)openmask);
+ /* XXX why doesn't setdisklabel() check this? */
+ if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0)
+ error = EXDEV;
+ if (error == 0) {
+ if (lp->d_secperunit > sp->ds_size)
+ error = ENOSPC;
+ for (part = 0; part < lp->d_npartitions; part++)
+ if (lp->d_partitions[part].p_size > sp->ds_size)
+ error = ENOSPC;
+ }
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ return (error);
+ }
+ free_ds_label(ssp, slice);
+ set_ds_label(ssp, slice, lp);
+ set_ds_labeldevs(dev, ssp);
+ return (0);
+
+ case DIOCSYNCSLICEINFO:
+ if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART)
+ return (EINVAL);
+ if (!*(int *)data)
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ openmask = ssp->dss_slices[slice].ds_openmask;
+ if (openmask
+ && (slice != WHOLE_DISK_SLICE
+ || openmask & ~(1 << RAW_PART)))
+ return (EBUSY);
+ }
+
+ /*
+	 * Temporarily forget the current slices struct and read a
+	 * fresh one from the disk.
+ * XXX should wait for current accesses on this disk to
+ * complete, then lock out future accesses and opens.
+ */
+ *sspp = NULL;
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
+ error = dsopen(dev, S_IFCHR, ssp->dss_oflags, sspp, lp);
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ *sspp = ssp;
+ return (error);
+ }
+
+ /*
+ * Reopen everything. This is a no-op except in the "force"
+ * case and when the raw bdev and cdev are both open. Abort
+ * if anything fails.
+ */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ for (openmask = ssp->dss_slices[slice].ds_openmask,
+ part = 0; openmask; openmask >>= 1, part++) {
+ if (!(openmask & 1))
+ continue;
+ error = dsopen(dkmodslice(dkmodpart(dev, part),
+ slice),
+ S_IFCHR, ssp->dss_oflags, sspp,
+ lp);
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ *sspp = ssp;
+ return (EBUSY);
+ }
+ }
+ }
+
+ free(lp, M_DEVBUF);
+ dsgone(&ssp);
+ return (0);
+
+ case DIOCWDINFO:
+ error = dsioctl(dev, DIOCSDINFO, data, flags, &ssp);
+ if (error != 0)
+ return (error);
+ /*
+ * XXX this used to hack on dk_openpart to fake opening
+ * partition 0 in case that is used instead of dkpart(dev).
+ */
+ old_wlabel = sp->ds_wlabel;
+ set_ds_wlabel(ssp, slice, TRUE);
+ error = writedisklabel(dev, sp->ds_label);
+ /* XXX should invalidate in-core label if write failed. */
+ set_ds_wlabel(ssp, slice, old_wlabel);
+ return (error);
+
+ case DIOCWLABEL:
+#ifndef __alpha__
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+#endif
+ if (!(flags & FWRITE))
+ return (EBADF);
+ set_ds_wlabel(ssp, slice, *(int *)data != 0);
+ return (0);
+
+ default:
+ return (ENOIOCTL);
+ }
+}
+
+static void
+dsiodone(bp)
+ struct bio *bp;
+{
+ struct iodone_chain *ic;
+ char *msg;
+
+ ic = bp->bio_done_chain;
+ bp->bio_done = ic->ic_prev_iodone;
+ bp->bio_done_chain = ic->ic_prev_iodone_chain;
+ if (!(bp->bio_cmd == BIO_READ)
+ || (!(bp->bio_flags & BIO_ERROR) && bp->bio_error == 0)) {
+ msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr,
+ (struct disklabel *)
+ (bp->bio_data + ic->ic_args[0].ia_long),
+ FALSE);
+ if (msg != NULL)
+ printf("%s\n", msg);
+ }
+ free(ic, M_DEVBUF);
+ biodone(bp);
+}
+
+int
+dsisopen(ssp)
+ struct diskslices *ssp;
+{
+ int slice;
+
+ if (ssp == NULL)
+ return (0);
+ for (slice = 0; slice < ssp->dss_nslices; slice++)
+ if (ssp->dss_slices[slice].ds_openmask)
+ return (1);
+ return (0);
+}
+
+/*
+ * Allocate a slices "struct" and initialize it to contain only an empty
+ * compatibility slice (pointing to itself), a whole disk slice (covering
+ * the disk as described by the label), and (nslices - BASE_SLICES) empty
+ * slices beginning at BASE_SLICE.
+ */
+struct diskslices *
+dsmakeslicestruct(nslices, lp)
+ int nslices;
+ struct disklabel *lp;
+{
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ ssp = malloc(offsetof(struct diskslices, dss_slices) +
+ nslices * sizeof *sp, M_DEVBUF, M_WAITOK);
+ ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE;
+ ssp->dss_nslices = nslices;
+ ssp->dss_oflags = 0;
+ ssp->dss_secmult = lp->d_secsize / DEV_BSIZE;
+ if (ssp->dss_secmult & (ssp->dss_secmult - 1))
+ ssp->dss_secshift = -1;
+ else
+ ssp->dss_secshift = ffs(ssp->dss_secmult) - 1;
+ ssp->dss_secsize = lp->d_secsize;
+ sp = &ssp->dss_slices[0];
+ bzero(sp, nslices * sizeof *sp);
+ sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit;
+ return (ssp);
+}
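
dsmakeslicestruct() stores the sector size both as dss_secmult (sector size over DEV_BSIZE) and, when that multiplier is a power of two, as dss_secshift = log2(multiplier), so dscheck() can convert block numbers with shifts instead of division. A tiny sketch of the power-of-two test and shift derivation (the 4096-byte sector size is made up):

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    int
    main(void)
    {
        unsigned secmult = 4096 / 512;    /* hypothetical sector size / DEV_BSIZE */
        int secshift;

        /* x & (x - 1) is nonzero exactly when x is not a power of two. */
        if (secmult & (secmult - 1))
            secshift = -1;                /* dscheck() falls back to division */
        else
            secshift = ffs(secmult) - 1;  /* log2 of the multiplier */
        printf("secmult %u -> secshift %d\n", secmult, secshift);    /* 8 -> 3 */
        return (0);
    }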
+
+char *
+dsname(dev, unit, slice, part, partname)
+ dev_t dev;
+ int unit;
+ int slice;
+ int part;
+ char *partname;
+{
+ static char name[32];
+ const char *dname;
+
+ dname = devsw(dev)->d_name;
+ if (strlen(dname) > 16)
+ dname = "nametoolong";
+ snprintf(name, sizeof(name), "%s%d", dname, unit);
+ partname[0] = '\0';
+ if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
+ partname[0] = 'a' + part;
+ partname[1] = '\0';
+ if (slice != COMPATIBILITY_SLICE)
+ snprintf(name + strlen(name),
+ sizeof(name) - strlen(name), "s%d", slice - 1);
+ }
+ return (name);
+}
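
dsname() yields the traditional naming: driver name plus unit (say "ad0"), an "sN" suffix whose printed number is one less than the internal slice index, and a separate partition letter for anything other than the raw partition of the whole disk. A hypothetical userland rendering of the same convention (the driver name, unit and indices are invented):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        const char *dname = "ad";           /* hypothetical driver name */
        int unit = 0, slice = 2, part = 0;  /* first real slice, partition 'a' */
        char name[32], partname[2] = "";

        snprintf(name, sizeof(name), "%s%d", dname, unit);
        if (slice >= 2)                      /* a real, non-compatibility slice */
            snprintf(name + strlen(name), sizeof(name) - strlen(name),
                "s%d", slice - 1);
        partname[0] = 'a' + part;
        printf("%s%s\n", name, partname);    /* ad0s1a */
        return (0);
    }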
+
+/*
+ * This should only be called when the unit is inactive and the strategy
+ * routine should not allow it to become active unless we call it. Our
+ * strategy routine must be special to allow activity.
+ */
+int
+dsopen(dev, mode, flags, sspp, lp)
+ dev_t dev;
+ int mode;
+ u_int flags;
+ struct diskslices **sspp;
+ struct disklabel *lp;
+{
+ dev_t dev1;
+ int error;
+ struct disklabel *lp1;
+ char *msg;
+ u_char mask;
+ int part;
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+ int unit;
+
+ dev->si_bsize_phys = lp->d_secsize;
+
+ unit = dkunit(dev);
+ if (lp->d_secsize % DEV_BSIZE) {
+ printf("%s: invalid sector size %lu\n", devtoname(dev),
+ (u_long)lp->d_secsize);
+ return (EINVAL);
+ }
+
+ /*
+ * XXX reinitialize the slice table unless there is an open device
+ * on the unit. This should only be done if the media has changed.
+ */
+ ssp = *sspp;
+ if (!dsisopen(ssp)) {
+ if (ssp != NULL)
+ dsgone(sspp);
+ /*
+ * Allocate a minimal slices "struct". This will become
+ * the final slices "struct" if we don't want real slices
+ * or if we can't find any real slices.
+ */
+ *sspp = dsmakeslicestruct(BASE_SLICE, lp);
+
+ if (!(flags & DSO_ONESLICE)) {
+ TRACE(("dsinit\n"));
+ error = dsinit(dev, lp, sspp);
+ if (error != 0) {
+ dsgone(sspp);
+ return (error);
+ }
+ }
+ ssp = *sspp;
+ ssp->dss_oflags = flags;
+
+ /*
+		 * If there are no real slices, then make the compatibility
+ * slice cover the whole disk.
+ */
+ if (ssp->dss_nslices == BASE_SLICE)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = lp->d_secperunit;
+
+ /* Point the compatibility slice at the BSD slice, if any. */
+ for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */) {
+ ssp->dss_first_bsd_slice = slice;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset
+ = sp->ds_offset;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = sp->ds_size;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_type
+ = sp->ds_type;
+ break;
+ }
+ }
+
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp);
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE;
+ }
+
+ /* Initialize secondary info for all slices. */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_label != NULL
+#ifdef __alpha__
+ && slice != WHOLE_DISK_SLICE
+#endif
+ )
+ continue;
+ dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice);
+#if 0
+ sname = dsname(dev, unit, slice, RAW_PART, partname);
+#else
+		*partname = '\0';
+ sname = dev1->si_name;
+#endif
+ /*
+ * XXX this should probably only be done for the need_init
+ * case, but there may be a problem with DIOCSYNCSLICEINFO.
+ */
+ set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */
+ lp1 = clone_label(lp);
+ TRACE(("readdisklabel\n"));
+ if (flags & DSO_NOLABELS)
+ msg = NULL;
+ else {
+ msg = readdisklabel(dev1, lp1);
+
+ /*
+ * readdisklabel() returns NULL for success, and an
+ * error string for failure.
+ *
+ * If there isn't a label on the disk, and if the
+ * DSO_COMPATLABEL is set, we want to use the
+ * faked-up label provided by the caller.
+ *
+ * So we set msg to NULL to indicate that there is
+ * no failure (since we have a faked-up label),
+ * free lp1, and then clone it again from lp.
+ * (In case readdisklabel() modified lp1.)
+ */
+ if (msg != NULL && (flags & DSO_COMPATLABEL)) {
+ msg = NULL;
+ free(lp1, M_DEVBUF);
+ lp1 = clone_label(lp);
+ }
+ }
+ if (msg == NULL)
+ msg = fixlabel(sname, sp, lp1, FALSE);
+ if (msg == NULL && lp1->d_secsize != ssp->dss_secsize)
+ msg = "inconsistent sector size";
+ if (msg != NULL) {
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */)
+ log(LOG_WARNING, "%s: cannot find label (%s)\n",
+ sname, msg);
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ if (lp1->d_flags & D_BADSECT) {
+ log(LOG_ERR, "%s: bad sector table not supported\n",
+ sname);
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ set_ds_label(ssp, slice, lp1);
+ set_ds_labeldevs(dev1, ssp);
+ set_ds_wlabel(ssp, slice, FALSE);
+ }
+
+ slice = dkslice(dev);
+ if (slice >= ssp->dss_nslices)
+ return (ENXIO);
+ sp = &ssp->dss_slices[slice];
+ part = dkpart(dev);
+ if (part != RAW_PART
+ && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions))
+ return (EINVAL); /* XXX needs translation */
+ mask = 1 << part;
+ sp->ds_openmask |= mask;
+ return (0);
+}
+
+int
+dssize(dev, sspp)
+ dev_t dev;
+ struct diskslices **sspp;
+{
+ struct disklabel *lp;
+ int part;
+ int slice;
+ struct diskslices *ssp;
+
+ slice = dkslice(dev);
+ part = dkpart(dev);
+ ssp = *sspp;
+ if (ssp == NULL || slice >= ssp->dss_nslices
+ || !(ssp->dss_slices[slice].ds_openmask & (1 << part))) {
+ if (devsw(dev)->d_open(dev, FREAD, S_IFCHR,
+ (struct thread *)NULL) != 0)
+ return (-1);
+ devsw(dev)->d_close(dev, FREAD, S_IFCHR, (struct thread *)NULL);
+ ssp = *sspp;
+ }
+ lp = ssp->dss_slices[slice].ds_label;
+ if (lp == NULL)
+ return (-1);
+ return ((int)lp->d_partitions[part].p_size);
+}
+
+static void
+free_ds_label(ssp, slice)
+ struct diskslices *ssp;
+ int slice;
+{
+ struct disklabel *lp;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ if (lp == NULL)
+ return;
+ free(lp, M_DEVBUF);
+ set_ds_label(ssp, slice, (struct disklabel *)NULL);
+}
+
+
+static char *
+fixlabel(sname, sp, lp, writeflag)
+ char *sname;
+ struct diskslice *sp;
+ struct disklabel *lp;
+ int writeflag;
+{
+ u_long end;
+ u_long offset;
+ int part;
+ struct partition *pp;
+ u_long start;
+ bool_t warned;
+
+ /* These errors "can't happen" so don't bother reporting details. */
+ if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC)
+ return ("fixlabel: invalid magic");
+ if (dkcksum(lp) != 0)
+ return ("fixlabel: invalid checksum");
+
+ pp = &lp->d_partitions[RAW_PART];
+ if (writeflag) {
+ start = 0;
+ offset = sp->ds_offset;
+ } else {
+ start = sp->ds_offset;
+ offset = -sp->ds_offset;
+ }
+ if (pp->p_offset != start) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting BSD label: raw partition offset != slice offset\n",
+ sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ return ("fixlabel: raw partition offset != slice offset");
+ }
+ if (pp->p_size != sp->ds_size) {
+ if (sname != NULL) {
+ printf("%s: raw partition size != slice size\n", sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ if (pp->p_size > sp->ds_size) {
+ if (sname == NULL)
+ return ("fixlabel: raw partition size > slice size");
+ printf("%s: truncating raw partition\n", sname);
+ pp->p_size = sp->ds_size;
+ }
+ }
+ end = start + sp->ds_size;
+ if (start > end)
+ return ("fixlabel: slice wraps");
+ if (lp->d_secpercyl <= 0)
+ return ("fixlabel: d_secpercyl <= 0");
+ pp -= RAW_PART;
+ warned = FALSE;
+ for (part = 0; part < lp->d_npartitions; part++, pp++) {
+ if (pp->p_offset != 0 || pp->p_size != 0) {
+ if (pp->p_offset < start
+ || pp->p_offset + pp->p_size > end
+ || pp->p_offset + pp->p_size < pp->p_offset) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting partition in BSD label: it isn't entirely within the slice\n",
+ sname);
+ if (!warned) {
+ slice_info(sname, sp);
+ warned = TRUE;
+ }
+ partition_info(sname, part, pp);
+ }
+ /* XXX else silently discard junk. */
+ bzero(pp, sizeof *pp);
+ } else
+ pp->p_offset += offset;
+ }
+ }
+ lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
+ lp->d_secperunit = sp->ds_size;
+ lp->d_checksum = 0;
+ lp->d_checksum = dkcksum(lp);
+ return (NULL);
+}
+
+static void
+partition_info(sname, part, pp)
+ char *sname;
+ int part;
+ struct partition *pp;
+{
+ printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part,
+ (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1),
+ (u_long)pp->p_size);
+}
+
+static void
+slice_info(sname, sp)
+ char *sname;
+ struct diskslice *sp;
+{
+ printf("%s: start %lu, end %lu, size %lu\n", sname,
+ sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size);
+}
+
+static void
+set_ds_label(ssp, slice, lp)
+ struct diskslices *ssp;
+ int slice;
+ struct disklabel *lp;
+{
+ ssp->dss_slices[slice].ds_label = lp;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp;
+}
+
+static void
+set_ds_labeldevs(dev, ssp)
+ dev_t dev;
+ struct diskslices *ssp;
+{
+}
+
+
+static void
+set_ds_wlabel(ssp, slice, wlabel)
+ struct diskslices *ssp;
+ int slice;
+ int wlabel;
+{
+ ssp->dss_slices[slice].ds_wlabel = wlabel;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel;
+}
diff --git a/sys/kern/subr_eventhandler.c b/sys/kern/subr_eventhandler.c
new file mode 100644
index 0000000..45b4302
--- /dev/null
+++ b/sys/kern/subr_eventhandler.c
@@ -0,0 +1,173 @@
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+
+static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records");
+
+/* List of 'slow' lists */
+static TAILQ_HEAD(, eventhandler_list) eventhandler_lists;
+static int eventhandler_lists_initted = 0;
+static struct mtx eventhandler_mutex;
+
+struct eventhandler_entry_generic
+{
+ struct eventhandler_entry ee;
+ void (* func)(void);
+};
+
+/*
+ * Initialize the eventhandler mutex and list.
+ */
+static void
+eventhandler_init(void *dummy __unused)
+{
+ TAILQ_INIT(&eventhandler_lists);
+ mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF | MTX_RECURSE);
+ eventhandler_lists_initted = 1;
+}
+SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init,
+ NULL)
+
+/*
+ * Insertion is O(n) due to the priority scan, but optimises to O(1)
+ * if all priorities are identical.
+ */
+eventhandler_tag
+eventhandler_register(struct eventhandler_list *list, char *name,
+ void *func, void *arg, int priority)
+{
+ struct eventhandler_entry_generic *eg;
+ struct eventhandler_entry *ep;
+
+ KASSERT(eventhandler_lists_initted, ("eventhandler registered too early"));
+
+ /* lock the eventhandler lists */
+ mtx_lock(&eventhandler_mutex);
+
+ /* Do we need to find/create the (slow) list? */
+ if (list == NULL) {
+ /* look for a matching, existing list */
+ list = eventhandler_find_list(name);
+
+ /* Do we need to create the list? */
+ if (list == NULL) {
+ if ((list = malloc(sizeof(struct eventhandler_list) + strlen(name) + 1,
+ M_EVENTHANDLER, M_NOWAIT)) == NULL) {
+ mtx_unlock(&eventhandler_mutex);
+ return(NULL);
+ }
+ list->el_flags = 0;
+ bzero(&list->el_lock, sizeof(list->el_lock));
+ list->el_name = (char *)list + sizeof(struct eventhandler_list);
+ strcpy(list->el_name, name);
+ TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link);
+ }
+ }
+ if (!(list->el_flags & EHE_INITTED)) {
+ TAILQ_INIT(&list->el_entries);
+ sx_init(&list->el_lock, name);
+ list->el_flags = EHE_INITTED;
+ }
+ mtx_unlock(&eventhandler_mutex);
+
+ /* allocate an entry for this handler, populate it */
+ if ((eg = malloc(sizeof(struct eventhandler_entry_generic),
+ M_EVENTHANDLER, M_NOWAIT)) == NULL) {
+ return(NULL);
+ }
+ eg->func = func;
+ eg->ee.ee_arg = arg;
+ eg->ee.ee_priority = priority;
+
+ /* sort it into the list */
+ EHE_LOCK(list);
+ for (ep = TAILQ_FIRST(&list->el_entries);
+ ep != NULL;
+ ep = TAILQ_NEXT(ep, ee_link)) {
+ if (eg->ee.ee_priority < ep->ee_priority) {
+ TAILQ_INSERT_BEFORE(ep, &eg->ee, ee_link);
+ break;
+ }
+ }
+ if (ep == NULL)
+ TAILQ_INSERT_TAIL(&list->el_entries, &eg->ee, ee_link);
+ EHE_UNLOCK(list);
+ return(&eg->ee);
+}
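
The insertion loop above keeps el_entries ordered by ascending ee_priority: a new handler goes in front of the first entry with a strictly higher priority, or at the tail, so handlers of equal priority run in registration order. A standalone sketch of the same pattern with the queue(3) TAILQ macros (the entry structure and priority values are illustrative):

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int priority;
        TAILQ_ENTRY(entry) link;
    };
    TAILQ_HEAD(entry_list, entry);

    /* Insert before the first entry with a higher priority, else at the tail. */
    static void
    insert_sorted(struct entry_list *list, struct entry *n)
    {
        struct entry *ep;

        TAILQ_FOREACH(ep, list, link)
            if (n->priority < ep->priority) {
                TAILQ_INSERT_BEFORE(ep, n, link);
                return;
            }
        TAILQ_INSERT_TAIL(list, n, link);
    }

    int
    main(void)
    {
        struct entry_list list = TAILQ_HEAD_INITIALIZER(list);
        int prios[] = { 10000, 0, 20000, 10000 };
        struct entry *ep;
        size_t i;

        for (i = 0; i < 4; i++) {
            ep = calloc(1, sizeof(*ep));
            ep->priority = prios[i];
            insert_sorted(&list, ep);
        }
        TAILQ_FOREACH(ep, &list, link)
            printf("%d\n", ep->priority);    /* 0, 10000, 10000, 20000 */
        return (0);
    }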
+
+void
+eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag)
+{
+ struct eventhandler_entry *ep = tag;
+
+ /* XXX insert diagnostic check here? */
+ EHE_LOCK(list);
+ if (ep != NULL) {
+ /* remove just this entry */
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ } else {
+ /* remove entire list */
+ while (!TAILQ_EMPTY(&list->el_entries)) {
+ ep = TAILQ_FIRST(&list->el_entries);
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ }
+ }
+ EHE_UNLOCK(list);
+}
+
+struct eventhandler_list *
+eventhandler_find_list(char *name)
+{
+ struct eventhandler_list *list;
+
+ if (!eventhandler_lists_initted)
+ return(NULL);
+
+ /* scan looking for the requested list */
+ mtx_lock(&eventhandler_mutex);
+ for (list = TAILQ_FIRST(&eventhandler_lists);
+ list != NULL;
+ list = TAILQ_NEXT(list, el_link)) {
+ if (!strcmp(name, list->el_name))
+ break;
+ }
+ mtx_unlock(&eventhandler_mutex);
+
+ return(list);
+}
+
diff --git a/sys/kern/subr_hints.c b/sys/kern/subr_hints.c
new file mode 100644
index 0000000..c68d607
--- /dev/null
+++ b/sys/kern/subr_hints.c
@@ -0,0 +1,366 @@
+/*-
+ * Copyright (c) 2000,2001 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+/*
+ * Access functions for device resources.
+ */
+
+static int checkmethod = 1;
+static int use_kenv;
+static char *hintp;
+
+/*
+ * Evil wildcarding resource string lookup.
+ * This walks the supplied env string table and returns a match.
+ * The start point can be remembered for incremental searches.
+ */
+static int
+res_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int n = 0, hit, i = 0;
+ char r_name[32];
+ int r_unit;
+ char r_resname[32];
+ char r_value[128];
+ const char *s, *cp;
+ char *p;
+
+ if (checkmethod) {
+ switch (hintmode) {
+ case 0: /* loader hints in environment only */
+ break;
+ case 1: /* static hints only */
+ hintp = static_hints;
+ checkmethod = 0;
+ break;
+ case 2: /* fallback mode */
+ if (dynamic_kenv) {
+ sx_slock(&kenv_lock);
+ cp = kenvp[0];
+ for (i = 0; cp != NULL; cp = kenvp[++i]) {
+ if (!strncmp(cp, "hint.", 5)) {
+ use_kenv = 1;
+ checkmethod = 0;
+ break;
+ }
+ }
+ sx_sunlock(&kenv_lock);
+ } else {
+ cp = kern_envp;
+ while (cp) {
+ if (strncmp(cp, "hint.", 5) == 0) {
+ cp = NULL;
+ hintp = kern_envp;
+ break;
+ }
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ hintp = static_hints;
+ break;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ if (hintp == NULL) {
+ if (dynamic_kenv) {
+ use_kenv = 1;
+ checkmethod = 0;
+ } else
+ hintp = kern_envp;
+ }
+ }
+
+ if (use_kenv) {
+ sx_slock(&kenv_lock);
+ i = 0;
+ cp = kenvp[0];
+ if (cp == NULL) {
+ sx_sunlock(&kenv_lock);
+ return (ENOENT);
+ }
+ } else
+ cp = hintp;
+ while (cp) {
+ hit = 1;
+ (*line)++;
+ if (strncmp(cp, "hint.", 5) != 0)
+ hit = 0;
+ else
+ n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%128s",
+ r_name, &r_unit, r_resname, r_value);
+ if (hit && n != 4) {
+ printf("CONFIG: invalid hint '%s'\n", cp);
+ /* XXX: abuse bogus index() declaration */
+ p = index(cp, 'h');
+ *p = 'H';
+ hit = 0;
+ }
+ if (hit && startln && *startln >= 0 && *line < *startln)
+ hit = 0;
+ if (hit && name && strcmp(name, r_name) != 0)
+ hit = 0;
+ if (hit && unit && *unit != r_unit)
+ hit = 0;
+ if (hit && resname && strcmp(resname, r_resname) != 0)
+ hit = 0;
+ if (hit && value && strcmp(value, r_value) != 0)
+ hit = 0;
+ if (hit)
+ break;
+ if (use_kenv) {
+ cp = kenvp[++i];
+ if (cp == NULL)
+ break;
+ } else {
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ break;
+ }
+ }
+ }
+ if (use_kenv)
+ sx_sunlock(&kenv_lock);
+ if (cp == NULL)
+ return ENOENT;
+
+ s = cp;
+	/* This is a bit of a hack, but at least it is reentrant */
+ /* Note that it returns some !unterminated! strings. */
+ s = index(s, '.') + 1; /* start of device */
+ if (ret_name)
+ *ret_name = s;
+ s = index(s, '.') + 1; /* start of unit */
+ if (ret_namelen)
+ *ret_namelen = s - *ret_name - 1; /* device length */
+ if (ret_unit)
+ *ret_unit = r_unit;
+ s = index(s, '.') + 1; /* start of resname */
+ if (ret_resname)
+ *ret_resname = s;
+ s = index(s, '=') + 1; /* start of value */
+ if (ret_resnamelen)
+ *ret_resnamelen = s - *ret_resname - 1; /* value len */
+ if (ret_value)
+ *ret_value = s;
+ if (startln) /* line number for anchor */
+ *startln = *line + 1;
+ return 0;
+}
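
Every hint follows the pattern "hint.<driver>.<unit>.<resource>=<value>", and res_find() splits it with a single sscanf() using the conversion string seen above. A small userland sketch of that parse on a made-up hint line (buffers are sized one byte beyond the scan widths for the terminator):

    #include <stdio.h>

    int
    main(void)
    {
        const char *hint = "hint.sio.0.irq=4";    /* hypothetical hint */
        char name[33], resname[33], value[129];
        int unit, n;

        n = sscanf(hint, "hint.%32[^.].%d.%32[^=]=%128s",
            name, &unit, resname, value);
        if (n == 4)
            printf("%s unit %d: %s = %s\n", name, unit, resname, value);
        else
            printf("invalid hint '%s'\n", hint);
        return (0);
    }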
+
+/*
+ * Search all the data sources for matches to our query. We look for
+ * dynamic hints first as overrides for static or fallback hints.
+ */
+static int
+resource_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int i;
+ int un;
+
+ *line = 0;
+
+ /* Search for exact unit matches first */
+ i = res_find(line, startln, name, unit, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ if (unit == NULL)
+ return ENOENT;
+ /* If we are still here, search for wildcard matches */
+ un = -1;
+ i = res_find(line, startln, name, &un, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ return ENOENT;
+}
+
+int
+resource_int_value(const char *name, int unit, const char *resname, int *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
+
+int
+resource_long_value(const char *name, int unit, const char *resname,
+ long *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
+
+int
+resource_string_value(const char *name, int unit, const char *resname,
+ const char **result)
+{
+ int error;
+ const char *str;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ *result = str;
+ return 0;
+}
+
+/*
+ * This is a bit nasty, but allows us to not modify the env strings.
+ */
+static const char *
+resource_string_copy(const char *s, int len)
+{
+ static char stringbuf[256];
+ static int offset = 0;
+ const char *ret;
+
+ if (len == 0)
+ len = strlen(s);
+ if (len > 255)
+ return NULL;
+ if ((offset + len + 1) > 255)
+ offset = 0;
+ bcopy(s, &stringbuf[offset], len);
+ stringbuf[offset + len] = '\0';
+ ret = &stringbuf[offset];
+ offset += len + 1;
+ return ret;
+}
+
+/*
+ * err = resource_find_match(&anchor, &name, &unit, resname, value)
+ * Iteratively fetch a list of devices wired "at" something.
+ * resname and value are restrictions, e.g. "at", "scbus0".
+ * For practical purposes, resname is required, value is optional.
+ * *name and *unit are set on a match.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_match(int *anchor, const char **name, int *unit,
+ const char *resname, const char *value)
+{
+ const char *found_name;
+ int found_namelen;
+ int found_unit;
+ int ret;
+ int newln;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, NULL, NULL, resname, value,
+ &found_name, &found_namelen, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *name = resource_string_copy(found_name, found_namelen);
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
+
+
+/*
+ * err = resource_find_dev(&anchor, name, &unit, resname, value);
+ * Iterate through a list of devices, returning their unit numbers.
+ * resname and value are optional restrictions, e.g. "at", "scbus0".
+ * *unit is set to the unit number of each matching device.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_dev(int *anchor, const char *name, int *unit,
+ const char *resname, const char *value)
+{
+ int found_unit;
+ int newln;
+ int ret;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, name, NULL, resname, value,
+ NULL, NULL, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
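
Both iterators are driven by the anchor: the caller zeroes it once and then calls repeatedly until ENOENT, with each call resuming from the line after the previous match. A hedged usage fragment (kernel context, not a standalone program; the driver and bus names are hypothetical):

    /* Enumerate every unit of a hypothetical "sio" driver hinted at isa0. */
    static void
    example_hint_scan(void)
    {
        int anchor = 0, unit;

        while (resource_find_dev(&anchor, "sio", &unit, "at", "isa0") == 0)
            printf("sio%d is wired at isa0\n", unit);
    }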
diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c
new file mode 100644
index 0000000..b5bfa1f
--- /dev/null
+++ b/sys/kern/subr_kobj.c
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/errno.h>
+#ifndef TEST
+#include <sys/systm.h>
+#endif
+#include <sys/kobj.h>
+
+#ifdef TEST
+#include "usertest.h"
+#endif
+
+static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures");
+
+#ifdef KOBJ_STATS
+
+#include <sys/sysctl.h>
+
+u_int kobj_lookup_hits;
+u_int kobj_lookup_misses;
+
+SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD,
+ &kobj_lookup_hits, 0, "")
+SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD,
+ &kobj_lookup_misses, 0, "")
+
+#endif
+
+static int kobj_next_id = 1;
+
+static int
+kobj_error_method(void)
+{
+ return ENXIO;
+}
+
+static void
+kobj_register_method(struct kobjop_desc *desc)
+{
+ if (desc->id == 0)
+ desc->id = kobj_next_id++;
+}
+
+static void
+kobj_unregister_method(struct kobjop_desc *desc)
+{
+}
+
+static void
+kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops)
+{
+ kobj_method_t *m;
+ int i;
+
+ /*
+ * Don't do anything if we are already compiled.
+ */
+ if (cls->ops)
+ return;
+
+ /*
+ * First register any methods which need it.
+ */
+ for (i = 0, m = cls->methods; m->desc; i++, m++)
+ kobj_register_method(m->desc);
+
+ /*
+ * Then initialise the ops table.
+ */
+ bzero(ops, sizeof(struct kobj_ops));
+ ops->cls = cls;
+ cls->ops = ops;
+}
+
+void
+kobj_class_compile(kobj_class_t cls)
+{
+ kobj_ops_t ops;
+
+ /*
+ * Allocate space for the compiled ops table.
+ */
+ ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT);
+ if (!ops)
+ panic("kobj_compile_methods: out of memory");
+ kobj_class_compile_common(cls, ops);
+}
+
+void
+kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops)
+{
+ /*
+ * Increment refs to make sure that the ops table is not freed.
+ */
+ cls->refs++;
+ kobj_class_compile_common(cls, ops);
+}
+
+void
+kobj_lookup_method(kobj_method_t *methods,
+ kobj_method_t *ce,
+ kobjop_desc_t desc)
+{
+ ce->desc = desc;
+ for (; methods && methods->desc; methods++) {
+ if (methods->desc == desc) {
+ ce->func = methods->func;
+ return;
+ }
+ }
+ if (desc->deflt)
+ ce->func = desc->deflt;
+ else
+ ce->func = kobj_error_method;
+ return;
+}
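
kobj_lookup_method() resolves a call by scanning the class's method table for the matching descriptor, then falling back to the descriptor's default and finally to the error stub. A stripped-down userland sketch of that dispatch-with-default precedence (the types and names are illustrative, not the real kobj structures):

    #include <stdio.h>

    typedef int (*methodfn)(void);

    struct desc { int id; methodfn deflt; };
    struct method { struct desc *desc; methodfn func; };

    static int error_method(void) { return -1; }   /* plays kobj_error_method */
    static int open_impl(void)    { return 1; }
    static int close_deflt(void)  { return 0; }

    /* Table hit first, then the descriptor default, then the error stub. */
    static methodfn
    lookup(struct method *methods, struct desc *desc)
    {
        for (; methods && methods->desc; methods++)
            if (methods->desc == desc)
                return (methods->func);
        return (desc->deflt != NULL ? desc->deflt : error_method);
    }

    static struct desc open_desc  = { 1, NULL };
    static struct desc close_desc = { 2, close_deflt };

    int
    main(void)
    {
        struct method table[] = {
            { &open_desc, open_impl },
            { NULL, NULL },                 /* table terminator */
        };

        printf("%d %d\n", lookup(table, &open_desc)(),
            lookup(table, &close_desc)());  /* 1 0 */
        return (0);
    }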
+
+void
+kobj_class_free(kobj_class_t cls)
+{
+ int i;
+ kobj_method_t *m;
+
+ /*
+ * Unregister any methods which are no longer used.
+ */
+ for (i = 0, m = cls->methods; m->desc; i++, m++)
+ kobj_unregister_method(m->desc);
+
+ /*
+ * Free memory and clean up.
+ */
+ free(cls->ops, M_KOBJ);
+ cls->ops = 0;
+}
+
+kobj_t
+kobj_create(kobj_class_t cls,
+ struct malloc_type *mtype,
+ int mflags)
+{
+ kobj_t obj;
+
+ /*
+ * Allocate and initialise the new object.
+ */
+ obj = malloc(cls->size, mtype, mflags | M_ZERO);
+ if (!obj)
+ return 0;
+ kobj_init(obj, cls);
+
+ return obj;
+}
+
+void
+kobj_init(kobj_t obj, kobj_class_t cls)
+{
+ /*
+ * Consider compiling the class' method table.
+ */
+ if (!cls->ops)
+ kobj_class_compile(cls);
+
+ obj->ops = cls->ops;
+ cls->refs++;
+}
+
+void
+kobj_delete(kobj_t obj, struct malloc_type *mtype)
+{
+ kobj_class_t cls = obj->ops->cls;
+
+ /*
+ * Consider freeing the compiled method table for the class
+ * after its last instance is deleted. As an optimisation, we
+ * should defer this for a short while to avoid thrashing.
+ */
+ cls->refs--;
+ if (!cls->refs)
+ kobj_class_free(cls);
+
+ obj->ops = 0;
+ if (mtype)
+ free(obj, mtype);
+}
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
new file mode 100644
index 0000000..2c01568
--- /dev/null
+++ b/sys/kern/subr_log.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_log.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+/*
+ * Error log buffer for kernel printf's.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/msgbuf.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/poll.h>
+#include <sys/filedesc.h>
+#include <sys/sysctl.h>
+
+#define LOG_RDPRI (PZERO + 1)
+
+#define LOG_ASYNC 0x04
+#define LOG_RDWAIT 0x08
+
+static d_open_t logopen;
+static d_close_t logclose;
+static d_read_t logread;
+static d_ioctl_t logioctl;
+static d_poll_t logpoll;
+
+static void logtimeout(void *arg);
+
+#define CDEV_MAJOR 7
+static struct cdevsw log_cdevsw = {
+ /* open */ logopen,
+ /* close */ logclose,
+ /* read */ logread,
+ /* write */ nowrite,
+ /* ioctl */ logioctl,
+ /* poll */ logpoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "log",
+ /* maj */ CDEV_MAJOR,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ 0,
+};
+
+static struct logsoftc {
+ int sc_state; /* see above for possibilities */
+ struct selinfo sc_selp; /* process waiting on select call */
+ struct sigio *sc_sigio; /* information for async I/O */
+ struct callout sc_callout; /* callout to wakeup syslog */
+} logsoftc;
+
+int log_open; /* also used in log() */
+
+/* Times per second to check for a pending syslog wakeup. */
+static int log_wakeups_per_second = 5;
+SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW,
+ &log_wakeups_per_second, 0, "");
+
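+/*
+ * Usage sketch (illustrative only): a userland reader such as syslogd(8)
+ * typically consumes this device by opening /dev/klog and read(2)ing it in
+ * a loop; each read returns whatever has accumulated in the kernel message
+ * buffer since the previous read:
+ *
+ *	int fd = open("/dev/klog", O_RDONLY);
+ *	char buf[1024];
+ *	ssize_t n;
+ *
+ *	while ((n = read(fd, buf, sizeof(buf))) > 0)
+ *		(hand the bytes to the log processor)
+ */
+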
+/*ARGSUSED*/
+static int
+logopen(dev_t dev, int flags, int mode, struct thread *td)
+{
+ if (log_open)
+ return (EBUSY);
+ log_open = 1;
+ callout_init(&logsoftc.sc_callout, 0);
+ fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */
+ callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second,
+ logtimeout, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logclose(dev_t dev, int flag, int mode, struct thread *td)
+{
+
+ log_open = 0;
+ callout_stop(&logsoftc.sc_callout);
+ logsoftc.sc_state = 0;
+ funsetown(&logsoftc.sc_sigio);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logread(dev_t dev, struct uio *uio, int flag)
+{
+ struct msgbuf *mbp = msgbufp;
+ long l;
+ int s;
+ int error = 0;
+
+ s = splhigh();
+ while (mbp->msg_bufr == mbp->msg_bufx) {
+ if (flag & IO_NDELAY) {
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ logsoftc.sc_state |= LOG_RDWAIT;
+ if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH,
+ "klog", 0))) {
+ splx(s);
+ return (error);
+ }
+ }
+ splx(s);
+ logsoftc.sc_state &= ~LOG_RDWAIT;
+
+ while (uio->uio_resid > 0) {
+ l = mbp->msg_bufx - mbp->msg_bufr;
+ if (l < 0)
+ l = mbp->msg_size - mbp->msg_bufr;
+ l = min(l, uio->uio_resid);
+ if (l == 0)
+ break;
+ error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr,
+ (int)l, uio);
+ if (error)
+ break;
+ mbp->msg_bufr += l;
+ if (mbp->msg_bufr >= mbp->msg_size)
+ mbp->msg_bufr = 0;
+ }
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+logpoll(dev_t dev, int events, struct thread *td)
+{
+ int s;
+ int revents = 0;
+
+ s = splhigh();
+
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (msgbufp->msg_bufr != msgbufp->msg_bufx)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &logsoftc.sc_selp);
+ }
+ splx(s);
+ return (revents);
+}
+
+static void
+logtimeout(void *arg)
+{
+
+ if (!log_open)
+ return;
+ if (msgbuftrigger == 0) {
+ callout_reset(&logsoftc.sc_callout,
+ hz / log_wakeups_per_second, logtimeout, NULL);
+ return;
+ }
+ msgbuftrigger = 0;
+ selwakeup(&logsoftc.sc_selp);
+ if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL)
+ pgsigio(&logsoftc.sc_sigio, SIGIO, 0);
+ if (logsoftc.sc_state & LOG_RDWAIT) {
+ wakeup((caddr_t)msgbufp);
+ logsoftc.sc_state &= ~LOG_RDWAIT;
+ }
+ callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second,
+ logtimeout, NULL);
+}
+
+/*ARGSUSED*/
+static int
+logioctl(dev_t dev, u_long com, caddr_t data, int flag, struct thread *td)
+{
+ long l;
+ int s;
+
+ switch (com) {
+
+ /* return number of characters immediately available */
+ case FIONREAD:
+ s = splhigh();
+ l = msgbufp->msg_bufx - msgbufp->msg_bufr;
+ splx(s);
+ if (l < 0)
+ l += msgbufp->msg_size;
+ *(int *)data = l;
+ break;
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ if (*(int *)data)
+ logsoftc.sc_state |= LOG_ASYNC;
+ else
+ logsoftc.sc_state &= ~LOG_ASYNC;
+ break;
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &logsoftc.sc_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(logsoftc.sc_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &logsoftc.sc_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(logsoftc.sc_sigio);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+static void
+log_drvinit(void *unused)
+{
+
+ make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog");
+}
+
+SYSINIT(logdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, log_drvinit, NULL)
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
new file mode 100644
index 0000000..74e1f56
--- /dev/null
+++ b/sys/kern/subr_mbuf.c
@@ -0,0 +1,1111 @@
+/*-
+ * Copyright (c) 2001
+ * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+/*
+ * Maximum number of PCPU containers. If you know what you're doing you could
+ * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
+ * system during compilation, and thus prevent kernel structure bloat.
+ *
+ * SMP and non-SMP kernels clearly have a different number of possible CPUs,
+ * but because we cannot assume a dense array of CPUs, we always allocate
+ * and traverse up to NCPU PCPU containers and merely check each one for
+ * CPU availability.
+ */
+#ifdef MBALLOC_NCPU
+#define NCPU MBALLOC_NCPU
+#else
+#define NCPU MAXCPU
+#endif
+
+/*-
+ * The mbuf allocator is heavily based on Alfred Perlstein's
+ * (alfred@FreeBSD.org) "memcache" allocator which is itself based
+ * on concepts from several per-CPU memory allocators. The difference
+ * between this allocator and memcache is that, among other things:
+ *
+ * (i) We don't free back to the map from the free() routine - we leave the
+ * option of implementing lazy freeing (from a kproc) in the future.
+ *
+ * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
+ * maximum number of allocatable objects of a given type. Further,
+ * we handle blocking on a cv in the case that the map is starved and
+ * we have to rely solely on cached (circulating) objects.
+ *
+ * The mbuf allocator keeps all objects that it allocates in mb_buckets.
+ * The buckets keep a page worth of objects (an object can be an mbuf or an
+ * mbuf cluster) and facilitate moving larger sets of contiguous objects
+ * from the per-CPU lists to the main list for the given object. The buckets
+ * also have an added advantage in that after several moves from a per-CPU
+ * list to the main list and back to the per-CPU list, contiguous objects
+ * are kept together, thus trying to put the TLB cache to good use.
+ *
+ * The buckets are kept on singly-linked lists called "containers." A container
+ * is protected by a mutex lock in order to ensure consistency. The mutex lock
+ * itself is allocated separately and attached to the container at boot time,
+ * thus allowing for certain containers to share the same mutex lock. Per-CPU
+ * containers for mbufs and mbuf clusters all share the same per-CPU
+ * lock whereas the "general system" containers (i.e., the "main lists") for
+ * these objects share one global lock.
+ */
+struct mb_bucket {
+ SLIST_ENTRY(mb_bucket) mb_blist;
+ int mb_owner;
+ int mb_numfree;
+ void *mb_free[0];
+};
+
+struct mb_container {
+ SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
+ struct mtx *mc_lock;
+ int mc_numowner;
+ u_int mc_starved;
+ long *mc_types;
+ u_long *mc_objcount;
+ u_long *mc_numpgs;
+};
+
+struct mb_gen_list {
+ struct mb_container mb_cont;
+ struct cv mgl_mstarved;
+};
+
+struct mb_pcpu_list {
+ struct mb_container mb_cont;
+};
+
+/*
+ * Boot-time configurable object counts that will determine the maximum
+ * number of permitted objects in the mbuf and mcluster cases. In the
+ * ext counter (nmbcnt) case, it's just an indicator serving to scale
+ * kmem_map size properly - in other words, we may be allowed to allocate
+ * more than nmbcnt counters, whereas we will never be allowed to allocate
+ * more than nmbufs mbufs or nmbclusters mclusters.
+ * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
+ * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
+ */
+#ifndef NMBCLUSTERS
+#define NMBCLUSTERS (1024 + maxusers * 64)
+#endif
+#ifndef NMBUFS
+#define NMBUFS (nmbclusters * 2)
+#endif
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+#ifndef NMBCNTS
+#define NMBCNTS (nmbclusters + nsfbufs)
+#endif
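+
+/*
+ * Worked example (illustrative only): with a hypothetical maxusers of 128,
+ * the defaults above evaluate to
+ *	nmbclusters = 1024 + 128 * 64 = 9216
+ *	nmbufs      = 9216 * 2        = 18432
+ *	nsfbufs     = 512 + 128 * 16  = 2560
+ *	nmbcnt      = 9216 + 2560     = 11776
+ * unless overridden by the loader tunables fetched in tunable_mbinit().
+ */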
+int nmbufs;
+int nmbclusters;
+int nmbcnt;
+int nsfbufs;
+
+/*
+ * Perform sanity checks of tunables declared above.
+ */
+static void
+tunable_mbinit(void *dummy)
+{
+
+ /*
+ * This has to be done before VM init.
+ */
+ nmbclusters = NMBCLUSTERS;
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+ nmbufs = NMBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+ nmbcnt = NMBCNTS;
+ TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
+ /* Sanity checks */
+ if (nmbufs < nmbclusters * 2)
+ nmbufs = nmbclusters * 2;
+ if (nmbcnt < nmbclusters + nsfbufs)
+ nmbcnt = nmbclusters + nsfbufs;
+}
+SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
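+
+/*
+ * Example (illustrative only): the tunables fetched above may be overridden
+ * from loader.conf(5), e.g.:
+ *
+ *	kern.ipc.nmbclusters="32768"
+ *	kern.ipc.nmbufs="65536"
+ *
+ * The sanity checks in tunable_mbinit() still raise nmbufs and nmbcnt if the
+ * requested values are inconsistent with nmbclusters and nsfbufs.
+ */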
+
+/*
+ * The freelist structures and mutex locks. The number statically declared
+ * here depends on the number of CPUs.
+ *
+ * We set things up so that all the objects (mbufs, clusters)
+ * share the same mutex lock. It has been established that we do not benefit
+ * from different locks for different objects, so we use the same lock,
+ * regardless of object type.
+ */
+struct mb_lstmngr {
+ struct mb_gen_list *ml_genlist;
+ struct mb_pcpu_list *ml_cntlst[NCPU];
+ struct mb_bucket **ml_btable;
+ vm_map_t ml_map;
+ vm_offset_t ml_mapbase;
+ vm_offset_t ml_maptop;
+ int ml_mapfull;
+ u_int ml_objsize;
+ u_int *ml_wmhigh;
+};
+static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
+static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
+
+/*
+ * Local macros for internal allocator structure manipulations.
+ */
+#ifdef SMP
+#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
+#else
+#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
+#endif
+
+#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
+
+#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
+
+#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
+
+#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
+ (mb_lst)->ml_cntlst[(num)]
+
+#define MB_BUCKET_INDX(mb_obj, mb_lst) \
+ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+
+#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
+{ \
+ struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
+ \
+ (mb_bckt)->mb_numfree--; \
+ (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
+ (*((mb_lst)->mb_cont.mc_objcount))--; \
+ if ((mb_bckt)->mb_numfree == 0) { \
+ SLIST_REMOVE_HEAD(_mchd, mb_blist); \
+ SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
+ (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
+ } \
+}
+
+#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
+ (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
+ (mb_bckt)->mb_numfree++; \
+ (*((mb_lst)->mb_cont.mc_objcount))++;
+
+#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
+ if ((mb_type) != MT_NOTMBUF) \
+ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
+
+#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
+ if ((mb_type) != MT_NOTMBUF) \
+ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
+
+/*
+ * Ownership of buckets/containers is represented by integers. The PCPU
+ * lists range from 0 to NCPU-1. We need a free numerical id for the general
+ * list (we use NCPU). We also need a non-conflicting free bit to indicate
+ * that the bucket is free and removed from a container, while not losing
+ * the bucket's originating container id. We use the highest bit
+ * for the free marker.
+ */
+#define MB_GENLIST_OWNER (NCPU)
+#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
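+
+/*
+ * Worked example (illustrative only): with 32-bit ints, MB_BUCKET_FREE is
+ * 0x80000000, so a free bucket that originated in CPU 2's container carries
+ * mb_owner == 0x80000002; masking with ~MB_BUCKET_FREE recovers the
+ * originating container id (2).
+ */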
+
+/* Statistics structures for allocator (per-CPU and general). */
+static struct mbpstat mb_statpcpu[NCPU + 1];
+struct mbstat mbstat;
+
+/* Sleep time for wait code (in ticks). */
+static int mbuf_wait = 64;
+
+static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */
+static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */
+
+/*
+ * Objects exported by sysctl(8).
+ */
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
+ "Maximum number of mbuf clusters available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
+ "Maximum number of mbufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
+ "Number used to scale kmem_map to ensure sufficient space for counters");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
+ "Sleep time of mbuf subsystem wait allocations during exhaustion");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
+ "Upper limit of number of mbufs allowed on each PCPU list");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
+ "Upper limit of number of mbuf clusters allowed on each PCPU list");
+SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
+ "Mbuf general information and statistics");
+SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
+ sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
+
+/*
+ * Prototypes of local allocator routines.
+ */
+static void *mb_alloc_wait(struct mb_lstmngr *, short);
+static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
+ struct mb_pcpu_list *);
+static void mb_reclaim(void);
+static void mbuf_init(void *);
+
+/*
+ * Initial allocation numbers. Each parameter represents the number of buckets
+ * of each object that will be placed initially in each PCPU container for
+ * said object.
+ */
+#define NMB_MBUF_INIT 4
+#define NMB_CLUST_INIT 16
+
+/*
+ * Initialize the mbuf subsystem.
+ *
+ * We sub-divide the kmem_map into several submaps; this way, we don't have
+ * to worry about artificially limiting the number of mbuf or mbuf cluster
+ * allocations, due to fear of one type of allocation "stealing" address
+ * space initially reserved for another.
+ *
+ * Set up both the general containers and all the PCPU containers. Populate
+ * the PCPU containers with initial numbers.
+ */
+MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
+void
+mbuf_init(void *dummy)
+{
+ struct mb_pcpu_list *pcpu_cnt;
+ vm_size_t mb_map_size;
+ int i, j;
+
+ /*
+ * Set up all the submaps, for each type of object that we deal
+ * with in this allocator.
+ */
+ mb_map_size = (vm_size_t)(nmbufs * MSIZE);
+ mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
+ mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
+ sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ if (mb_list_mbuf.ml_btable == NULL)
+ goto bad;
+ mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
+ &(mb_list_mbuf.ml_maptop), mb_map_size);
+ mb_list_mbuf.ml_map->system_map = 1;
+ mb_list_mbuf.ml_mapfull = 0;
+ mb_list_mbuf.ml_objsize = MSIZE;
+ mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+
+ mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
+ mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
+ mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
+ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ if (mb_list_clust.ml_btable == NULL)
+ goto bad;
+ mb_list_clust.ml_map = kmem_suballoc(kmem_map,
+ &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
+ mb_map_size);
+ mb_list_clust.ml_map->system_map = 1;
+ mb_list_clust.ml_mapfull = 0;
+ mb_list_clust.ml_objsize = MCLBYTES;
+ mb_list_clust.ml_wmhigh = &clust_limit;
+
+ /*
+ * Allocate required general (global) containers for each object type.
+ */
+ mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
+ M_NOWAIT);
+ mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
+ M_NOWAIT);
+ if ((mb_list_mbuf.ml_genlist == NULL) ||
+ (mb_list_clust.ml_genlist == NULL))
+ goto bad;
+
+ /*
+ * Initialize condition variables and general container mutex locks.
+ */
+ mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0);
+ cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
+ cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
+ "mcluster pool starved");
+ mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
+ mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
+
+ /*
+ * Set up the general containers for each object.
+ */
+ mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
+ mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
+ mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
+ mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
+ mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
+ mb_list_clust.ml_genlist->mb_cont.mc_objcount =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
+ mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
+ mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
+ mb_list_mbuf.ml_genlist->mb_cont.mc_types =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
+ mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
+ SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
+ SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
+
+ /*
+ * Initialize general mbuf statistics.
+ */
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+ mbstat.m_numtypes = MT_NTYPES;
+
+ /*
+ * Allocate and initialize PCPU containers.
+ */
+ for (i = 0; i < NCPU; i++) {
+ if (CPU_ABSENT(i))
+ continue;
+
+ mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+ M_MBUF, M_NOWAIT);
+ mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+ M_MBUF, M_NOWAIT);
+ if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
+ (mb_list_clust.ml_cntlst[i] == NULL))
+ goto bad;
+
+ mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
+
+ mb_statpcpu[i].mb_active = 1;
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
+ &(mb_statpcpu[i].mb_mbfree);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
+ &(mb_statpcpu[i].mb_clfree);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
+ &(mb_statpcpu[i].mb_mbpgs);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
+ &(mb_statpcpu[i].mb_clpgs);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
+ &(mb_statpcpu[i].mb_mbtypes[0]);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
+
+ SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
+ SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
+
+ /*
+ * Perform initial allocations.
+ */
+ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
+ MB_LOCK_CONT(pcpu_cnt);
+ for (j = 0; j < NMB_MBUF_INIT; j++) {
+ if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
+ == NULL)
+ goto bad;
+ }
+ MB_UNLOCK_CONT(pcpu_cnt);
+
+ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
+ MB_LOCK_CONT(pcpu_cnt);
+ for (j = 0; j < NMB_CLUST_INIT; j++) {
+ if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
+ == NULL)
+ goto bad;
+ }
+ MB_UNLOCK_CONT(pcpu_cnt);
+ }
+
+ return;
+bad:
+ panic("mbuf_init(): failed to initialize mbuf subsystem!");
+}
+
+/*
+ * Populate a given mbuf PCPU container with a bucket full of fresh new
+ * buffers. Return a pointer to the new bucket (already in the container if
+ * successful), or return NULL on failure.
+ *
+ * LOCKING NOTES:
+ * PCPU container lock must be held when this is called.
+ * The lock is dropped here so that we can cleanly call the underlying VM
+ * code. If we fail, we return with no locks held. If we succeed (i.e., return
+ * non-NULL), we return with the PCPU lock held, ready for allocation from
+ * the returned bucket.
+ */
+static struct mb_bucket *
+mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
+{
+ struct mb_bucket *bucket;
+ caddr_t p;
+ int i;
+
+ MB_UNLOCK_CONT(cnt_lst);
+ /*
+ * If our object's (finite) map is starved (i.e., no more address
+ * space), bail out now.
+ */
+ if (mb_list->ml_mapfull)
+ return (NULL);
+
+ bucket = malloc(sizeof(struct mb_bucket) +
+ PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+ how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+ if (bucket == NULL)
+ return (NULL);
+
+ p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
+ how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+ if (p == NULL) {
+ free(bucket, M_MBUF);
+ if (how == M_TRYWAIT)
+ mb_list->ml_mapfull = 1;
+ return (NULL);
+ }
+
+ bucket->mb_numfree = 0;
+ mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
+ for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+ bucket->mb_free[i] = p;
+ bucket->mb_numfree++;
+ p += mb_list->ml_objsize;
+ }
+
+ MB_LOCK_CONT(cnt_lst);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
+ (*(cnt_lst->mb_cont.mc_numpgs))++;
+ *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
+
+ return (bucket);
+}
+
+/*
+ * Allocate an mbuf-subsystem type object.
+ * The general case is very easy. Complications only arise if our PCPU
+ * container is empty. Things get worse if, in addition, the general
+ * container is empty and we've run out of address space
+ * in our map; then we try to block if we're willing to (M_TRYWAIT).
+ */
+static __inline
+void *
+mb_alloc(struct mb_lstmngr *mb_list, int how, short type)
+{
+ static int last_report;
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_bucket *bucket;
+ void *m;
+
+ m = NULL;
+ cnt_lst = MB_GET_PCPU_LIST(mb_list);
+ MB_LOCK_CONT(cnt_lst);
+
+ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
+ /*
+ * This is the easy allocation case. We just grab an object
+ * from a bucket in the PCPU container. At worst, we
+ * have just emptied the bucket and so we remove it
+ * from the container.
+ */
+ MB_GET_OBJECT(m, bucket, cnt_lst);
+ MB_MBTYPES_INC(cnt_lst, type, 1);
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ struct mb_gen_list *gen_list;
+
+ /*
+ * This is the less common, more difficult case. We must
+ * first verify if the general list has anything for us
+ * and if that also fails, we must allocate a page from
+ * the map and create a new bucket to place in our PCPU
+ * container (already locked). If the map is starved then
+ * we're really in for trouble, as we have to wait on
+ * the general container's condition variable.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+
+ if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
+ != NULL) {
+ /*
+ * Give ownership of the bucket to our CPU's
+ * container, but only actually put the bucket
+ * in the container if it doesn't become free
+ * upon removing an mbuf from it.
+ */
+ SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
+ mb_blist);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ (*(gen_list->mb_cont.mc_numpgs))--;
+ (*(cnt_lst->mb_cont.mc_numpgs))++;
+ *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
+ bucket->mb_numfree--;
+ m = bucket->mb_free[(bucket->mb_numfree)];
+ if (bucket->mb_numfree == 0) {
+ SLIST_NEXT(bucket, mb_blist) = NULL;
+ bucket->mb_owner |= MB_BUCKET_FREE;
+ } else {
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ *(cnt_lst->mb_cont.mc_objcount) +=
+ bucket->mb_numfree;
+ }
+ MB_UNLOCK_CONT(gen_list);
+ MB_MBTYPES_INC(cnt_lst, type, 1);
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ /*
+ * We'll have to allocate a new page.
+ */
+ MB_UNLOCK_CONT(gen_list);
+ bucket = mb_pop_cont(mb_list, how, cnt_lst);
+ if (bucket != NULL) {
+ MB_GET_OBJECT(m, bucket, cnt_lst);
+ MB_MBTYPES_INC(cnt_lst, type, 1);
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ if (how == M_TRYWAIT) {
+ /*
+ * Absolute worst-case scenario.
+ * We block if we're willing to, but
+ * only after trying to steal from
+ * other lists.
+ */
+ m = mb_alloc_wait(mb_list, type);
+ } else {
+ /* XXX: No consistency. */
+ mbstat.m_drops++;
+
+ if (ticks < last_report ||
+ (ticks - last_report) >= hz) {
+ last_report = ticks;
+ printf(
+"All mbufs exhausted, please see tuning(7).\n");
+/* XXX: Actually could be clusters, but it gets the point across. */
+ }
+
+ }
+ }
+ }
+ }
+
+ return (m);
+}
+
+/*
+ * This is the worst-case scenario called only if we're allocating with
+ * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
+ * by looking in every PCPU container. If we're still unsuccessful, we
+ * try the general container one last time and possibly block on our
+ * starved cv.
+ */
+static void *
+mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
+{
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_gen_list *gen_list;
+ struct mb_bucket *bucket;
+ void *m;
+ int i, cv_ret;
+
+ /*
+ * Try to reclaim mbuf-related objects (mbufs, clusters).
+ */
+ mb_reclaim();
+
+ /*
+ * Cycle all the PCPU containers. Increment starved counts if found
+ * empty.
+ */
+ for (i = 0; i < NCPU; i++) {
+ if (CPU_ABSENT(i))
+ continue;
+ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
+ MB_LOCK_CONT(cnt_lst);
+
+ /*
+ * If container is non-empty, get a single object from it.
+ * If empty, increment starved count.
+ */
+ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
+ NULL) {
+ MB_GET_OBJECT(m, bucket, cnt_lst);
+ MB_MBTYPES_INC(cnt_lst, type, 1);
+ MB_UNLOCK_CONT(cnt_lst);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ return (m);
+ } else
+ cnt_lst->mb_cont.mc_starved++;
+
+ MB_UNLOCK_CONT(cnt_lst);
+ }
+
+ /*
+ * We're still here, so that means it's time to get the general
+ * container lock, check it one more time (now that mb_reclaim()
+ * has been called) and if we still get nothing, block on the cv.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
+ MB_GET_OBJECT(m, bucket, gen_list);
+ MB_MBTYPES_INC(gen_list, type, 1);
+ MB_UNLOCK_CONT(gen_list);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ return (m);
+ }
+
+ gen_list->mb_cont.mc_starved++;
+ cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
+ gen_list->mb_cont.mc_lock, mbuf_wait);
+ gen_list->mb_cont.mc_starved--;
+
+ if ((cv_ret == 0) &&
+ ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
+ MB_GET_OBJECT(m, bucket, gen_list);
+ MB_MBTYPES_INC(gen_list, type, 1);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ } else {
+ mbstat.m_drops++; /* XXX: No consistency. */
+ m = NULL;
+ }
+
+ MB_UNLOCK_CONT(gen_list);
+
+ return (m);
+}
+
+/*-
+ * Free an object to its rightful container.
+ * In the very general case, this operation is really very easy.
+ * Complications arise primarily if:
+ * (a) We've hit the high limit on number of free objects allowed in
+ * our PCPU container.
+ * (b) We're in a critical situation where our container has been
+ * marked 'starved' and we need to issue wakeups on the starved
+ * condition variable.
+ * (c) Minor (odd) cases: our bucket has migrated while we were
+ * waiting for the lock; our bucket is in the general container;
+ * our bucket is empty.
+ */
+static __inline
+void
+mb_free(struct mb_lstmngr *mb_list, void *m, short type)
+{
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_gen_list *gen_list;
+ struct mb_bucket *bucket;
+ u_int owner;
+
+ bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
+
+ /*
+ * Make sure that if after we lock the bucket's present container the
+ * bucket has migrated, that we drop the lock and get the new one.
+ */
+retry_lock:
+ owner = bucket->mb_owner & ~MB_BUCKET_FREE;
+ switch (owner) {
+ case MB_GENLIST_OWNER:
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
+ MB_UNLOCK_CONT(gen_list);
+ goto retry_lock;
+ }
+
+ /*
+ * If we're intended for the general container, this is
+ * easy: no migration is required. The only `bogon'
+ * is that we're now contending with all the threads
+ * dealing with the general list, but this is expected.
+ */
+ MB_PUT_OBJECT(m, bucket, gen_list);
+ MB_MBTYPES_DEC(gen_list, type, 1);
+ if (gen_list->mb_cont.mc_starved > 0)
+ cv_signal(&(gen_list->mgl_mstarved));
+ MB_UNLOCK_CONT(gen_list);
+ break;
+
+ default:
+ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
+ MB_LOCK_CONT(cnt_lst);
+ if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
+ MB_UNLOCK_CONT(cnt_lst);
+ goto retry_lock;
+ }
+
+ MB_PUT_OBJECT(m, bucket, cnt_lst);
+ MB_MBTYPES_DEC(cnt_lst, type, 1);
+
+ if (cnt_lst->mb_cont.mc_starved > 0) {
+ /*
+ * This is a tough case. It means that we've
+ * been flagged at least once to indicate that
+ * we're empty, and that the system is in a critical
+ * situation, so we ought to migrate at least one
+ * bucket over to the general container.
+ * There may or may not be a thread blocking on
+ * the starved condition variable, but chances
+ * are that one will come along soon, so
+ * it's better to migrate now than never.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0,
+ ("mb_free: corrupt bucket %p\n", bucket));
+ SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = MB_GENLIST_OWNER;
+ (*(cnt_lst->mb_cont.mc_objcount))--;
+ (*(gen_list->mb_cont.mc_objcount))++;
+ (*(cnt_lst->mb_cont.mc_numpgs))--;
+ (*(gen_list->mb_cont.mc_numpgs))++;
+
+ /*
+ * Determine whether or not to keep transferring
+ * buckets to the general list or whether we've
+ * transferred enough already.
+ * We realize that we may flag another bucket
+ * for migration to the general container even
+ * though, in the meantime, the thread that was
+ * blocked on the cv has already woken up and is
+ * long gone. But in that case, the worst
+ * consequence is that we will end up migrating
+ * one bucket too many, which is really not a big
+ * deal, especially if we're close to a critical
+ * situation.
+ */
+ if (gen_list->mb_cont.mc_starved > 0) {
+ cnt_lst->mb_cont.mc_starved--;
+ cv_signal(&(gen_list->mgl_mstarved));
+ } else
+ cnt_lst->mb_cont.mc_starved = 0;
+
+ MB_UNLOCK_CONT(gen_list);
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+
+ if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) {
+ /*
+ * We've hit the high limit of allowed numbers of mbufs
+ * on this PCPU list. We must now migrate a bucket
+ * over to the general container.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
+ bucket =
+ SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
+ SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ mb_blist);
+ }
+ SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = MB_GENLIST_OWNER;
+ *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
+ *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
+ (*(cnt_lst->mb_cont.mc_numpgs))--;
+ (*(gen_list->mb_cont.mc_numpgs))++;
+
+ /*
+ * While we're at it, transfer some of the mbtypes
+ * "count load" onto the general list's mbtypes
+ * array, seeing as how we're moving the bucket
+ * there now, meaning that the freeing of objects
+ * there will now decrement the _general list's_
+ * mbtypes counters, and no longer our PCPU list's
+ * mbtypes counters. We do this for the type presently
+ * being freed in an effort to keep the mbtypes
+ * counters approximately balanced across all lists.
+ */
+ MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
+ mb_list->ml_objsize) - bucket->mb_numfree);
+ MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
+ mb_list->ml_objsize) - bucket->mb_numfree);
+
+ MB_UNLOCK_CONT(gen_list);
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+
+ if (bucket->mb_owner & MB_BUCKET_FREE) {
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ }
+
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+}
+
+/*
+ * Drain protocols in hopes to free up some resources.
+ *
+ * LOCKING NOTES:
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * violation if we're holding any mutex if that mutex is acquired in reverse
+ * order relative to one of the locks in the drain routines.
+ */
+static void
+mb_reclaim(void)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+/*
+ * XXX: Argh, we almost always trip here with witness turned on now-a-days
+ * XXX: because we often come in with Giant held. For now, there's no way
+ * XXX: to avoid this.
+ */
+#ifdef WITNESS
+ KASSERT(witness_list(curthread) == 0,
+ ("mb_reclaim() called with locks held"));
+#endif
+
+ mbstat.m_drain++; /* XXX: No consistency. */
+
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
+
+/*
+ * Local mbuf & cluster alloc macros and routines.
+ * Local macro and function names begin with an underscore ("_").
+ */
+static void _mclfree(struct mbuf *);
+
+#define _m_get(m, how, type) do { \
+ (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \
+ if ((m) != NULL) { \
+ (m)->m_type = (type); \
+ (m)->m_next = NULL; \
+ (m)->m_nextpkt = NULL; \
+ (m)->m_data = (m)->m_dat; \
+ (m)->m_flags = 0; \
+ } \
+} while (0)
+
+#define _m_gethdr(m, how, type) do { \
+ (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \
+ if ((m) != NULL) { \
+ (m)->m_type = (type); \
+ (m)->m_next = NULL; \
+ (m)->m_nextpkt = NULL; \
+ (m)->m_data = (m)->m_pktdat; \
+ (m)->m_flags = M_PKTHDR; \
+ (m)->m_pkthdr.rcvif = NULL; \
+ (m)->m_pkthdr.csum_flags = 0; \
+ (m)->m_pkthdr.aux = NULL; \
+ } \
+} while (0)
+
+/* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME). */
+#define _m_free(m, n) do { \
+ (n) = (m)->m_next; \
+ if ((m)->m_flags & M_EXT) \
+ MEXTFREE((m)); \
+ if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \
+ m_freem((m)->m_pkthdr.aux); \
+ (m)->m_pkthdr.aux = NULL; \
+ } \
+ mb_free(&mb_list_mbuf, (m), (m)->m_type); \
+} while (0)
+
+#define _mext_init_ref(m) do { \
+ (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT); \
+ if ((m)->m_ext.ref_cnt != NULL) { \
+ *((m)->m_ext.ref_cnt) = 0; \
+ MEXT_ADD_REF((m)); \
+ } \
+} while (0)
+
+#define _mext_dealloc_ref(m) \
+ free((m)->m_ext.ref_cnt, M_MBUF)
+
+void
+_mext_free(struct mbuf *mb)
+{
+
+ if (mb->m_ext.ext_type == EXT_CLUSTER)
+ mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF);
+ else
+ (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
+ _mext_dealloc_ref(mb);
+}
+
+/*
+ * We only include this here to avoid making m_clget() excessively large
+ * due to too much inlined code.
+ */
+static void
+_mclfree(struct mbuf *mb)
+{
+
+ mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF);
+ mb->m_ext.ext_buf = NULL;
+}
+
+/*
+ * Exported space allocation and de-allocation routines.
+ */
+struct mbuf *
+m_get(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_get(mb, how, type);
+ return (mb);
+}
+
+struct mbuf *
+m_gethdr(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_gethdr(mb, how, type);
+ return (mb);
+}
+
+struct mbuf *
+m_get_clrd(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_get(mb, how, type);
+ if (mb != NULL)
+ bzero(mtod(mb, caddr_t), MLEN);
+ return (mb);
+}
+
+struct mbuf *
+m_gethdr_clrd(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_gethdr(mb, how, type);
+ if (mb != NULL)
+ bzero(mtod(mb, caddr_t), MHLEN);
+ return (mb);
+}
+
+struct mbuf *
+m_free(struct mbuf *mb)
+{
+ struct mbuf *nb;
+
+ _m_free(mb, nb);
+ return (nb);
+}
+
+void
+m_clget(struct mbuf *mb, int how)
+{
+
+ mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF);
+ if (mb->m_ext.ext_buf != NULL) {
+ _mext_init_ref(mb);
+ if (mb->m_ext.ref_cnt == NULL)
+ _mclfree(mb);
+ else {
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_flags |= M_EXT;
+ mb->m_ext.ext_free = NULL;
+ mb->m_ext.ext_args = NULL;
+ mb->m_ext.ext_size = MCLBYTES;
+ mb->m_ext.ext_type = EXT_CLUSTER;
+ }
+ }
+}
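+
+/*
+ * Usage sketch (illustrative only): a typical consumer, such as a network
+ * driver receive path, combines the routines above roughly as follows:
+ *
+ *	struct mbuf *m;
+ *
+ *	m = m_gethdr(M_DONTWAIT, MT_DATA);
+ *	if (m == NULL)
+ *		return (ENOBUFS);
+ *	m_clget(m, M_DONTWAIT);
+ *	if ((m->m_flags & M_EXT) == 0) {
+ *		m_free(m);
+ *		return (ENOBUFS);
+ *	}
+ *	(copy or DMA the frame into mtod(m, caddr_t) and set m->m_len)
+ */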
+
+void
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ void (*freef)(void *, void *), void *args, short flags, int type)
+{
+
+ _mext_init_ref(mb);
+ if (mb->m_ext.ref_cnt != NULL) {
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_args = args;
+ mb->m_ext.ext_type = type;
+ }
+}
+
+/*
+ * Change type for mbuf `mb'; this is a relatively expensive operation and
+ * should be avoided.
+ */
+void
+m_chtype(struct mbuf *mb, short new_type)
+{
+ struct mb_gen_list *gen_list;
+
+ gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
+ MB_LOCK_CONT(gen_list);
+ MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
+ MB_MBTYPES_INC(gen_list, new_type, 1);
+ MB_UNLOCK_CONT(gen_list);
+ mb->m_type = new_type;
+}
diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c
new file mode 100644
index 0000000..1a8c4bd
--- /dev/null
+++ b/sys/kern/subr_mchain.c
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2000, 2001 Boris Popov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Boris Popov.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/uio.h>
+
+#include <sys/mchain.h>
+
+MODULE_VERSION(libmchain, 1);
+
+#define MBERROR(format, args...) printf("%s(%d): "format, __func__ , \
+ __LINE__ ,## args)
+
+#define MBPANIC(format, args...) printf("%s(%d): "format, __func__ , \
+ __LINE__ ,## args)
+
+/*
+ * Various helper functions
+ */
+int
+m_fixhdr(struct mbuf *m0)
+{
+ struct mbuf *m = m0;
+ int len = 0;
+
+ while (m) {
+ len += m->m_len;
+ m = m->m_next;
+ }
+ m0->m_pkthdr.len = len;
+ return len;
+}
+
+int
+mb_init(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ if (m == NULL)
+ return ENOBUFS;
+ m->m_len = 0;
+ mb_initm(mbp, m);
+ return 0;
+}
+
+void
+mb_initm(struct mbchain *mbp, struct mbuf *m)
+{
+ bzero(mbp, sizeof(*mbp));
+ mbp->mb_top = mbp->mb_cur = m;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+}
+
+void
+mb_done(struct mbchain *mbp)
+{
+ if (mbp->mb_top) {
+ m_freem(mbp->mb_top);
+ mbp->mb_top = NULL;
+ }
+}
+
+struct mbuf *
+mb_detach(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = mbp->mb_top;
+ mbp->mb_top = NULL;
+ return m;
+}
+
+int
+mb_fixhdr(struct mbchain *mbp)
+{
+ return mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top);
+}
+
+/*
+ * Check whether an object of size 'size' fits at the current position, and
+ * allocate a new mbuf if not. Advance pointers and increase the length of
+ * the mbuf(s). Return a pointer to the object placeholder, or NULL if any
+ * error occurred.
+ * Note: size should be <= MLEN
+ */
+caddr_t
+mb_reserve(struct mbchain *mbp, int size)
+{
+ struct mbuf *m, *mn;
+ caddr_t bpos;
+
+ if (size > MLEN)
+ panic("mb_reserve: size = %d\n", size);
+ m = mbp->mb_cur;
+ if (mbp->mb_mleft < size) {
+ mn = m_get(M_TRYWAIT, MT_DATA);
+ if (mn == NULL)
+ return NULL;
+ mbp->mb_cur = m->m_next = mn;
+ m = mn;
+ m->m_len = 0;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ }
+ mbp->mb_mleft -= size;
+ mbp->mb_count += size;
+ bpos = mtod(m, caddr_t) + m->m_len;
+ m->m_len += size;
+ return bpos;
+}
+
+int
+mb_put_uint8(struct mbchain *mbp, u_int8_t x)
+{
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_uint16be(struct mbchain *mbp, u_int16_t x)
+{
+ x = htobes(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_uint16le(struct mbchain *mbp, u_int16_t x)
+{
+ x = htoles(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_uint32be(struct mbchain *mbp, u_int32_t x)
+{
+ x = htobel(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_uint32le(struct mbchain *mbp, u_int32_t x)
+{
+ x = htolel(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_int64be(struct mbchain *mbp, int64_t x)
+{
+ x = htobeq(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_int64le(struct mbchain *mbp, int64_t x)
+{
+ x = htoleq(x);
+ return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+}
+
+int
+mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type)
+{
+ struct mbuf *m;
+ caddr_t dst;
+ c_caddr_t src;
+ int cplen, error, mleft, count;
+
+ m = mbp->mb_cur;
+ mleft = mbp->mb_mleft;
+
+ while (size > 0) {
+ if (mleft == 0) {
+ if (m->m_next == NULL) {
+ m = m_getm(m, size, M_TRYWAIT, MT_DATA);
+ if (m == NULL)
+ return ENOBUFS;
+ }
+ m = m->m_next;
+ mleft = M_TRAILINGSPACE(m);
+ continue;
+ }
+ cplen = mleft > size ? size : mleft;
+ dst = mtod(m, caddr_t) + m->m_len;
+ switch (type) {
+ case MB_MCUSTOM:
+ error = mbp->mb_copy(mbp, source, dst, cplen);
+ if (error)
+ return error;
+ break;
+ case MB_MINLINE:
+ for (src = source, count = cplen; count; count--)
+ *dst++ = *src++;
+ break;
+ case MB_MSYSTEM:
+ bcopy(source, dst, cplen);
+ break;
+ case MB_MUSER:
+ error = copyin(source, dst, cplen);
+ if (error)
+ return error;
+ break;
+ case MB_MZERO:
+ bzero(dst, cplen);
+ break;
+ }
+ size -= cplen;
+ source += cplen;
+ m->m_len += cplen;
+ mleft -= cplen;
+ mbp->mb_count += cplen;
+ }
+ mbp->mb_cur = m;
+ mbp->mb_mleft = mleft;
+ return 0;
+}
+
+int
+mb_put_mbuf(struct mbchain *mbp, struct mbuf *m)
+{
+ mbp->mb_cur->m_next = m;
+ while (m) {
+ mbp->mb_count += m->m_len;
+ if (m->m_next == NULL)
+ break;
+ m = m->m_next;
+ }
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ mbp->mb_cur = m;
+ return 0;
+}
+
+/*
+ * Copy a uio scatter/gather list to an mbuf chain.
+ */
+int
+mb_put_uio(struct mbchain *mbp, struct uio *uiop, int size)
+{
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return EFBIG;
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ if (left > size)
+ left = size;
+ error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype);
+ if (error)
+ return error;
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base += left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return 0;
+}
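+
+/*
+ * Usage sketch (illustrative only): a caller marshalling a little-endian
+ * message with the put routines above might do, roughly:
+ *
+ *	struct mbchain mb;
+ *	int error;
+ *
+ *	error = mb_init(&mb);
+ *	if (error)
+ *		return (error);
+ *	mb_put_uint32le(&mb, MY_MAGIC);		(MY_MAGIC is hypothetical)
+ *	mb_put_uint16le(&mb, payload_len);
+ *	mb_put_mem(&mb, payload, payload_len, MB_MSYSTEM);
+ *	mb_fixhdr(&mb);
+ *	(hand mb_detach(&mb) to the transport, or mb_done(&mb) on error)
+ */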
+
+/*
+ * Routines for fetching data from an mbuf chain
+ */
+int
+md_init(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ if (m == NULL)
+ return ENOBUFS;
+ m->m_len = 0;
+ md_initm(mdp, m);
+ return 0;
+}
+
+void
+md_initm(struct mdchain *mdp, struct mbuf *m)
+{
+ bzero(mdp, sizeof(*mdp));
+ mdp->md_top = mdp->md_cur = m;
+ mdp->md_pos = mtod(m, u_char*);
+}
+
+void
+md_done(struct mdchain *mdp)
+{
+ if (mdp->md_top) {
+ m_freem(mdp->md_top);
+ mdp->md_top = NULL;
+ }
+}
+
+/*
+ * Append a separate mbuf chain. It is the caller's responsibility to prevent
+ * multiple calls to the fetch/record routines.
+ */
+void
+md_append_record(struct mdchain *mdp, struct mbuf *top)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL) {
+ md_initm(mdp, top);
+ return;
+ }
+ m = mdp->md_top;
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ m->m_nextpkt = top;
+ top->m_nextpkt = NULL;
+ return;
+}
+
+/*
+ * Put the next record in place of the existing one.
+ */
+int
+md_next_record(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL)
+ return ENOENT;
+ m = mdp->md_top->m_nextpkt;
+ md_done(mdp);
+ if (m == NULL)
+ return ENOENT;
+ md_initm(mdp, m);
+ return 0;
+}
+
+int
+md_get_uint8(struct mdchain *mdp, u_int8_t *x)
+{
+ return md_get_mem(mdp, x, 1, MB_MINLINE);
+}
+
+int
+md_get_uint16(struct mdchain *mdp, u_int16_t *x)
+{
+ return md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE);
+}
+
+int
+md_get_uint16le(struct mdchain *mdp, u_int16_t *x)
+{
+ u_int16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ *x = letohs(v);
+ return error;
+}
+
+int
+md_get_uint16be(struct mdchain *mdp, u_int16_t *x)
+{
+ u_int16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ *x = betohs(v);
+ return error;
+}
+
+int
+md_get_uint32(struct mdchain *mdp, u_int32_t *x)
+{
+ return md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE);
+}
+
+int
+md_get_uint32be(struct mdchain *mdp, u_int32_t *x)
+{
+ u_int32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ *x = betohl(v);
+ return error;
+}
+
+int
+md_get_uint32le(struct mdchain *mdp, u_int32_t *x)
+{
+ u_int32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ *x = letohl(v);
+ return error;
+}
+
+int
+md_get_int64(struct mdchain *mdp, int64_t *x)
+{
+ return md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE);
+}
+
+int
+md_get_int64be(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ *x = betohq(v);
+ return error;
+}
+
+int
+md_get_int64le(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ *x = letohq(v);
+ return error;
+}
+
+int
+md_get_mem(struct mdchain *mdp, caddr_t target, int size, int type)
+{
+ struct mbuf *m = mdp->md_cur;
+ int error;
+ u_int count;
+ u_char *s;
+
+ while (size > 0) {
+ if (m == NULL) {
+ MBERROR("incomplete copy\n");
+ return EBADRPC;
+ }
+ s = mdp->md_pos;
+ count = mtod(m, u_char*) + m->m_len - s;
+ if (count == 0) {
+ mdp->md_cur = m = m->m_next;
+ if (m)
+ s = mdp->md_pos = mtod(m, caddr_t);
+ continue;
+ }
+ if (count > size)
+ count = size;
+ size -= count;
+ mdp->md_pos += count;
+ if (target == NULL)
+ continue;
+ switch (type) {
+ case MB_MUSER:
+ error = copyout(s, target, count);
+ if (error)
+ return error;
+ break;
+ case MB_MSYSTEM:
+ bcopy(s, target, count);
+ break;
+ case MB_MINLINE:
+ while (count--)
+ *target++ = *s++;
+ continue;
+ }
+ target += count;
+ }
+ return 0;
+}
+
+int
+md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret)
+{
+ struct mbuf *m = mdp->md_cur, *rm;
+
+ rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_TRYWAIT);
+ if (rm == NULL)
+ return EBADRPC;
+ md_get_mem(mdp, NULL, size, MB_MZERO);
+ *ret = rm;
+ return 0;
+}
+
+int
+md_get_uio(struct mdchain *mdp, struct uio *uiop, int size)
+{
+ char *uiocp;
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return EFBIG;
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ uiocp = uiop->uio_iov->iov_base;
+ if (left > size)
+ left = size;
+ error = md_get_mem(mdp, uiocp, left, mtype);
+ if (error)
+ return error;
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base += left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return 0;
+}
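+
+/*
+ * Usage sketch (illustrative only): the fetch side mirrors the put side; a
+ * caller decoding the same hypothetical little-endian header from a received
+ * mbuf chain 'm' might do, roughly:
+ *
+ *	struct mdchain md;
+ *	u_int32_t magic;
+ *	u_int16_t len;
+ *	int error;
+ *
+ *	md_initm(&md, m);
+ *	error = md_get_uint32le(&md, &magic);
+ *	if (error == 0)
+ *		error = md_get_uint16le(&md, &len);
+ *	md_done(&md);
+ */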
diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c
new file mode 100644
index 0000000..ce74eca
--- /dev/null
+++ b/sys/kern/subr_module.c
@@ -0,0 +1,266 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/linker.h>
+
+/*
+ * Preloaded module support
+ */
+
+caddr_t preload_metadata;
+
+/*
+ * Search for the preloaded module (name)
+ */
+caddr_t
+preload_search_by_name(const char *name)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if ((hdr[0] == MODINFO_NAME) &&
+ !strcmp(name, curp + sizeof(u_int32_t) * 2))
+ return(curp);
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Search for the first preloaded module of (type)
+ */
+caddr_t
+preload_search_by_type(const char *type)
+{
+ caddr_t curp, lname;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ lname = NULL;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* remember the start of each record */
+ if (hdr[0] == MODINFO_NAME)
+ lname = curp;
+
+ /* Search for a MODINFO_TYPE field */
+ if ((hdr[0] == MODINFO_TYPE) &&
+ !strcmp(type, curp + sizeof(u_int32_t) * 2))
+ return(lname);
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Walk through the preloaded module list
+ */
+caddr_t
+preload_search_next_name(caddr_t base)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ /* Pick up where we left off last time */
+ if (base) {
+ /* skip to next field */
+ curp = base;
+ hdr = (u_int32_t *)curp;
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ } else
+ curp = preload_metadata;
+
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Found a new record? */
+ if (hdr[0] == MODINFO_NAME)
+ return curp;
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Given a preloaded module handle (mod), return a pointer
+ * to the data for the attribute (inf).
+ */
+caddr_t
+preload_search_info(caddr_t mod, int inf)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ u_int32_t type = 0;
+ int next;
+
+ curp = mod;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ /* end of module data? */
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+ /*
+ * We give up once we've looped back to what we were looking at
+ * first - this should normally be a MODINFO_NAME field.
+ */
+ if (type == 0) {
+ type = hdr[0];
+ } else {
+ if (hdr[0] == type)
+ break;
+ }
+
+ /*
+ * Attribute match? Return pointer to data.
+ * Consumer may safely assume that size value precedes
+ * data.
+ */
+ if (hdr[0] == inf)
+ return(curp + (sizeof(u_int32_t) * 2));
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ return(NULL);
+}
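+
+/*
+ * Usage sketch (illustrative only): a typical consumer locates a preloaded
+ * file by type and then pulls individual attributes out of its metadata,
+ * e.g. (the "elf kernel" type string is just an example):
+ *
+ *	caddr_t mod, p;
+ *	vm_offset_t addr;
+ *
+ *	mod = preload_search_by_type("elf kernel");
+ *	if (mod != NULL &&
+ *	    (p = preload_search_info(mod, MODINFO_ADDR)) != NULL)
+ *		addr = *(vm_offset_t *)p;
+ */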
+
+/*
+ * Delete a preload record by name.
+ */
+void
+preload_delete_name(const char *name)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+ int clearing;
+
+ if (preload_metadata != NULL) {
+
+ clearing = 0;
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if (hdr[0] == MODINFO_NAME) {
+ if (!strcmp(name, curp + sizeof(u_int32_t) * 2))
+ clearing = 1; /* got it, start clearing */
+ else if (clearing)
+ clearing = 0; /* at next one now.. better stop */
+ }
+ if (clearing)
+ hdr[0] = MODINFO_EMPTY;
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
+
+/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */
+void
+preload_bootstrap_relocate(vm_offset_t offset)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ vm_offset_t *ptr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Deal with the ones that we know we have to fix */
+ switch (hdr[0]) {
+ case MODINFO_ADDR:
+ case MODINFO_METADATA|MODINFOMD_SSYM:
+ case MODINFO_METADATA|MODINFOMD_ESYM:
+ ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2));
+ *ptr += offset;
+ break;
+ }
+ /* The rest is beyond us for now */
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
new file mode 100644
index 0000000..820fe0107
--- /dev/null
+++ b/sys/kern/subr_param.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 1980, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.c 8.3 (Berkeley) 8/20/94
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include "opt_maxusers.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+
+#include <machine/vmparam.h>
+
+/*
+ * System parameter formulae.
+ */
+
+#ifndef HZ
+#define HZ 100
+#endif
+#define NPROC (20 + 16 * maxusers)
+#ifndef NBUF
+#define NBUF 0
+#endif
+#ifndef MAXFILES
+#define MAXFILES (maxproc * 2)
+#endif
+
+int hz;
+int tick;
+int maxusers; /* base tunable */
+int maxproc; /* maximum # of processes */
+int maxprocperuid; /* max # of procs per user */
+int maxfiles; /* sys. wide open files limit */
+int maxfilesperproc; /* per-proc open files limit */
+int ncallout; /* maximum # of timer events */
+int nbuf;
+int nswbuf;
+int maxswzone; /* max swmeta KVA storage */
+int maxbcache; /* max buffer cache KVA storage */
+u_quad_t maxtsiz; /* max text size */
+u_quad_t dfldsiz; /* initial data size limit */
+u_quad_t maxdsiz; /* max data size */
+u_quad_t dflssiz; /* initial stack size limit */
+u_quad_t maxssiz; /* max stack size */
+u_quad_t sgrowsiz; /* amount to grow stack */
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct buf *swbuf;
+
+/*
+ * Boot time overrides that are not scaled against main memory
+ */
+void
+init_param1(void)
+{
+
+ hz = HZ;
+ TUNABLE_INT_FETCH("kern.hz", &hz);
+ tick = 1000000 / hz;
+
+#ifdef VM_SWZONE_SIZE_MAX
+ maxswzone = VM_SWZONE_SIZE_MAX;
+#endif
+ TUNABLE_INT_FETCH("kern.maxswzone", &maxswzone);
+#ifdef VM_BCACHE_SIZE_MAX
+ maxbcache = VM_BCACHE_SIZE_MAX;
+#endif
+ TUNABLE_INT_FETCH("kern.maxbcache", &maxbcache);
+
+ maxtsiz = MAXTSIZ;
+ TUNABLE_QUAD_FETCH("kern.maxtsiz", &maxtsiz);
+ dfldsiz = DFLDSIZ;
+ TUNABLE_QUAD_FETCH("kern.dfldsiz", &dfldsiz);
+ maxdsiz = MAXDSIZ;
+ TUNABLE_QUAD_FETCH("kern.maxdsiz", &maxdsiz);
+ dflssiz = DFLSSIZ;
+ TUNABLE_QUAD_FETCH("kern.dflssiz", &dflssiz);
+ maxssiz = MAXSSIZ;
+ TUNABLE_QUAD_FETCH("kern.maxssiz", &maxssiz);
+ sgrowsiz = SGROWSIZ;
+ TUNABLE_QUAD_FETCH("kern.sgrowsiz", &sgrowsiz);
+}
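/*
 * Each TUNABLE_*_FETCH() above consults the kernel environment handed
 * in by the boot loader, so any of these defaults can be overridden
 * before the kernel is up.  For example (illustrative values only),
 * in /boot/loader.conf:
 *
 *	kern.hz="1000"
 *	kern.maxdsiz="1073741824"
 */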
+
+/*
+ * Boot time overrides that are scaled against main memory
+ */
+void
+init_param2(int physpages)
+{
+
+ /* Base parameters */
+ maxusers = MAXUSERS;
+ TUNABLE_INT_FETCH("kern.maxusers", &maxusers);
+ if (maxusers == 0) {
+ maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE);
+ if (maxusers < 32)
+ maxusers = 32;
+ if (maxusers > 384)
+ maxusers = 384;
+ }
+
+	/*
+	 * The following can be overridden after boot via sysctl. Note:
+	 * unless overridden, these macros are ultimately based on maxusers.
+	 */
+ maxproc = NPROC;
+ TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
+ /*
+ * Limit maxproc so that kmap entries cannot be exhausted by
+ * processes.
+ */
+ if (maxproc > (physpages / 12))
+ maxproc = physpages / 12;
+ maxfiles = MAXFILES;
+ TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
+ maxprocperuid = (maxproc * 9) / 10;
+ maxfilesperproc = (maxfiles * 9) / 10;
+
+ /*
+ * Cannot be changed after boot.
+ */
+ nbuf = NBUF;
+ TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+
+ ncallout = 16 + maxproc + maxfiles;
+ TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
+}
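/*
 * Worked example of the scaling above (illustrative figures, assuming
 * PAGE_SIZE == 4096 and 1 GB of RAM, i.e. physpages == 262144):
 *
 *	maxusers	= 262144 / (2MB / 4KB) = 512, clamped to 384
 *	maxproc		= NPROC = 20 + 16 * 384 = 6164
 *			  (physpages / 12 = 21845, so the cap does not bite)
 *	maxprocperuid	= 6164 * 9 / 10 = 5547
 *	maxfiles	= MAXFILES = 6164 * 2 = 12328
 *	maxfilesperproc	= 12328 * 9 / 10 = 11095
 *	ncallout	= 16 + 6164 + 12328 = 18508
 */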
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
new file mode 100644
index 0000000..132e957
--- /dev/null
+++ b/sys/kern/subr_pcpu.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2001 Wind River Systems, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This module provides MI support for per-cpu data.
+ *
+ * Each architecture determines the mapping of logical CPU IDs to physical
+ * CPUs. The requirements of this mapping are as follows:
+ * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1.
+ * - The mapping is not required to be dense. That is, there may be
+ * gaps in the mappings.
+ * - The platform sets the value of MAXCPU in <machine/param.h>.
+ * - It is suggested, but not required, that in the non-SMP case, the
+ * platform define MAXCPU to be 1 and define the logical ID of the
+ * sole CPU as 0.
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/linker_set.h>
+#include <sys/lock.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <ddb/ddb.h>
+
+static struct pcpu *cpuid_to_pcpu[MAXCPU];
+struct cpuhead cpuhead = SLIST_HEAD_INITIALIZER(cpuhead);
+
+/*
+ * Initialize the MI portions of a struct pcpu.
+ */
+void
+pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
+{
+
+ bzero(pcpu, size);
+ KASSERT(cpuid >= 0 && cpuid < MAXCPU,
+ ("pcpu_init: invalid cpuid %d", cpuid));
+ pcpu->pc_cpuid = cpuid;
+ pcpu->pc_cpumask = 1 << cpuid;
+ cpuid_to_pcpu[cpuid] = pcpu;
+ SLIST_INSERT_HEAD(&cpuhead, pcpu, pc_allcpu);
+ cpu_pcpu_init(pcpu, cpuid, size);
+}
+
+/*
+ * Destroy a struct pcpu.
+ */
+void
+pcpu_destroy(struct pcpu *pcpu)
+{
+
+ SLIST_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu);
+ cpuid_to_pcpu[pcpu->pc_cpuid] = NULL;
+}
+
+/*
+ * Locate a struct pcpu by cpu id.
+ */
+struct pcpu *
+pcpu_find(u_int cpuid)
+{
+
+ return (cpuid_to_pcpu[cpuid]);
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(pcpu, db_show_pcpu)
+{
+ struct pcpu *pc;
+ struct thread *td;
+ int id;
+
+ if (have_addr)
+ id = ((addr >> 4) % 16) * 10 + (addr % 16);
+ else
+ id = PCPU_GET(cpuid);
+ pc = pcpu_find(id);
+ if (pc == NULL) {
+ db_printf("CPU %d not found\n", id);
+ return;
+ }
+ db_printf("cpuid = %d\n", pc->pc_cpuid);
+ db_printf("curthread = ");
+ td = pc->pc_curthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ else
+ db_printf("none\n");
+ db_printf("curpcb = %p\n", pc->pc_curpcb);
+ db_printf("fpcurthread = ");
+ td = pc->pc_fpcurthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ else
+ db_printf("none\n");
+ db_printf("idlethread = ");
+ td = pc->pc_idlethread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ else
+ db_printf("none\n");
+ db_show_mdpcpu(pc);
+
+#ifdef WITNESS
+ db_printf("spin locks held:\n");
+ witness_list_locks(&pc->pc_spinlocks);
+#endif
+}
+#endif
diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c
new file mode 100644
index 0000000..7c96c9e
--- /dev/null
+++ b/sys/kern/subr_power.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2001 Mitsuru IWASAKI
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <sys/power.h>
+
+static u_int power_pm_type = POWER_PM_TYPE_NONE;
+static power_pm_fn_t power_pm_fn = NULL;
+static void *power_pm_arg = NULL;
+
+int
+power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
+{
+ int error;
+
+ if (power_pm_type == POWER_PM_TYPE_NONE ||
+ power_pm_type == pm_type) {
+ power_pm_type = pm_type;
+ power_pm_fn = pm_fn;
+ power_pm_arg = pm_arg;
+ error = 0;
+ } else {
+ error = ENXIO;
+ }
+
+ return (error);
+}
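/*
 * Only one backend can own power management at a time; a later caller
 * with a different pm_type gets ENXIO.  A hedged usage sketch, where
 * acpi_pm_handler and acpi_softc are hypothetical names, the handler
 * matches whatever signature power_pm_fn_t declares in <sys/power.h>
 * (it is invoked as (cmd, arg, state) below), and it is assumed that
 * <sys/power.h> defines POWER_PM_TYPE_ACPI alongside POWER_PM_TYPE_NONE:
 *
 *	if (power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_handler,
 *	    &acpi_softc) != 0)
 *		printf("power management already claimed by another driver\n");
 */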
+
+u_int
+power_pm_get_type(void)
+{
+
+ return (power_pm_type);
+}
+
+void
+power_pm_suspend(int state)
+{
+ if (power_pm_fn == NULL)
+ return;
+
+ if (state != POWER_SLEEP_STATE_STANDBY &&
+ state != POWER_SLEEP_STATE_SUSPEND &&
+ state != POWER_SLEEP_STATE_HIBERNATE)
+ return;
+
+ power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+}
+
+/*
+ * Power profile.
+ */
+
+static int power_profile_state = POWER_PROFILE_PERFORMANCE;
+
+int
+power_profile_get_state(void)
+{
+ return (power_profile_state);
+}
+
+void
+power_profile_set_state(int state)
+{
+ int changed;
+
+ if (state != power_profile_state) {
+ power_profile_state = state;
+ changed = 1;
+ printf("system power profile changed to '%s'\n",
+ (state == POWER_PROFILE_PERFORMANCE) ? "performance" : "economy");
+ } else {
+ changed = 0;
+ }
+
+ if (changed)
+ EVENTHANDLER_INVOKE(power_profile_change);
+}
+
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
new file mode 100644
index 0000000..7f9b790
--- /dev/null
+++ b/sys/kern/subr_prf.c
@@ -0,0 +1,905 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/kernel.h>
+#include <sys/msgbuf.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/stdint.h>
+#include <sys/sysctl.h>
+#include <sys/tty.h>
+#include <sys/syslog.h>
+#include <sys/cons.h>
+#include <sys/uio.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define TOCONS 0x01
+#define TOTTY 0x02
+#define TOLOG 0x04
+
+/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
+#define MAXNBUF (sizeof(intmax_t) * NBBY + 1)
+
+struct putchar_arg {
+ int flags;
+ int pri;
+ struct tty *tty;
+};
+
+struct snprintf_arg {
+ char *str;
+ size_t remain;
+};
+
+extern int log_open;
+
+struct tty *constty; /* pointer to console "window" tty */
+
+static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */
+static void msglogchar(int c, int pri);
+static void msgaddchar(int c, void *dummy);
+static void putchar(int ch, void *arg);
+static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len);
+static void snprintf_func(int ch, void *arg);
+
+static int consintr = 1; /* Ok to handle console interrupts? */
+static int msgbufmapped; /* Set when safe to use msgbuf */
+int msgbuftrigger;
+
+static int log_console_output = 1;
+SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RW,
+ &log_console_output, 0, "");
+
+/*
+ * Warn that a system table is full.
+ */
+void
+tablefull(const char *tab)
+{
+
+ log(LOG_ERR, "%s: table is full\n", tab);
+}
+
+/*
+ * Uprintf prints to the controlling terminal for the current process.
+ * It may block if the tty queue is overfull. No message is printed if
+ * the queue does not clear in a reasonable time.
+ */
+int
+uprintf(const char *fmt, ...)
+{
+ struct thread *td = curthread;
+	struct proc *p;
+ va_list ap;
+ struct putchar_arg pca;
+ int retval;
+
+ if (td == NULL || td == PCPU_GET(idlethread))
+ return (0);
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if ((p->p_flag & P_CONTROLT) == 0) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ SESS_LOCK(p->p_session);
+ pca.tty = p->p_session->s_ttyp;
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ if (pca.tty == NULL)
+ return (0);
+ pca.flags = TOTTY;
+ va_start(ap, fmt);
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+
+ return (retval);
+}
+
+/*
+ * tprintf prints on the controlling terminal associated
+ * with the given session, possibly to the log as well.
+ */
+void
+tprintf(struct proc *p, int pri, const char *fmt, ...)
+{
+ struct tty *tp = NULL;
+ int flags = 0, shld = 0;
+ va_list ap;
+ struct putchar_arg pca;
+ int retval;
+
+ if (pri != -1)
+ flags |= TOLOG;
+ if (p != NULL) {
+ PROC_LOCK(p);
+ if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ SESS_LOCK(p->p_session);
+ SESSHOLD(p->p_session);
+ tp = p->p_session->s_ttyp;
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ shld++;
+ if (ttycheckoutq(tp, 0))
+ flags |= TOTTY;
+ else
+ tp = NULL;
+ } else
+ PROC_UNLOCK(p);
+ }
+ pca.pri = pri;
+ pca.tty = tp;
+ pca.flags = flags;
+ va_start(ap, fmt);
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ if (shld) {
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ SESSRELE(p->p_session);
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ }
+ msgbuftrigger = 1;
+}
+
+/*
+ * Ttyprintf displays a message on a tty; it should be used only by
+ * the tty driver, or anything that knows the underlying tty will not
+ * be revoke(2)'d away. Other callers should use tprintf.
+ */
+int
+ttyprintf(struct tty *tp, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ int retval;
+
+ va_start(ap, fmt);
+ pca.tty = tp;
+ pca.flags = TOTTY;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Log writes to the log buffer, and guarantees not to sleep (so can be
+ * called by interrupt routines). If there is no process reading the
+ * log yet, it writes to the console also.
+ */
+void
+log(int level, const char *fmt, ...)
+{
+ va_list ap;
+ int retval;
+ struct putchar_arg pca;
+
+ pca.tty = NULL;
+ pca.pri = level;
+ pca.flags = log_open ? TOLOG : TOCONS;
+
+ va_start(ap, fmt);
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+
+ msgbuftrigger = 1;
+}
+
+#define CONSCHUNK 128
+
+void
+log_console(struct uio *uio)
+{
+ int c, i, error, iovlen, nl;
+ struct uio muio;
+ struct iovec *miov = NULL;
+ char *consbuffer;
+ int pri;
+
+ if (!log_console_output)
+ return;
+
+ pri = LOG_INFO | LOG_CONSOLE;
+ muio = *uio;
+ iovlen = uio->uio_iovcnt * sizeof (struct iovec);
+ MALLOC(miov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ MALLOC(consbuffer, char *, CONSCHUNK, M_TEMP, M_WAITOK);
+ bcopy(muio.uio_iov, miov, iovlen);
+ muio.uio_iov = miov;
+ uio = &muio;
+
+ nl = 0;
+ while (uio->uio_resid > 0) {
+ c = imin(uio->uio_resid, CONSCHUNK);
+ error = uiomove(consbuffer, c, uio);
+ if (error != 0)
+			break;
+ for (i = 0; i < c; i++) {
+ msglogchar(consbuffer[i], pri);
+ if (consbuffer[i] == '\n')
+ nl = 1;
+ else
+ nl = 0;
+ }
+ }
+ if (!nl)
+ msglogchar('\n', pri);
+ msgbuftrigger = 1;
+ FREE(miov, M_TEMP);
+ FREE(consbuffer, M_TEMP);
+ return;
+}
+
+int
+printf(const char *fmt, ...)
+{
+ va_list ap;
+ int savintr;
+ struct putchar_arg pca;
+ int retval;
+
+ savintr = consintr; /* disable interrupts */
+ consintr = 0;
+ va_start(ap, fmt);
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ pca.pri = -1;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ if (!panicstr)
+ msgbuftrigger = 1;
+ consintr = savintr; /* reenable interrupts */
+ return (retval);
+}
+
+int
+vprintf(const char *fmt, va_list ap)
+{
+ int savintr;
+ struct putchar_arg pca;
+ int retval;
+
+ savintr = consintr; /* disable interrupts */
+ consintr = 0;
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ pca.pri = -1;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ if (!panicstr)
+ msgbuftrigger = 1;
+ consintr = savintr; /* reenable interrupts */
+ return (retval);
+}
+
+/*
+ * Print a character on console or user's terminal. If destination is
+ * the console then the last bunch of characters are saved in msgbuf for
+ * inspection later.
+ */
+static void
+putchar(int c, void *arg)
+{
+ struct putchar_arg *ap = (struct putchar_arg*) arg;
+ int flags = ap->flags;
+ struct tty *tp = ap->tty;
+ if (panicstr)
+ constty = NULL;
+ if ((flags & TOCONS) && tp == NULL && constty) {
+ tp = constty;
+ flags |= TOTTY;
+ }
+ if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
+ (flags & TOCONS) && tp == constty)
+ constty = NULL;
+ if ((flags & TOLOG))
+ msglogchar(c, ap->pri);
+ if ((flags & TOCONS) && constty == NULL && c != '\0')
+ (*v_putc)(c);
+}
+
+/*
+ * Scaled down version of sprintf(3).
+ */
+int
+sprintf(char *buf, const char *cfmt, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, cfmt);
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Scaled down version of vsprintf(3).
+ */
+int
+vsprintf(char *buf, const char *cfmt, va_list ap)
+{
+ int retval;
+
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ return (retval);
+}
+
+/*
+ * Scaled down version of snprintf(3).
+ */
+int
+snprintf(char *str, size_t size, const char *format, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, format);
+ retval = vsnprintf(str, size, format, ap);
+ va_end(ap);
+ return(retval);
+}
+
+/*
+ * Scaled down version of vsnprintf(3).
+ */
+int
+vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, 10, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return (retval);
+}
+
+static void
+snprintf_func(int ch, void *arg)
+{
+ struct snprintf_arg *const info = arg;
+
+ if (info->remain >= 2) {
+ *info->str++ = ch;
+ info->remain--;
+ }
+}
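/*
 * Note the truncation contract implied above: snprintf_func() stops
 * storing once only one byte of space is left, kvprintf() keeps
 * counting anyway, and vsnprintf() then NUL-terminates.  So, for
 * example, snprintf(buf, 4, "%d", 123456) stores "123\0" in buf and
 * returns 6, the length the untruncated output would have had.
 */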
+
+/*
+ * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
+ * order; return an optional length and a pointer to the last character
+ * written in the buffer (i.e., the first character of the string).
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
+ */
+static char *
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp)
+{
+ char *p;
+
+ p = nbuf;
+ *p = '\0';
+ do {
+ *++p = hex2ascii(num % base);
+ } while (num /= base);
+ if (lenp)
+ *lenp = p - nbuf;
+ return (p);
+}
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ * printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of the bit.  Thus:
+ *
+ * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ * reg=3<BITTWO,BITONE>
+ *
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ *   ("%*D", len, ptr, " ") -> XX XX XX XX ...
+ */
+int
+kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
+{
+#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
+ char nbuf[MAXNBUF];
+ char *d;
+ const char *p, *percent, *q;
+ u_char *up;
+ int ch, n;
+ uintmax_t num;
+ int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+ int jflag;
+ int dwidth;
+ char padc;
+ int retval = 0;
+
+ num = 0;
+ if (!func)
+ d = (char *) arg;
+ else
+ d = NULL;
+
+ if (fmt == NULL)
+ fmt = "(fmt null)\n";
+
+ if (radix < 2 || radix > 36)
+ radix = 10;
+
+ for (;;) {
+ padc = ' ';
+ width = 0;
+ while ((ch = (u_char)*fmt++) != '%') {
+ if (ch == '\0')
+ return (retval);
+ PCHAR(ch);
+ }
+ percent = fmt - 1;
+ qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
+ sign = 0; dot = 0; dwidth = 0;
+ jflag = 0;
+reswitch: switch (ch = (u_char)*fmt++) {
+ case '.':
+ dot = 1;
+ goto reswitch;
+ case '#':
+ sharpflag = 1;
+ goto reswitch;
+ case '+':
+ sign = 1;
+ goto reswitch;
+ case '-':
+ ladjust = 1;
+ goto reswitch;
+ case '%':
+ PCHAR(ch);
+ break;
+ case '*':
+ if (!dot) {
+ width = va_arg(ap, int);
+ if (width < 0) {
+ ladjust = !ladjust;
+ width = -width;
+ }
+ } else {
+ dwidth = va_arg(ap, int);
+ }
+ goto reswitch;
+ case '0':
+ if (!dot) {
+ padc = '0';
+ goto reswitch;
+ }
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ for (n = 0;; ++fmt) {
+ n = n * 10 + ch - '0';
+ ch = *fmt;
+ if (ch < '0' || ch > '9')
+ break;
+ }
+ if (dot)
+ dwidth = n;
+ else
+ width = n;
+ goto reswitch;
+ case 'b':
+ num = va_arg(ap, int);
+ p = va_arg(ap, char *);
+ for (q = ksprintn(nbuf, num, *p++, NULL); *q;)
+ PCHAR(*q--);
+
+ if (num == 0)
+ break;
+
+ for (tmp = 0; *p;) {
+ n = *p++;
+ if (num & (1 << (n - 1))) {
+ PCHAR(tmp ? ',' : '<');
+ for (; (n = *p) > ' '; ++p)
+ PCHAR(n);
+ tmp = 1;
+ } else
+ for (; *p > ' '; ++p)
+ continue;
+ }
+ if (tmp)
+ PCHAR('>');
+ break;
+ case 'c':
+ PCHAR(va_arg(ap, int));
+ break;
+ case 'D':
+ up = va_arg(ap, u_char *);
+ p = va_arg(ap, char *);
+ if (!width)
+ width = 16;
+ while(width--) {
+ PCHAR(hex2ascii(*up >> 4));
+ PCHAR(hex2ascii(*up & 0x0f));
+ up++;
+ if (width)
+ for (q=p;*q;q++)
+ PCHAR(*q);
+ }
+ break;
+ case 'd':
+ base = 10;
+ sign = 1;
+ goto handle_sign;
+ case 'j':
+ jflag = 1;
+ goto reswitch;
+ case 'l':
+ if (lflag) {
+ lflag = 0;
+ qflag = 1;
+ } else
+ lflag = 1;
+ goto reswitch;
+ case 'n':
+ if (jflag)
+ *(va_arg(ap, intmax_t *)) = retval;
+ else if (qflag)
+ *(va_arg(ap, quad_t *)) = retval;
+ else if (lflag)
+ *(va_arg(ap, long *)) = retval;
+ else
+ *(va_arg(ap, int *)) = retval;
+ break;
+ case 'o':
+ base = 8;
+ goto handle_nosign;
+ case 'p':
+ base = 16;
+ sharpflag = (width == 0);
+ sign = 0;
+ num = (uintptr_t)va_arg(ap, void *);
+ goto number;
+ case 'q':
+ qflag = 1;
+ goto reswitch;
+ case 'r':
+ base = radix;
+ if (sign)
+ goto handle_sign;
+ goto handle_nosign;
+ case 's':
+ p = va_arg(ap, char *);
+ if (p == NULL)
+ p = "(null)";
+ if (!dot)
+ n = strlen (p);
+ else
+ for (n = 0; n < dwidth && p[n]; n++)
+ continue;
+
+ width -= n;
+
+ if (!ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ while (n--)
+ PCHAR(*p++);
+ if (ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ break;
+ case 'u':
+ base = 10;
+ goto handle_nosign;
+ case 'x':
+ case 'X':
+ base = 16;
+ goto handle_nosign;
+ case 'z':
+ base = 16;
+ if (sign)
+ goto handle_sign;
+handle_nosign:
+ sign = 0;
+ if (jflag)
+ num = va_arg(ap, uintmax_t);
+ else if (qflag)
+ num = va_arg(ap, u_quad_t);
+ else if (lflag)
+ num = va_arg(ap, u_long);
+ else
+ num = va_arg(ap, u_int);
+ goto number;
+handle_sign:
+ if (jflag)
+ num = va_arg(ap, intmax_t);
+ else if (qflag)
+ num = va_arg(ap, quad_t);
+ else if (lflag)
+ num = va_arg(ap, long);
+ else
+ num = va_arg(ap, int);
+number:
+ if (sign && (intmax_t)num < 0) {
+ neg = 1;
+ num = -(intmax_t)num;
+ }
+ p = ksprintn(nbuf, num, base, &tmp);
+ if (sharpflag && num != 0) {
+ if (base == 8)
+ tmp++;
+ else if (base == 16)
+ tmp += 2;
+ }
+ if (neg)
+ tmp++;
+
+ if (!ladjust && width && (width -= tmp) > 0)
+ while (width--)
+ PCHAR(padc);
+ if (neg)
+ PCHAR('-');
+ if (sharpflag && num != 0) {
+ if (base == 8) {
+ PCHAR('0');
+ } else if (base == 16) {
+ PCHAR('0');
+ PCHAR('x');
+ }
+ }
+
+ while (*p)
+ PCHAR(*p--);
+
+ if (ladjust && width && (width -= tmp) > 0)
+ while (width--)
+ PCHAR(padc);
+
+ break;
+ default:
+ while (percent < fmt)
+ PCHAR(*percent++);
+ break;
+ }
+ }
+#undef PCHAR
+}
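/*
 * Worked example of the %b handling above (values are illustrative):
 * with a register value of 5 and a bit-description string of
 * "\20\3FOO\1BAR", the value is first emitted in base 020 (hex) and
 * then bits 3 (FOO) and 1 (BAR) are found set, producing:
 *
 *	5<FOO,BAR>
 */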
+
+/*
+ * Put character in log buffer with a particular priority.
+ */
+static void
+msglogchar(int c, int pri)
+{
+ static int lastpri = -1;
+ static int dangling;
+ char nbuf[MAXNBUF];
+ char *p;
+
+ if (!msgbufmapped)
+ return;
+ if (c == '\0' || c == '\r')
+ return;
+ if (pri != -1 && pri != lastpri) {
+ if (dangling) {
+ msgaddchar('\n', NULL);
+ dangling = 0;
+ }
+ msgaddchar('<', NULL);
+ for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL); *p;)
+ msgaddchar(*p--, NULL);
+ msgaddchar('>', NULL);
+ lastpri = pri;
+ }
+ msgaddchar(c, NULL);
+ if (c == '\n') {
+ dangling = 0;
+ lastpri = -1;
+ } else {
+ dangling = 1;
+ }
+}
+
+/*
+ * Put char in log buffer
+ */
+static void
+msgaddchar(int c, void *dummy)
+{
+ struct msgbuf *mbp;
+
+ if (!msgbufmapped)
+ return;
+ mbp = msgbufp;
+ mbp->msg_ptr[mbp->msg_bufx++] = c;
+ if (mbp->msg_bufx >= mbp->msg_size)
+ mbp->msg_bufx = 0;
+ /* If the buffer is full, keep the most recent data. */
+ if (mbp->msg_bufr == mbp->msg_bufx) {
+ if (++mbp->msg_bufr >= mbp->msg_size)
+ mbp->msg_bufr = 0;
+ }
+}
+
+static void
+msgbufcopy(struct msgbuf *oldp)
+{
+ int pos;
+
+ pos = oldp->msg_bufr;
+ while (pos != oldp->msg_bufx) {
+ msglogchar(oldp->msg_ptr[pos], -1);
+ if (++pos >= oldp->msg_size)
+ pos = 0;
+ }
+}
+
+void
+msgbufinit(void *ptr, size_t size)
+{
+ char *cp;
+ static struct msgbuf *oldp = NULL;
+
+ size -= sizeof(*msgbufp);
+ cp = (char *)ptr;
+ msgbufp = (struct msgbuf *) (cp + size);
+ if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_size != size ||
+ msgbufp->msg_bufx >= size || msgbufp->msg_bufr >= size) {
+ bzero(cp, size);
+ bzero(msgbufp, sizeof(*msgbufp));
+ msgbufp->msg_magic = MSG_MAGIC;
+ msgbufp->msg_size = (char *)msgbufp - cp;
+ }
+ msgbufp->msg_ptr = cp;
+ if (msgbufmapped && oldp != msgbufp)
+ msgbufcopy(oldp);
+ msgbufmapped = 1;
+ oldp = msgbufp;
+}
+
+SYSCTL_DECL(_security_bsd);
+
+static int unprivileged_read_msgbuf = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
+ CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
+ "Unprivileged processes may read the kernel message buffer");
+
+/* Sysctls for accessing/clearing the msgbuf */
+static int
+sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ if (!unprivileged_read_msgbuf) {
+ error = suser(req->td);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Unwind the buffer, so that it's linear (possibly starting with
+ * some initial nulls).
+ */
+ error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr + msgbufp->msg_bufx,
+ msgbufp->msg_size - msgbufp->msg_bufx, req);
+ if (error)
+ return (error);
+ if (msgbufp->msg_bufx > 0) {
+ error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr,
+ msgbufp->msg_bufx, req);
+ }
+ return (error);
+}
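/*
 * The two sysctl_handle_opaque() calls above linearize the ring: the
 * oldest data starts at msg_bufx, wraps at msg_size, and ends just
 * before msg_bufx again.  A standalone sketch of the same unwinding
 * (hypothetical helper, plain C, `out' must hold `size' bytes):
 */
#include <stddef.h>
#include <string.h>

static void
unwind_ring(const char *ring, size_t size, size_t bufx, char *out)
{
	/* Copy the tail first (oldest bytes), then the head. */
	memcpy(out, ring + bufx, size - bufx);
	memcpy(out + (size - bufx), ring, bufx);
}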
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer");
+
+static int msgbuf_clear;
+
+static int
+sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error && req->newptr) {
+ /* Clear the buffer and reset write pointer */
+ bzero(msgbufp->msg_ptr, msgbufp->msg_size);
+ msgbufp->msg_bufr = msgbufp->msg_bufx = 0;
+ msgbuf_clear = 0;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, &msgbuf_clear, 0,
+ sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer");
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
+{
+ int i, j;
+
+ if (!msgbufmapped) {
+ db_printf("msgbuf not mapped yet\n");
+ return;
+ }
+ db_printf("msgbufp = %p\n", msgbufp);
+ db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n",
+ msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr,
+ msgbufp->msg_bufx, msgbufp->msg_ptr);
+ for (i = 0; i < msgbufp->msg_size; i++) {
+ j = (i + msgbufp->msg_bufr) % msgbufp->msg_size;
+ db_printf("%c", msgbufp->msg_ptr[j]);
+ }
+ db_printf("\n");
+}
+
+#endif /* DDB */
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
new file mode 100644
index 0000000..706863d
--- /dev/null
+++ b/sys/kern/subr_prof.c
@@ -0,0 +1,531 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+#ifdef GPROF
+#include <sys/malloc.h>
+#include <sys/gmon.h>
+#undef MCOUNT
+
+static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
+
+static void kmstartup(void *);
+SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL)
+
+struct gmonparam _gmonparam = { GMON_PROF_OFF };
+
+#ifdef GUPROF
+#include <machine/asmacros.h>
+
+void
+nullfunc_loop_profiled()
+{
+ int i;
+
+ for (i = 0; i < CALIB_SCALE; i++)
+ nullfunc_profiled();
+}
+
+#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
+
+void
+nullfunc_profiled()
+{
+}
+#endif /* GUPROF */
+
+/*
+ * Update the histograms to support extending the text region arbitrarily.
+ * This is done slightly naively (no sparse regions), so will waste slight
+ * amounts of memory, but will overall work nicely enough to allow profiling
+ * of KLDs.
+ */
+void
+kmupetext(uintfptr_t nhighpc)
+{
+ struct gmonparam np; /* slightly large */
+ struct gmonparam *p = &_gmonparam;
+ char *cp;
+
+ GIANT_REQUIRED;
+ bcopy(p, &np, sizeof(*p));
+ np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER));
+ if (np.highpc <= p->highpc)
+ return;
+ np.textsize = np.highpc - p->lowpc;
+ np.kcountsize = np.textsize / HISTFRACTION;
+ np.hashfraction = HASHFRACTION;
+ np.fromssize = np.textsize / HASHFRACTION;
+ np.tolimit = np.textsize * ARCDENSITY / 100;
+ if (np.tolimit < MINARCS)
+ np.tolimit = MINARCS;
+ else if (np.tolimit > MAXARCS)
+ np.tolimit = MAXARCS;
+ np.tossize = np.tolimit * sizeof(struct tostruct);
+ cp = malloc(np.kcountsize + np.fromssize + np.tossize,
+ M_GPROF, M_WAITOK);
+ /*
+ * Check for something else extending highpc while we slept.
+ */
+ if (np.highpc <= p->highpc) {
+ free(cp, M_GPROF);
+ return;
+ }
+ np.tos = (struct tostruct *)cp;
+ cp += np.tossize;
+ np.kcount = (HISTCOUNTER *)cp;
+ cp += np.kcountsize;
+ np.froms = (u_short *)cp;
+#ifdef GUPROF
+ /* Reinitialize pointers to overhead counters. */
+ np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime));
+ np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount));
+ np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount));
+#endif
+ critical_enter();
+ bcopy(p->tos, np.tos, p->tossize);
+ bzero((char *)np.tos + p->tossize, np.tossize - p->tossize);
+ bcopy(p->kcount, np.kcount, p->kcountsize);
+ bzero((char *)np.kcount + p->kcountsize, np.kcountsize -
+ p->kcountsize);
+ bcopy(p->froms, np.froms, p->fromssize);
+ bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize);
+ cp = (char *)p->tos;
+ bcopy(&np, p, sizeof(*p));
+ critical_exit();
+ free(cp, M_GPROF);
+}
+
+static void
+kmstartup(dummy)
+ void *dummy;
+{
+ char *cp;
+ struct gmonparam *p = &_gmonparam;
+#ifdef GUPROF
+ int cputime_overhead;
+ int empty_loop_time;
+ int i;
+ int mcount_overhead;
+ int mexitcount_overhead;
+ int nullfunc_loop_overhead;
+ int nullfunc_loop_profiled_time;
+ uintfptr_t tmp_addr;
+#endif
+
+ /*
+ * Round lowpc and highpc to multiples of the density we're using
+ * so the rest of the scaling (here and in gprof) stays in ints.
+ */
+ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->textsize = p->highpc - p->lowpc;
+ printf("Profiling kernel, textsize=%lu [%x..%x]\n",
+ p->textsize, p->lowpc, p->highpc);
+ p->kcountsize = p->textsize / HISTFRACTION;
+ p->hashfraction = HASHFRACTION;
+ p->fromssize = p->textsize / HASHFRACTION;
+ p->tolimit = p->textsize * ARCDENSITY / 100;
+ if (p->tolimit < MINARCS)
+ p->tolimit = MINARCS;
+ else if (p->tolimit > MAXARCS)
+ p->tolimit = MAXARCS;
+ p->tossize = p->tolimit * sizeof(struct tostruct);
+ cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
+ M_GPROF, M_WAITOK | M_ZERO);
+ p->tos = (struct tostruct *)cp;
+ cp += p->tossize;
+ p->kcount = (HISTCOUNTER *)cp;
+ cp += p->kcountsize;
+ p->froms = (u_short *)cp;
+
+#ifdef GUPROF
+ /* Initialize pointers to overhead counters. */
+ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
+ p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
+ p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
+
+ /*
+ * Disable interrupts to avoid interference while we calibrate
+ * things.
+ */
+ critical_enter();
+
+ /*
+ * Determine overheads.
+ * XXX this needs to be repeated for each useful timer/counter.
+ */
+ cputime_overhead = 0;
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ cputime_overhead += cputime();
+
+ empty_loop();
+ startguprof(p);
+ empty_loop();
+ empty_loop_time = cputime();
+
+ nullfunc_loop_profiled();
+
+ /*
+ * Start profiling. There won't be any normal function calls since
+ * interrupts are disabled, but we will call the profiling routines
+ * directly to determine their overheads.
+ */
+ p->state = GMON_PROF_HIRES;
+
+ startguprof(p);
+ nullfunc_loop_profiled();
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(__i386__) && __GNUC__ >= 2
+ __asm("pushl %0; call __mcount; popl %%ecx"
+ :
+ : "i" (profil)
+ : "ax", "bx", "cx", "dx", "memory");
+#else
+#error
+#endif
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, profil));
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(__i386__) && __GNUC__ >= 2
+ __asm("call " __XSTRING(HIDENAME(mexitcount)) "; 1:"
+ : : : "ax", "bx", "cx", "dx", "memory");
+ __asm("movl $1b,%0" : "=rm" (tmp_addr));
+#else
+#error
+#endif
+ mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
+
+ p->state = GMON_PROF_OFF;
+ stopguprof(p);
+
+ critical_exit();
+
+ nullfunc_loop_profiled_time = 0;
+ for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
+ tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
+ tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
+ nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
+#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
+#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
+ printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
+ CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
+ CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
+ CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
+ cputime_overhead -= empty_loop_time;
+ mcount_overhead -= empty_loop_time;
+ mexitcount_overhead -= empty_loop_time;
+
+ /*-
+ * Profiling overheads are determined by the times between the
+ * following events:
+ * MC1: mcount() is called
+ * MC2: cputime() (called from mcount()) latches the timer
+ * MC3: mcount() completes
+ * ME1: mexitcount() is called
+ * ME2: cputime() (called from mexitcount()) latches the timer
+ * ME3: mexitcount() completes.
+ * The times between the events vary slightly depending on instruction
+ * combination and cache misses, etc. Attempt to determine the
+ * minimum times. These can be subtracted from the profiling times
+ * without much risk of reducing the profiling times below what they
+ * would be when profiling is not configured. Abbreviate:
+ * ab = minimum time between MC1 and MC3
+ *		a  = minimum time between MC1 and MC2
+ * b = minimum time between MC2 and MC3
+ * cd = minimum time between ME1 and ME3
+ * c = minimum time between ME1 and ME2
+ * d = minimum time between ME2 and ME3.
+ * These satisfy the relations:
+ * ab <= mcount_overhead (just measured)
+ * a + b <= ab
+ * cd <= mexitcount_overhead (just measured)
+ * c + d <= cd
+ * a + d <= nullfunc_loop_profiled_time (just measured)
+ * a >= 0, b >= 0, c >= 0, d >= 0.
+ * Assume that ab and cd are equal to the minimums.
+ */
+ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
+ p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
+ p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
+ - cputime_overhead);
+ nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
+ p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
+ - nullfunc_loop_overhead)
+ / 4);
+ p->mexitcount_pre_overhead = p->mexitcount_overhead
+ + p->cputime_overhead
+ - p->mexitcount_post_overhead;
+ p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
+ - p->mexitcount_post_overhead;
+ p->mcount_post_overhead = p->mcount_overhead
+ + p->cputime_overhead
+ - p->mcount_pre_overhead;
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mcount_overhead, p->profrate),
+ c2n(p->mcount_pre_overhead, p->profrate),
+ c2n(p->mcount_post_overhead, p->profrate),
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mexitcount_overhead, p->profrate),
+ c2n(p->mexitcount_pre_overhead, p->profrate),
+ c2n(p->mexitcount_post_overhead, p->profrate));
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
+ p->cputime_overhead, p->mcount_overhead,
+ p->mcount_pre_overhead, p->mcount_post_overhead,
+ p->cputime_overhead, p->mexitcount_overhead,
+ p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
+#endif /* GUPROF */
+}
+
+/*
+ * Return kernel profiling information.
+ */
+static int
+sysctl_kern_prof(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ struct gmonparam *gp = &_gmonparam;
+ int error;
+ int state;
+
+ /* all sysctl names at this level are terminal */
+ if (namelen != 1)
+ return (ENOTDIR); /* overloaded */
+
+ switch (name[0]) {
+ case GPROF_STATE:
+ state = gp->state;
+ error = sysctl_handle_int(oidp, &state, 0, req);
+ if (error)
+ return (error);
+ if (!req->newptr)
+ return (0);
+ if (state == GMON_PROF_OFF) {
+ gp->state = state;
+ stopprofclock(&proc0);
+ stopguprof(gp);
+ } else if (state == GMON_PROF_ON) {
+ gp->state = GMON_PROF_OFF;
+ stopguprof(gp);
+ gp->profrate = profhz;
+ startprofclock(&proc0);
+ gp->state = state;
+#ifdef GUPROF
+ } else if (state == GMON_PROF_HIRES) {
+ gp->state = GMON_PROF_OFF;
+ stopprofclock(&proc0);
+ startguprof(gp);
+ gp->state = state;
+#endif
+ } else if (state != gp->state)
+ return (EINVAL);
+ return (0);
+ case GPROF_COUNT:
+ return (sysctl_handle_opaque(oidp,
+ gp->kcount, gp->kcountsize, req));
+ case GPROF_FROMS:
+ return (sysctl_handle_opaque(oidp,
+ gp->froms, gp->fromssize, req));
+ case GPROF_TOS:
+ return (sysctl_handle_opaque(oidp,
+ gp->tos, gp->tossize, req));
+ case GPROF_GMONPARAM:
+ return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
+ default:
+ return (EOPNOTSUPP);
+ }
+ /* NOTREACHED */
+}
+
+SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
+#endif /* GPROF */
+
+/*
+ * Profiling system call.
+ *
+ * The scale factor is a fixed point number with 16 bits of fraction, so that
+ * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct profil_args {
+ caddr_t samples;
+ size_t size;
+ size_t offset;
+ u_int scale;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+profil(td, uap)
+ struct thread *td;
+ register struct profil_args *uap;
+{
+ register struct uprof *upp;
+ int s;
+ int error = 0;
+
+ mtx_lock(&Giant);
+
+ if (uap->scale > (1 << 16)) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (uap->scale == 0) {
+ stopprofclock(td->td_proc);
+ goto done2;
+ }
+ upp = &td->td_proc->p_stats->p_prof;
+
+ /* Block profile interrupts while changing state. */
+ s = splstatclock();
+ upp->pr_off = uap->offset;
+ upp->pr_scale = uap->scale;
+ upp->pr_base = uap->samples;
+ upp->pr_size = uap->size;
+ startprofclock(td->td_proc);
+ splx(s);
+
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Scale is a fixed-point number with the binary point 16 bits
+ * into the value, and is <= 1.0. pc is at most 32 bits, so the
+ * intermediate result is at most 48 bits.
+ */
+#define PC_TO_INDEX(pc, prof) \
+ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
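/*
 * Worked example of the fixed-point arithmetic above (values are
 * illustrative): with pr_off = 0x1000 and pr_scale = 0x8000 (0.5 in
 * 16.16 fixed point), a pc of 0x1234 yields
 *
 *	((0x1234 - 0x1000) * 0x8000) >> 16 = 0x234 / 2 = 0x11a
 *
 * and the trailing "& ~1" keeps the index even, so it always addresses
 * an aligned 16-bit counter in the user's sample buffer.
 */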
+
+/*
+ * Collect user-level profiling statistics; called on a profiling tick,
+ * when a process is running in user-mode. This routine may be called
+ * from an interrupt context. We try to update the user profiling buffers
+ * cheaply with fuswintr() and suswintr(). If that fails, we revert to
+ * an AST that will vector us to trap() with a context in which copyin
+ * and copyout will work. Trap will then call addupc_task().
+ *
+ * Note that we may (rarely) not get around to the AST soon enough, and
+ * lose profile ticks when the next tick overwrites this one, but in this
+ * case the system is overloaded and the profile is probably already
+ * inaccurate.
+ */
+void
+addupc_intr(ke, pc, ticks)
+ register struct kse *ke;
+ register uintptr_t pc;
+ u_int ticks;
+{
+ register struct uprof *prof;
+ register caddr_t addr;
+ register u_int i;
+ register int v;
+
+ if (ticks == 0)
+ return;
+ prof = &ke->ke_proc->p_stats->p_prof;
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+ return; /* out of range; ignore */
+
+ addr = prof->pr_base + i;
+ if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
+ mtx_lock_spin(&sched_lock);
+ prof->pr_addr = pc;
+ prof->pr_ticks = ticks;
+ ke->ke_flags |= KEF_OWEUPC | KEF_ASTPENDING ;
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+/*
+ * Much like before, but we can afford to take faults here. If the
+ * update fails, we simply turn off profiling.
+ */
+void
+addupc_task(ke, pc, ticks)
+ register struct kse *ke;
+ register uintptr_t pc;
+ u_int ticks;
+{
+ struct proc *p = ke->ke_proc;
+ register struct uprof *prof;
+ register caddr_t addr;
+ register u_int i;
+ u_short v;
+
+ if (ticks == 0)
+ return;
+
+ prof = &p->p_stats->p_prof;
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+ return;
+
+ addr = prof->pr_base + i;
+ if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) {
+ v += ticks;
+ if (copyout((caddr_t)&v, addr, sizeof(v)) == 0)
+ return;
+ }
+ stopprofclock(p);
+}
diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c
new file mode 100644
index 0000000..85af088
--- /dev/null
+++ b/sys/kern/subr_rman.c
@@ -0,0 +1,609 @@
+/*
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The kernel resource manager. This code is responsible for keeping track
+ * of hardware resources which are apportioned out to various drivers.
+ * It does not actually assign those resources, and it is not expected
+ * that end-device drivers will call into this code directly. Rather,
+ * the code which implements the buses that those devices are attached to,
+ * and the code which manages CPU resources, will call this code, and the
+ * end-device drivers will make upcalls to that code to actually perform
+ * the allocation.
+ *
+ * There are two sorts of resources managed by this code. The first is
+ * the more familiar array (RMAN_ARRAY) type; resources in this class
+ * consist of a sequence of individually-allocatable objects which have
+ * been numbered in some well-defined order. Most of the resources
+ * are of this type, as it is the most familiar. The second type is
+ * called a gauge (RMAN_GAUGE), and models fungible resources (i.e.,
+ * resources in which each instance is indistinguishable from every
+ * other instance). The principal anticipated application of gauges
+ * is in the context of power consumption, where a bus may have a specific
+ * power budget which all attached devices share. RMAN_GAUGE is not
+ * implemented yet.
+ *
+ * For array resources, we make one simplifying assumption: two clients
+ * sharing the same resource must use the same range of indices. That
+ * is to say, sharing of overlapping-but-not-identical regions is not
+ * permitted.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/bus.h> /* XXX debugging */
+#include <machine/bus.h>
+#include <sys/rman.h>
+
+#ifdef RMAN_DEBUG
+#define DPRINTF(params) printf##params
+#else
+#define DPRINTF(params)
+#endif
+
+static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager");
+
+struct rman_head rman_head;
+static struct mtx rman_mtx; /* mutex to protect rman_head */
+static int int_rman_activate_resource(struct rman *rm, struct resource *r,
+ struct resource **whohas);
+static int int_rman_deactivate_resource(struct resource *r);
+static int int_rman_release_resource(struct rman *rm, struct resource *r);
+
+int
+rman_init(struct rman *rm)
+{
+ static int once;
+
+ if (once == 0) {
+ once = 1;
+ TAILQ_INIT(&rman_head);
+ mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF);
+ }
+
+ if (rm->rm_type == RMAN_UNINIT)
+ panic("rman_init");
+ if (rm->rm_type == RMAN_GAUGE)
+ panic("implement RMAN_GAUGE");
+
+ TAILQ_INIT(&rm->rm_list);
+ rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO);
+ if (rm->rm_mtx == 0)
+ return ENOMEM;
+ mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF);
+
+ mtx_lock(&rman_mtx);
+ TAILQ_INSERT_TAIL(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ return 0;
+}
+
+/*
+ * NB: this interface is not robust against programming errors which
+ * add multiple copies of the same region.
+ */
+int
+rman_manage_region(struct rman *rm, u_long start, u_long end)
+{
+ struct resource *r, *s;
+
+ r = malloc(sizeof *r, M_RMAN, M_NOWAIT | M_ZERO);
+ if (r == 0)
+ return ENOMEM;
+ r->r_start = start;
+ r->r_end = end;
+ r->r_rm = rm;
+
+ mtx_lock(rm->rm_mtx);
+ for (s = TAILQ_FIRST(&rm->rm_list);
+ s && s->r_end < r->r_start;
+ s = TAILQ_NEXT(s, r_link))
+ ;
+
+ if (s == NULL) {
+ TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
+ } else {
+ TAILQ_INSERT_BEFORE(s, r, r_link);
+ }
+
+ mtx_unlock(rm->rm_mtx);
+ return 0;
+}
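/*
 * A hedged sketch of the intended call sequence, based only on the
 * functions in this file (my_ioport_rman is a hypothetical instance
 * and the numbers are illustrative):
 *
 *	static struct rman my_ioport_rman;
 *	struct resource *r;
 *
 *	my_ioport_rman.rm_type = RMAN_ARRAY;
 *	my_ioport_rman.rm_descr = "hypothetical I/O port space";
 *	if (rman_init(&my_ioport_rman) != 0 ||
 *	    rman_manage_region(&my_ioport_rman, 0x0, 0xffff) != 0)
 *		panic("rman setup failed");
 *	...
 *	r = rman_reserve_resource_bound(&my_ioport_rman, 0x0, 0xffff,
 *	    0x10, 0, 0, dev);
 *	if (r == NULL)
 *		...	(no free range of 0x10 ports was available)
 */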
+
+int
+rman_fini(struct rman *rm)
+{
+ struct resource *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_flags & RF_ALLOCATED) {
+ mtx_unlock(rm->rm_mtx);
+ return EBUSY;
+ }
+ }
+
+ /*
+ * There really should only be one of these if we are in this
+ * state and the code is working properly, but it can't hurt.
+ */
+ while (!TAILQ_EMPTY(&rm->rm_list)) {
+ r = TAILQ_FIRST(&rm->rm_list);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ free(r, M_RMAN);
+ }
+ mtx_unlock(rm->rm_mtx);
+ mtx_lock(&rman_mtx);
+ TAILQ_REMOVE(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ mtx_destroy(rm->rm_mtx);
+ free(rm->rm_mtx, M_RMAN);
+
+ return 0;
+}
+
+struct resource *
+rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end,
+ u_long count, u_long bound, u_int flags,
+ struct device *dev)
+{
+ u_int want_activate;
+ struct resource *r, *s, *rv;
+ u_long rstart, rend, amask, bmask;
+
+ rv = 0;
+
+ DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length "
+ "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count,
+ flags, dev == NULL ? "<null>" : device_get_nameunit(dev)));
+ want_activate = (flags & RF_ACTIVE);
+ flags &= ~RF_ACTIVE;
+
+ mtx_lock(rm->rm_mtx);
+
+ for (r = TAILQ_FIRST(&rm->rm_list);
+ r && r->r_end < start;
+ r = TAILQ_NEXT(r, r_link))
+ ;
+
+ if (r == NULL) {
+ DPRINTF(("could not find a region\n"));
+ goto out;
+ }
+
+ amask = (1ul << RF_ALIGNMENT(flags)) - 1;
+ /* If bound is 0, bmask will also be 0 */
+ bmask = ~(bound - 1);
+ /*
+ * First try to find an acceptable totally-unshared region.
+ */
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end));
+ if (s->r_start > end) {
+ DPRINTF(("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end));
+ break;
+ }
+ if (s->r_flags & RF_ALLOCATED) {
+ DPRINTF(("region is allocated\n"));
+ continue;
+ }
+ rstart = ulmax(s->r_start, start);
+ /*
+ * Try to find a region by adjusting to boundary and alignment
+ * until both conditions are satisfied. This is not an optimal
+ * algorithm, but in most cases it isn't really bad, either.
+ */
+ do {
+ rstart = (rstart + amask) & ~amask;
+ if (((rstart ^ (rstart + count)) & bmask) != 0)
+ rstart += bound - (rstart & ~bmask);
+ } while ((rstart & amask) != 0 && rstart < end &&
+ rstart < s->r_end);
+ rend = ulmin(s->r_end, ulmax(rstart + count, end));
+ DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
+ rstart, rend, (rend - rstart + 1), count));
+
+ if ((rend - rstart + 1) >= count) {
+ DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n",
+ rend, rstart, (rend - rstart + 1)));
+ if ((s->r_end - s->r_start + 1) == count) {
+ DPRINTF(("candidate region is entire chunk\n"));
+ rv = s;
+ rv->r_flags |= RF_ALLOCATED | flags;
+ rv->r_dev = dev;
+ goto out;
+ }
+
+ /*
+ * If s->r_start < rstart and
+ * s->r_end > rstart + count - 1, then
+ * we need to split the region into three pieces
+ * (the middle one will get returned to the user).
+ * Otherwise, we are allocating at either the
+ * beginning or the end of s, so we only need to
+ * split it in two. The first case requires
+ * two new allocations; the second requires but one.
+ */
+ rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+ if (rv == 0)
+ goto out;
+ rv->r_start = rstart;
+ rv->r_end = rstart + count - 1;
+ rv->r_flags = flags | RF_ALLOCATED;
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+
+ if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
+ DPRINTF(("splitting region in three parts: "
+ "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
+ s->r_start, rv->r_start - 1,
+ rv->r_start, rv->r_end,
+ rv->r_end + 1, s->r_end));
+ /*
+ * We are allocating in the middle.
+ */
+ r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO);
+ if (r == 0) {
+ free(rv, M_RMAN);
+ rv = 0;
+ goto out;
+ }
+ r->r_start = rv->r_end + 1;
+ r->r_end = s->r_end;
+ r->r_flags = s->r_flags;
+ r->r_rm = rm;
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ TAILQ_INSERT_AFTER(&rm->rm_list, rv, r,
+ r_link);
+ } else if (s->r_start == rv->r_start) {
+ DPRINTF(("allocating from the beginning\n"));
+ /*
+ * We are allocating at the beginning.
+ */
+ s->r_start = rv->r_end + 1;
+ TAILQ_INSERT_BEFORE(s, rv, r_link);
+ } else {
+ DPRINTF(("allocating at the end\n"));
+ /*
+ * We are allocating at the end.
+ */
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ }
+ goto out;
+ }
+ }
+
+ /*
+ * Now find an acceptable shared region, if the client's requirements
+ * allow sharing. By our implementation restriction, a candidate
+ * region must match exactly by both size and sharing type in order
+ * to be considered compatible with the client's request. (The
+ * former restriction could probably be lifted without too much
+ * additional work, but this does not seem warranted.)
+ */
+ DPRINTF(("no unshared regions found\n"));
+ if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0)
+ goto out;
+
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ if (s->r_start > end)
+ break;
+ if ((s->r_flags & flags) != flags)
+ continue;
+ rstart = ulmax(s->r_start, start);
+ rend = ulmin(s->r_end, ulmax(start + count, end));
+ if (s->r_start >= start && s->r_end <= end
+ && (s->r_end - s->r_start + 1) == count &&
+ (s->r_start & amask) == 0 &&
+ ((s->r_start ^ s->r_end) & bmask) == 0) {
+ rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+ if (rv == 0)
+ goto out;
+ rv->r_start = s->r_start;
+ rv->r_end = s->r_end;
+ rv->r_flags = s->r_flags &
+ (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE);
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+ if (s->r_sharehead == 0) {
+ s->r_sharehead = malloc(sizeof *s->r_sharehead,
+ M_RMAN, M_NOWAIT | M_ZERO);
+ if (s->r_sharehead == 0) {
+ free(rv, M_RMAN);
+ rv = 0;
+ goto out;
+ }
+ LIST_INIT(s->r_sharehead);
+ LIST_INSERT_HEAD(s->r_sharehead, s,
+ r_sharelink);
+ s->r_flags |= RF_FIRSTSHARE;
+ }
+ rv->r_sharehead = s->r_sharehead;
+ LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink);
+ goto out;
+ }
+ }
+
+ /*
+ * We couldn't find anything.
+ */
+out:
+ /*
+ * If the user specified RF_ACTIVE in the initial flags,
+ * which is reflected in `want_activate', we attempt to atomically
+ * activate the resource. If this fails, we release the resource
+ * and indicate overall failure. (This behavior probably doesn't
+ * make sense for RF_TIMESHARE-type resources.)
+ */
+ if (rv && want_activate) {
+ struct resource *whohas;
+ if (int_rman_activate_resource(rm, rv, &whohas)) {
+ int_rman_release_resource(rm, rv);
+ rv = 0;
+ }
+ }
+
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+struct resource *
+rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count,
+ u_int flags, struct device *dev)
+{
+
+ return (rman_reserve_resource_bound(rm, start, end, count, 0, flags,
+ dev));
+}
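+
+/*
+ * Illustrative sketch (not compiled): how a hypothetical bus driver might
+ * use the interfaces above to publish an I/O port range and hand out an
+ * aligned sub-range.  The rman, region and device below are made up and
+ * error handling is abbreviated.
+ */
+#if 0
+static struct rman example_rman;
+
+static struct resource *
+example_alloc(struct device *dev)
+{
+ example_rman.rm_type = RMAN_ARRAY;
+ example_rman.rm_descr = "example I/O ports";
+ if (rman_init(&example_rman) != 0 ||
+ rman_manage_region(&example_rman, 0x100, 0x1ff) != 0)
+ return (NULL);
+
+ /* Any 0x10-port range within [0x100, 0x1ff], 16-port aligned. */
+ return (rman_reserve_resource(&example_rman, 0x100, 0x1ff, 0x10,
+ RF_ALIGNMENT_LOG2(4) | RF_ACTIVE, dev));
+}
+#endif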
+
+static int
+int_rman_activate_resource(struct rman *rm, struct resource *r,
+ struct resource **whohas)
+{
+ struct resource *s;
+ int ok;
+
+ /*
+ * If we are not timesharing, then there is nothing much to do.
+ * If we already have the resource, then there is nothing at all to do.
+ * If we are not on a sharing list with anybody else, then there is
+ * little to do.
+ */
+ if ((r->r_flags & RF_TIMESHARE) == 0
+ || (r->r_flags & RF_ACTIVE) != 0
+ || r->r_sharehead == 0) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+
+ ok = 1;
+ for (s = LIST_FIRST(r->r_sharehead); s && ok;
+ s = LIST_NEXT(s, r_sharelink)) {
+ if ((s->r_flags & RF_ACTIVE) != 0) {
+ ok = 0;
+ *whohas = s;
+ }
+ }
+ if (ok) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+ return EBUSY;
+}
+
+int
+rman_activate_resource(struct resource *r)
+{
+ int rv;
+ struct resource *whohas;
+ struct rman *rm;
+
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ mtx_unlock(rm->rm_mtx);
+ return rv;
+}
+
+int
+rman_await_resource(struct resource *r, int pri, int timo)
+{
+ int rv;
+ struct resource *whohas;
+ struct rman *rm;
+
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ for (;;) {
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ if (rv != EBUSY)
+ return (rv); /* returns with mutex held */
+
+ if (r->r_sharehead == 0)
+ panic("rman_await_resource");
+ whohas->r_flags |= RF_WANTED;
+ rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo);
+ if (rv) {
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+ }
+ }
+}
+
+static int
+int_rman_deactivate_resource(struct resource *r)
+{
+ struct rman *rm;
+
+ rm = r->r_rm;
+ r->r_flags &= ~RF_ACTIVE;
+ if (r->r_flags & RF_WANTED) {
+ r->r_flags &= ~RF_WANTED;
+ wakeup(r->r_sharehead);
+ }
+ return 0;
+}
+
+int
+rman_deactivate_resource(struct resource *r)
+{
+ struct rman *rm;
+
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ int_rman_deactivate_resource(r);
+ mtx_unlock(rm->rm_mtx);
+ return 0;
+}
+
+static int
+int_rman_release_resource(struct rman *rm, struct resource *r)
+{
+ struct resource *s, *t;
+
+ if (r->r_flags & RF_ACTIVE)
+ int_rman_deactivate_resource(r);
+
+ /*
+ * Check for a sharing list first. If there is one, then we don't
+ * have to think as hard.
+ */
+ if (r->r_sharehead) {
+ /*
+ * If a sharing list exists, then we know there are at
+ * least two sharers.
+ *
+ * If we are in the main resource list, appoint someone else.
+ */
+ LIST_REMOVE(r, r_sharelink);
+ s = LIST_FIRST(r->r_sharehead);
+ if (r->r_flags & RF_FIRSTSHARE) {
+ s->r_flags |= RF_FIRSTSHARE;
+ TAILQ_INSERT_BEFORE(r, s, r_link);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ }
+
+ /*
+ * Make sure that the sharing list goes away completely
+ * if the resource is no longer being shared at all.
+ */
+ if (LIST_NEXT(s, r_sharelink) == 0) {
+ free(s->r_sharehead, M_RMAN);
+ s->r_sharehead = 0;
+ s->r_flags &= ~RF_FIRSTSHARE;
+ }
+ goto out;
+ }
+
+ /*
+ * Look at the adjacent resources in the list and see if our
+ * segment can be merged with any of them.
+ */
+ s = TAILQ_PREV(r, resource_head, r_link);
+ t = TAILQ_NEXT(r, r_link);
+
+ if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0
+ && t != NULL && (t->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge all three segments.
+ */
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge previous segment with ours.
+ */
+ s->r_end = r->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else if (t != NULL && (t->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge next segment with ours.
+ */
+ t->r_start = r->r_start;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else {
+ /*
+ * At this point, we know there is nothing we
+ * can potentially merge with, because on each
+ * side, there is either nothing there or what is
+ * there is still allocated. In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *r)
+{
+ int rv;
+ struct rman *rm = r->r_rm;
+
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_release_resource(rm, r);
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+uint32_t
+rman_make_alignment_flags(uint32_t size)
+{
+ int i;
+
+ /*
+ * Find the highest bit set, and add one if more than one bit
+ * set. We're effectively computing the ceil(log2(size)) here.
+ */
+ for (i = 31; i > 0; i--)
+ if ((1 << i) & size)
+ break;
+ if (~(1 << i) & size)
+ i++;
+
+ return(RF_ALIGNMENT_LOG2(i));
+}
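+
+/*
+ * Worked example (informative only): rman_make_alignment_flags(0x1000) finds
+ * bit 12 as the only bit set and returns RF_ALIGNMENT_LOG2(12); for 0x1800
+ * the highest set bit is still 12 but further bits are set, so the result is
+ * rounded up to RF_ALIGNMENT_LOG2(13).
+ */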
diff --git a/sys/kern/subr_rtc.c b/sys/kern/subr_rtc.c
new file mode 100644
index 0000000..a79e331
--- /dev/null
+++ b/sys/kern/subr_rtc.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures.
+ */
+
+/*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static __inline int leapyear(int year);
+static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS);
+
+#define FEBRUARY 2
+#define days_in_year(y) (leapyear(y) ? 366 : 365)
+#define days_in_month(y, m) \
+ (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define day_of_week(days) (((days) + 4) % 7)
+
+static const int month_days[12] = {
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+static device_t clock_dev = NULL;
+static long clock_res;
+
+int adjkerntz; /* local offset from GMT in seconds */
+int disable_rtc_set; /* disable resettodr() if != 0 */
+int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */
+
+/*
+ * These have traditionally been in machdep, but should probably be moved to
+ * kern.
+ */
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+ &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
+
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set,
+ CTLFLAG_RW, &disable_rtc_set, 0, "");
+
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock,
+ CTLFLAG_RW, &wall_cmos_clock, 0, "");
+
+static int
+sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
+ req);
+ if (!error && req->newptr)
+ resettodr();
+ return (error);
+}
+
+/*
+ * This inline avoids some unnecessary modulo operations
+ * as compared with the usual macro:
+ * ( ((year % 4) == 0 &&
+ * (year % 100) != 0) ||
+ * ((year % 400) == 0) )
+ * It is otherwise equivalent.
+ */
+static __inline int
+leapyear(int year)
+{
+ int rv = 0;
+
+ if ((year & 3) == 0) {
+ rv = 1;
+ if ((year % 100) == 0) {
+ rv = 0;
+ if ((year % 400) == 0)
+ rv = 1;
+ }
+ }
+ return (rv);
+}
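+
+/*
+ * Worked example (informative only): leapyear(1996) == 1 (divisible by 4,
+ * not by 100), leapyear(1900) == 0 (divisible by 100 but not by 400), and
+ * leapyear(2000) == 1 (divisible by 400).
+ */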
+
+int
+clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
+{
+ time_t secs;
+ int i, year, days;
+
+ year = ct->year;
+
+ /* Sanity checks. */
+ if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
+ ct->day > days_in_month(year, ct->mon) ||
+ ct->hour > 23 || ct->min > 59 || ct->sec > 59 ||
+ ct->year > 2037) /* time_t overflow */
+ return (EINVAL);
+
+ /*
+ * Compute days since start of time
+ * First from years, then from months.
+ */
+ days = 0;
+ for (i = POSIX_BASE_YEAR; i < year; i++)
+ days += days_in_year(i);
+
+ /* Months */
+ for (i = 1; i < ct->mon; i++)
+ days += days_in_month(year, i);
+ days += (ct->day - 1);
+
+ /* Another sanity check. */
+ if (ct->dow != -1 && ct->dow != day_of_week(days))
+ return (EINVAL);
+
+ /* Add hours, minutes, seconds. */
+ secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec;
+
+ ts->tv_sec = secs;
+ ts->tv_nsec = ct->nsec;
+ return (0);
+}
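+
+/*
+ * Illustrative sketch (not compiled): converting a broken-down RTC reading
+ * to a timespec.  The values are made up; 2001-09-09 01:46:40 UTC is
+ * 1000000000 seconds into the POSIX epoch, which makes a handy sanity
+ * check for the conversion above.
+ */
+#if 0
+static int
+example_ct_to_ts(void)
+{
+ struct clocktime ct;
+ struct timespec ts;
+
+ ct.year = 2001; ct.mon = 9; ct.day = 9;
+ ct.hour = 1; ct.min = 46; ct.sec = 40;
+ ct.nsec = 0;
+ ct.dow = -1; /* -1 skips the day-of-week sanity check */
+ if (clock_ct_to_ts(&ct, &ts) != 0)
+ return (EINVAL);
+ KASSERT(ts.tv_sec == 1000000000, ("unexpected conversion result"));
+ return (0);
+}
+#endif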
+
+void
+clock_ts_to_ct(struct timespec *ts, struct clocktime *ct)
+{
+ int i, year, days;
+ time_t rsec; /* remainder seconds */
+ time_t secs;
+
+ secs = ts->tv_sec;
+ days = secs / SECDAY;
+ rsec = secs % SECDAY;
+
+ ct->dow = day_of_week(days);
+
+ /* Subtract out whole years, counting them in year. */
+ for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++)
+ days -= days_in_year(year);
+ ct->year = year;
+
+ /* Subtract out whole months, counting them in i. */
+ for (i = 1; days >= days_in_month(year, i); i++)
+ days -= days_in_month(year, i);
+ ct->mon = i;
+
+ /* Days are what is left over (+1) from all that. */
+ ct->day = days + 1;
+
+ /* Hours, minutes, seconds are easy */
+ ct->hour = rsec / 3600;
+ rsec = rsec % 3600;
+ ct->min = rsec / 60;
+ rsec = rsec % 60;
+ ct->sec = rsec;
+ ct->nsec = ts->tv_nsec;
+}
+
+void
+clock_register(device_t dev, long res)
+{
+
+ if (clock_dev != NULL) {
+ if (clock_res > res) {
+ if (bootverbose) {
+ device_printf(dev, "not installed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(clock_dev));
+ }
+ return;
+ } else {
+ if (bootverbose) {
+ device_printf(clock_dev, "removed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(dev));
+ }
+ }
+ }
+ clock_dev = dev;
+ clock_res = res;
+ if (bootverbose) {
+ device_printf(dev, "registered as a time-of-day clock "
+ "(resolution %ldus)\n", res);
+ }
+}
+
+/*
+ * inittodr and resettodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and
+ * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the system time, using the registered time-of-day clock and
+ * the time base passed in, which comes, e.g., from a filesystem.
+ */
+void
+inittodr(time_t base)
+{
+ struct timespec diff, ref, ts;
+ int error;
+
+ if (base) {
+ ref.tv_sec = base;
+ ref.tv_nsec = 0;
+ tc_setclock(&ref);
+ }
+
+ if (clock_dev == NULL) {
+ printf("warning: no time-of-day clock registered, system time "
+ "will not be set accurately\n");
+ return;
+ }
+ error = CLOCK_GETTIME(clock_dev, &ts);
+ if (error != 0 && error != EINVAL) {
+ printf("warning: clock_gettime failed (%d), the system time "
+ "will not be set accurately\n", error);
+ return;
+ }
+ if (error == EINVAL || ts.tv_sec < 0) {
+ printf("Invalid time in real time clock.\n");
+ printf("Check and reset the date immediately!\n");
+ }
+
+ ts.tv_sec += tz.tz_minuteswest * 60 +
+ (wall_cmos_clock ? adjkerntz : 0);
+
+ if (timespeccmp(&ref, &ts, >)) {
+ diff = ref;
+ timespecsub(&diff, &ts);
+ } else {
+ diff = ts;
+ timespecsub(&diff, &ref);
+ }
+ if (diff.tv_sec >= 2) {
+ /* badly off, adjust it */
+ tc_setclock(&ts);
+ }
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr()
+{
+ struct timespec ts;
+ int error;
+
+ if (disable_rtc_set || clock_dev == NULL)
+ return;
+
+ getnanotime(&ts);
+ ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
+ if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
+ printf("warning: clock_settime failed (%d), time-of-day clock "
+ "not adjusted to system time\n", error);
+ return;
+ }
+}
diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c
new file mode 100644
index 0000000..6c910e6
--- /dev/null
+++ b/sys/kern/subr_sbuf.c
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2000 Poul-Henning Kamp and Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+
+#ifdef _KERNEL
+#include <sys/ctype.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <machine/stdarg.h>
+#else /* _KERNEL */
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif /* _KERNEL */
+
+#include <sys/sbuf.h>
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers");
+#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK)
+#define SBFREE(buf) free(buf, M_SBUF)
+#else /* _KERNEL */
+#define KASSERT(e, m)
+#define SBMALLOC(size) malloc(size)
+#define SBFREE(buf) free(buf)
+#define min(x,y) MIN(x,y)
+#endif /* _KERNEL */
+
+/*
+ * Predicates
+ */
+#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC)
+#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT)
+#define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED)
+#define SBUF_HASOVERFLOWED(s) ((s)->s_flags & SBUF_OVERFLOWED)
+#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1)
+#define SBUF_FREESPACE(s) ((s)->s_size - (s)->s_len - 1)
+#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND)
+
+/*
+ * Set / clear flags
+ */
+#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0)
+#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0)
+
+#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */
+#define SBUF_MAXEXTENDSIZE PAGE_SIZE
+#define SBUF_MAXEXTENDINCR PAGE_SIZE
+
+/*
+ * Debugging support
+ */
+#if defined(_KERNEL) && defined(INVARIANTS)
+static void
+_assert_sbuf_integrity(const char *fun, struct sbuf *s)
+{
+ KASSERT(s != NULL,
+ ("%s called with a NULL sbuf pointer", fun));
+ KASSERT(s->s_buf != NULL,
+ ("%s called with uninitialized or corrupt sbuf", fun));
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
+}
+
+static void
+_assert_sbuf_state(const char *fun, struct sbuf *s, int state)
+{
+ KASSERT((s->s_flags & SBUF_FINISHED) == state,
+ ("%s called with %sfinished or corrupt sbuf", fun,
+ (state ? "un" : "")));
+}
+#define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s))
+#define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i))
+#else /* _KERNEL && INVARIANTS */
+#define assert_sbuf_integrity(s) do { } while (0)
+#define assert_sbuf_state(s, i) do { } while (0)
+#endif /* _KERNEL && INVARIANTS */
+
+static int
+sbuf_extendsize(int size)
+{
+ int newsize;
+
+ newsize = SBUF_MINEXTENDSIZE;
+ while (newsize < size) {
+ if (newsize < SBUF_MAXEXTENDSIZE)
+ newsize *= 2;
+ else
+ newsize += SBUF_MAXEXTENDINCR;
+ }
+
+ return (newsize);
+}
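+
+/*
+ * Worked example (informative only, assuming a 4KB PAGE_SIZE): a request
+ * for 40 bytes doubles 16 -> 32 -> 64 and returns 64; a request for 5000
+ * bytes doubles up to 4096 and then grows linearly by PAGE_SIZE, returning
+ * 8192.
+ */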
+
+
+/*
+ * Extend an sbuf.
+ */
+static int
+sbuf_extend(struct sbuf *s, int addlen)
+{
+ char *newbuf;
+ int newsize;
+
+ if (!SBUF_CANEXTEND(s))
+ return (-1);
+
+ newsize = sbuf_extendsize(s->s_size + addlen);
+ newbuf = (char *)SBMALLOC(newsize);
+ if (newbuf == NULL)
+ return (-1);
+ bcopy(s->s_buf, newbuf, s->s_size);
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ else
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ s->s_buf = newbuf;
+ s->s_size = newsize;
+ return (0);
+}
+
+/*
+ * Initialize an sbuf.
+ * If buf is non-NULL, it points to a static or already-allocated string
+ * big enough to hold at least length characters.
+ */
+struct sbuf *
+sbuf_new(struct sbuf *s, char *buf, int length, int flags)
+{
+ KASSERT(length >= 0,
+ ("attempt to create an sbuf of negative length (%d)", length));
+ KASSERT((flags & ~SBUF_USRFLAGMSK) == 0,
+ ("%s called with invalid flags", __func__));
+
+ flags &= SBUF_USRFLAGMSK;
+ if (s == NULL) {
+ s = (struct sbuf *)SBMALLOC(sizeof *s);
+ if (s == NULL)
+ return (NULL);
+ bzero(s, sizeof *s);
+ s->s_flags = flags;
+ SBUF_SETFLAG(s, SBUF_DYNSTRUCT);
+ } else {
+ bzero(s, sizeof *s);
+ s->s_flags = flags;
+ }
+ s->s_size = length;
+ if (buf) {
+ s->s_buf = buf;
+ return (s);
+ }
+ if (flags & SBUF_AUTOEXTEND)
+ s->s_size = sbuf_extendsize(s->s_size);
+ s->s_buf = (char *)SBMALLOC(s->s_size);
+ if (s->s_buf == NULL) {
+ if (SBUF_ISDYNSTRUCT(s))
+ SBFREE(s);
+ return (NULL);
+ }
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ return (s);
+}
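+
+/*
+ * Illustrative sketch (not compiled): typical dynamic-sbuf usage with the
+ * routines in this file.  The text and values being formatted are made up.
+ */
+#if 0
+static void
+example_sbuf(void)
+{
+ struct sbuf *sb;
+
+ sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
+ if (sb == NULL)
+ return;
+ sbuf_cat(sb, "device: ");
+ sbuf_printf(sb, "unit %d, flags %#x", 3, 0x11);
+ sbuf_finish(sb);
+ printf("%s\n", sbuf_data(sb)); /* only valid after sbuf_finish() */
+ sbuf_delete(sb);
+}
+#endif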
+
+#ifdef _KERNEL
+/*
+ * Create an sbuf with uio data
+ */
+struct sbuf *
+sbuf_uionew(struct sbuf *s, struct uio *uio, int *error)
+{
+ KASSERT(uio != NULL,
+ ("%s called with NULL uio pointer", __func__));
+ KASSERT(error != NULL,
+ ("%s called with NULL error pointer", __func__));
+
+ s = sbuf_new(s, NULL, uio->uio_resid + 1, 0);
+ if (s == NULL) {
+ *error = ENOMEM;
+ return (NULL);
+ }
+ *error = uiomove(s->s_buf, uio->uio_resid, uio);
+ if (*error != 0) {
+ sbuf_delete(s);
+ return (NULL);
+ }
+ s->s_len = s->s_size - 1;
+ *error = 0;
+ return (s);
+}
+#endif
+
+/*
+ * Clear an sbuf and reset its position.
+ */
+void
+sbuf_clear(struct sbuf *s)
+{
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ SBUF_CLEARFLAG(s, SBUF_FINISHED);
+ SBUF_CLEARFLAG(s, SBUF_OVERFLOWED);
+ s->s_len = 0;
+}
+
+/*
+ * Set the sbuf's end position to an arbitrary value.
+ * Effectively truncates the sbuf at the new position.
+ */
+int
+sbuf_setpos(struct sbuf *s, int pos)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(pos >= 0,
+ ("attempt to seek to a negative position (%d)", pos));
+ KASSERT(pos < s->s_size,
+ ("attempt to seek past end of sbuf (%d >= %d)", pos, s->s_size));
+
+ if (pos < 0 || pos > s->s_len)
+ return (-1);
+ s->s_len = pos;
+ return (0);
+}
+
+/*
+ * Append a byte string to an sbuf.
+ */
+int
+sbuf_bcat(struct sbuf *s, const char *str, size_t len)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ for (; len; len--) {
+ if (!SBUF_HASROOM(s) && sbuf_extend(s, len) < 0)
+ break;
+ s->s_buf[s->s_len++] = *str++;
+ }
+ if (len) {
+ SBUF_SETFLAG(s, SBUF_OVERFLOWED);
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Copy a byte string from userland into an sbuf.
+ */
+int
+sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ if (len == 0)
+ return (0);
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len - SBUF_FREESPACE(s));
+ len = min(len, SBUF_FREESPACE(s));
+ }
+ if (copyin(uaddr, s->s_buf + s->s_len, len) != 0)
+ return (-1);
+ s->s_len += len;
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy a byte string into an sbuf.
+ */
+int
+sbuf_bcpy(struct sbuf *s, const char *str, size_t len)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_bcat(s, str, len));
+}
+
+/*
+ * Append a string to an sbuf.
+ */
+int
+sbuf_cat(struct sbuf *s, const char *str)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ while (*str) {
+ if (!SBUF_HASROOM(s) && sbuf_extend(s, strlen(str)) < 0)
+ break;
+ s->s_buf[s->s_len++] = *str++;
+ }
+ if (*str) {
+ SBUF_SETFLAG(s, SBUF_OVERFLOWED);
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Append a string from userland to an sbuf.
+ */
+int
+sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+ size_t done;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ if (len == 0)
+ len = SBUF_FREESPACE(s); /* XXX return 0? */
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len);
+ len = min(len, SBUF_FREESPACE(s));
+ }
+ switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) {
+ case ENAMETOOLONG:
+ SBUF_SETFLAG(s, SBUF_OVERFLOWED);
+ /* fall through */
+ case 0:
+ s->s_len += done - 1;
+ break;
+ default:
+ return (-1); /* XXX */
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy a string into an sbuf.
+ */
+int
+sbuf_cpy(struct sbuf *s, const char *str)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_cat(s, str));
+}
+
+/*
+ * Format the given argument list and append the resulting string to an sbuf.
+ */
+int
+sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
+{
+ int len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(fmt != NULL,
+ ("%s called with a NULL format string", __func__));
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ do {
+ len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1,
+ fmt, ap);
+ } while (len > SBUF_FREESPACE(s) &&
+ sbuf_extend(s, len - SBUF_FREESPACE(s)) == 0);
+
+ /*
+ * s->s_len is the length of the string, without the terminating nul.
+ * When updating s->s_len, we must subtract 1 from the length that
+ * we passed into vsnprintf() because that length includes the
+ * terminating nul.
+ *
+ * vsnprintf() returns the amount that would have been copied,
+ * given sufficient space, hence the min() calculation below.
+ */
+ s->s_len += min(len, SBUF_FREESPACE(s));
+ if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s))
+ SBUF_SETFLAG(s, SBUF_OVERFLOWED);
+
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+ return (0);
+}
+
+/*
+ * Format the given arguments and append the resulting string to an sbuf.
+ */
+int
+sbuf_printf(struct sbuf *s, const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+
+ va_start(ap, fmt);
+ result = sbuf_vprintf(s, fmt, ap);
+ va_end(ap);
+ return(result);
+}
+
+/*
+ * Append a character to an sbuf.
+ */
+int
+sbuf_putc(struct sbuf *s, int c)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ if (!SBUF_HASROOM(s) && sbuf_extend(s, 1) < 0) {
+ SBUF_SETFLAG(s, SBUF_OVERFLOWED);
+ return (-1);
+ }
+ if (c != '\0')
+ s->s_buf[s->s_len++] = c;
+ return (0);
+}
+
+/*
+ * Trim whitespace characters from end of an sbuf.
+ */
+int
+sbuf_trim(struct sbuf *s)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+
+ while (s->s_len && isspace(s->s_buf[s->s_len-1]))
+ --s->s_len;
+
+ return (0);
+}
+
+/*
+ * Check if an sbuf overflowed
+ */
+int
+sbuf_overflowed(struct sbuf *s)
+{
+ return SBUF_HASOVERFLOWED(s);
+}
+
+/*
+ * Finish off an sbuf.
+ */
+void
+sbuf_finish(struct sbuf *s)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ s->s_buf[s->s_len] = '\0';
+ SBUF_CLEARFLAG(s, SBUF_OVERFLOWED);
+ SBUF_SETFLAG(s, SBUF_FINISHED);
+}
+
+/*
+ * Return a pointer to the sbuf data.
+ */
+char *
+sbuf_data(struct sbuf *s)
+{
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, SBUF_FINISHED);
+
+ return s->s_buf;
+}
+
+/*
+ * Return the length of the sbuf data.
+ */
+int
+sbuf_len(struct sbuf *s)
+{
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ if (SBUF_HASOVERFLOWED(s))
+ return (-1);
+ return s->s_len;
+}
+
+/*
+ * Clear an sbuf, free its buffer if necessary.
+ */
+void
+sbuf_delete(struct sbuf *s)
+{
+ int isdyn;
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ isdyn = SBUF_ISDYNSTRUCT(s);
+ bzero(s, sizeof *s);
+ if (isdyn)
+ SBFREE(s);
+}
diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c
new file mode 100644
index 0000000..13f02b8
--- /dev/null
+++ b/sys/kern/subr_scanf.c
@@ -0,0 +1,628 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp
+ * From: static char sccsid[] = "@(#)strtol.c 8.1 (Berkeley) 6/4/93";
+ * From: static char sccsid[] = "@(#)strtoul.c 8.1 (Berkeley) 6/4/93";
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ctype.h>
+#include <machine/limits.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define BUF 32 /* Maximum length of numeric string. */
+
+/*
+ * Flags used during conversion.
+ */
+#define LONG 0x01 /* l: long or double */
+#define SHORT 0x04 /* h: short */
+#define SUPPRESS 0x08 /* suppress assignment */
+#define POINTER 0x10 /* weird %p pointer (`fake hex') */
+#define NOSKIP 0x20 /* do not skip blanks */
+#define QUAD 0x400
+
+/*
+ * The following are used in numeric conversions only:
+ * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ */
+#define SIGNOK 0x40 /* +/- is (still) legal */
+#define NDIGITS 0x80 /* no digits detected */
+
+#define DPTOK 0x100 /* (float) decimal point is still legal */
+#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */
+
+#define PFXOK 0x100 /* 0x prefix is (still) legal */
+#define NZDIGITS 0x200 /* no zero digits detected */
+
+/*
+ * Conversion types.
+ */
+#define CT_CHAR 0 /* %c conversion */
+#define CT_CCL 1 /* %[...] conversion */
+#define CT_STRING 2 /* %s conversion */
+#define CT_INT 3 /* integer, i.e., strtoq or strtouq */
+typedef u_quad_t (*ccfntype)(const char *, char **, int);
+
+static const u_char *__sccl(char *, const u_char *);
+
+int
+sscanf(const char *ibuf, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = vsscanf(ibuf, fmt, ap);
+ va_end(ap);
+ return(ret);
+}
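+
+/*
+ * Illustrative sketch (not compiled): parsing fixed strings with the kernel
+ * sscanf() above.  The input strings and variable names are made up.
+ */
+#if 0
+static void
+example_sscanf(void)
+{
+ char name[16];
+ int addr, unit, n;
+
+ /* "%*s" consumes "at" without assigning it, so n == 2 here, with
+ name == "pci3" and addr == 0xe000. */
+ n = sscanf("pci3 at 0xe000", "%15s %*s %x", name, &addr);
+
+ /* n == 1, unit == 7. */
+ n = sscanf("unit 7", "unit %d", &unit);
+}
+#endif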
+
+int
+vsscanf(const char *inp, char const *fmt0, va_list ap)
+{
+ int inr;
+ const u_char *fmt = (const u_char *)fmt0;
+ int c; /* character from format, or conversion */
+ size_t width; /* field width, or 0 */
+ char *p; /* points into all kinds of strings */
+ int n; /* handy integer */
+ int flags; /* flags as defined above */
+ char *p0; /* saves original value of p when necessary */
+ int nassigned; /* number of fields assigned */
+ int nconversions; /* number of conversions */
+ int nread; /* number of characters consumed from fp */
+ int base; /* base argument to strtoq/strtouq */
+ ccfntype ccfn; /* conversion function (strtoq/strtouq) */
+ char ccltab[256]; /* character class table for %[...] */
+ char buf[BUF]; /* buffer for numeric conversions */
+
+ /* `basefix' is used to avoid `if' tests in the integer scanner */
+ static short basefix[17] =
+ { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+
+ inr = strlen(inp);
+
+ nassigned = 0;
+ nconversions = 0;
+ nread = 0;
+ base = 0; /* XXX just to keep gcc happy */
+ ccfn = NULL; /* XXX just to keep gcc happy */
+ for (;;) {
+ c = *fmt++;
+ if (c == 0)
+ return (nassigned);
+ if (isspace(c)) {
+ while (inr > 0 && isspace(*inp))
+ nread++, inr--, inp++;
+ continue;
+ }
+ if (c != '%')
+ goto literal;
+ width = 0;
+ flags = 0;
+ /*
+ * switch on the format. continue if done;
+ * break once format type is derived.
+ */
+again: c = *fmt++;
+ switch (c) {
+ case '%':
+literal:
+ if (inr <= 0)
+ goto input_failure;
+ if (*inp != c)
+ goto match_failure;
+ inr--, inp++;
+ nread++;
+ continue;
+
+ case '*':
+ flags |= SUPPRESS;
+ goto again;
+ case 'l':
+ flags |= LONG;
+ goto again;
+ case 'q':
+ flags |= QUAD;
+ goto again;
+ case 'h':
+ flags |= SHORT;
+ goto again;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ width = width * 10 + c - '0';
+ goto again;
+
+ /*
+ * Conversions.
+ *
+ */
+ case 'd':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 10;
+ break;
+
+ case 'i':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 0;
+ break;
+
+ case 'o':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 8;
+ break;
+
+ case 'u':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 10;
+ break;
+
+ case 'x':
+ flags |= PFXOK; /* enable 0x prefixing */
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 's':
+ c = CT_STRING;
+ break;
+
+ case '[':
+ fmt = __sccl(ccltab, fmt);
+ flags |= NOSKIP;
+ c = CT_CCL;
+ break;
+
+ case 'c':
+ flags |= NOSKIP;
+ c = CT_CHAR;
+ break;
+
+ case 'p': /* pointer format is like hex */
+ flags |= POINTER | PFXOK;
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 'n':
+ nconversions++;
+ if (flags & SUPPRESS) /* ??? */
+ continue;
+ if (flags & SHORT)
+ *va_arg(ap, short *) = nread;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = nread;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = nread;
+ else
+ *va_arg(ap, int *) = nread;
+ continue;
+ }
+
+ /*
+ * We have a conversion that requires input.
+ */
+ if (inr <= 0)
+ goto input_failure;
+
+ /*
+ * Consume leading white space, except for formats
+ * that suppress this.
+ */
+ if ((flags & NOSKIP) == 0) {
+ while (isspace(*inp)) {
+ nread++;
+ if (--inr > 0)
+ inp++;
+ else
+ goto input_failure;
+ }
+ /*
+ * Note that there is at least one character in
+ * the buffer, so conversions that do not set NOSKIP
+ * can no longer result in an input failure.
+ */
+ }
+
+ /*
+ * Do the conversion.
+ */
+ switch (c) {
+
+ case CT_CHAR:
+ /* scan arbitrary characters (sets NOSKIP) */
+ if (width == 0)
+ width = 1;
+ if (flags & SUPPRESS) {
+ size_t sum = 0;
+ for (;;) {
+ if ((n = inr) < width) {
+ sum += n;
+ width -= n;
+ inp += n;
+ if (sum == 0)
+ goto input_failure;
+ break;
+ } else {
+ sum += width;
+ inr -= width;
+ inp += width;
+ break;
+ }
+ }
+ nread += sum;
+ } else {
+ bcopy(inp, va_arg(ap, char *), width);
+ inr -= width;
+ inp += width;
+ nread += width;
+ nassigned++;
+ }
+ nconversions++;
+ break;
+
+ case CT_CCL:
+ /* scan a (nonempty) character class (sets NOSKIP) */
+ if (width == 0)
+ width = (size_t)~0; /* `infinity' */
+ /* take only those things in the class */
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (ccltab[(unsigned char)*inp]) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (n == 0)
+ goto input_failure;
+ break;
+ }
+ }
+ if (n == 0)
+ goto match_failure;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (ccltab[(unsigned char)*inp]) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (p == p0)
+ goto input_failure;
+ break;
+ }
+ }
+ n = p - p0;
+ if (n == 0)
+ goto match_failure;
+ *p = 0;
+ nassigned++;
+ }
+ nread += n;
+ nconversions++;
+ break;
+
+ case CT_STRING:
+ /* like CCL, but zero-length string OK, & no NOSKIP */
+ if (width == 0)
+ width = (size_t)~0;
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (!isspace(*inp)) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ nread += n;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (!isspace(*inp)) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ *p = 0;
+ nread += p - p0;
+ nassigned++;
+ }
+ nconversions++;
+ continue;
+
+ case CT_INT:
+ /* scan an integer as if by strtoq/strtouq */
+#ifdef hardway
+ if (width == 0 || width > sizeof(buf) - 1)
+ width = sizeof(buf) - 1;
+#else
+ /* size_t is unsigned, hence this optimisation */
+ if (--width > sizeof(buf) - 2)
+ width = sizeof(buf) - 2;
+ width++;
+#endif
+ flags |= SIGNOK | NDIGITS | NZDIGITS;
+ for (p = buf; width; width--) {
+ c = *inp;
+ /*
+ * Switch on the character; `goto ok'
+ * if we accept it as a part of number.
+ */
+ switch (c) {
+
+ /*
+ * The digit 0 is always legal, but is
+ * special. For %i conversions, if no
+ * digits (zero or nonzero) have been
+ * scanned (only signs), we will have
+ * base==0. In that case, we should set
+ * it to 8 and enable 0x prefixing.
+ * Also, if we have not scanned zero digits
+ * before this, do not turn off prefixing
+ * (someone else will turn it off if we
+ * have scanned any nonzero digits).
+ */
+ case '0':
+ if (base == 0) {
+ base = 8;
+ flags |= PFXOK;
+ }
+ if (flags & NZDIGITS)
+ flags &= ~(SIGNOK|NZDIGITS|NDIGITS);
+ else
+ flags &= ~(SIGNOK|PFXOK|NDIGITS);
+ goto ok;
+
+ /* 1 through 7 always legal */
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ base = basefix[base];
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* digits 8 and 9 ok iff decimal or hex */
+ case '8': case '9':
+ base = basefix[base];
+ if (base <= 8)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* letters ok iff hex */
+ case 'A': case 'B': case 'C':
+ case 'D': case 'E': case 'F':
+ case 'a': case 'b': case 'c':
+ case 'd': case 'e': case 'f':
+ /* no need to fix base here */
+ if (base <= 10)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* sign ok only as first character */
+ case '+': case '-':
+ if (flags & SIGNOK) {
+ flags &= ~SIGNOK;
+ goto ok;
+ }
+ break;
+
+ /* x ok iff flag still set & 2nd char */
+ case 'x': case 'X':
+ if (flags & PFXOK && p == buf + 1) {
+ base = 16; /* if %i */
+ flags &= ~PFXOK;
+ goto ok;
+ }
+ break;
+ }
+
+ /*
+ * If we got here, c is not a legal character
+ * for a number. Stop accumulating digits.
+ */
+ break;
+ ok:
+ /*
+ * c is legal: store it and look at the next.
+ */
+ *p++ = c;
+ if (--inr > 0)
+ inp++;
+ else
+ break; /* end of input */
+ }
+ /*
+ * If we had only a sign, it is no good; push
+ * back the sign. If the number ends in `x',
+ * it was [sign] '0' 'x', so push back the x
+ * and treat it as [sign] '0'.
+ */
+ if (flags & NDIGITS) {
+ if (p > buf) {
+ inp--;
+ inr++;
+ }
+ goto match_failure;
+ }
+ c = ((u_char *)p)[-1];
+ if (c == 'x' || c == 'X') {
+ --p;
+ inp--;
+ inr++;
+ }
+ if ((flags & SUPPRESS) == 0) {
+ u_quad_t res;
+
+ *p = 0;
+ res = (*ccfn)(buf, (char **)NULL, base);
+ if (flags & POINTER)
+ *va_arg(ap, void **) =
+ (void *)(uintptr_t)res;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = res;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = res;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = res;
+ else
+ *va_arg(ap, int *) = res;
+ nassigned++;
+ }
+ nread += p - buf;
+ nconversions++;
+ break;
+
+ }
+ }
+input_failure:
+ return (nconversions != 0 ? nassigned : -1);
+match_failure:
+ return (nassigned);
+}
+
+/*
+ * Fill in the given table from the scanset at the given format
+ * (just after `['). Return a pointer to the character past the
+ * closing `]'. The table has a 1 wherever characters should be
+ * considered part of the scanset.
+ */
+static const u_char *
+__sccl(char *tab, const u_char *fmt)
+{
+ int c, n, v;
+
+ /* first `clear' the whole table */
+ c = *fmt++; /* first char hat => negated scanset */
+ if (c == '^') {
+ v = 1; /* default => accept */
+ c = *fmt++; /* get new first char */
+ } else
+ v = 0; /* default => reject */
+
+ /* XXX: Will not work if sizeof(tab*) > sizeof(char) */
+ for (n = 0; n < 256; n++)
+ tab[n] = v; /* memset(tab, v, 256) */
+
+ if (c == 0)
+ return (fmt - 1);/* format ended before closing ] */
+
+ /*
+ * Now set the entries corresponding to the actual scanset
+ * to the opposite of the above.
+ *
+ * The first character may be ']' (or '-') without being special;
+ * the last character may be '-'.
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+ * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
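+
+/*
+ * Worked example (informative only): for the format "%[a-f0-9]", __sccl()
+ * sets tab[] to 1 for 'a'..'f' and '0'..'9' and to 0 everywhere else, so
+ * CT_CCL above accepts exactly the lower-case hex digits.  For "%[^ ]" the
+ * leading '^' inverts the table, accepting every character except a space.
+ */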
+
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..9dad93b
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2001
+ * John Baldwin <jhb@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This module holds the global variables and machine independent functions
+ * used for the kernel SMP support.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/smp.h>
+
+volatile u_int stopped_cpus;
+volatile u_int started_cpus;
+
+void (*cpustop_restartfunc)(void);
+int mp_ncpus;
+
+volatile int smp_started;
+u_int all_cpus;
+u_int mp_maxid;
+
+SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD, NULL, "Kernel SMP");
+
+int smp_active = 0; /* are the APs allowed to run? */
+SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0, "");
+
+int smp_cpus = 1; /* how many CPUs are running */
+SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, "");
+
+/* Enable forwarding of a signal to a process running on a different CPU */
+static int forward_signal_enabled = 1;
+SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
+ &forward_signal_enabled, 0, "");
+
+/* Enable forwarding of roundrobin to all other cpus */
+static int forward_roundrobin_enabled = 1;
+SYSCTL_INT(_kern_smp, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
+ &forward_roundrobin_enabled, 0, "");
+
+/* Variables needed for SMP rendezvous. */
+static void (*smp_rv_setup_func)(void *arg);
+static void (*smp_rv_action_func)(void *arg);
+static void (*smp_rv_teardown_func)(void *arg);
+static void *smp_rv_func_arg;
+static volatile int smp_rv_waiters[2];
+static struct mtx smp_rv_mtx;
+static int mp_probe_status;
+
+/*
+ * Initialize MI SMP variables.
+ */
+static void
+mp_probe(void *dummy)
+{
+ mp_probe_status = cpu_mp_probe();
+}
+SYSINIT(cpu_mp_probe, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_probe, NULL)
+
+/*
+ * Call the MD SMP initialization code.
+ */
+static void
+mp_start(void *dummy)
+{
+
+ /* Probe for MP hardware. */
+ if (mp_probe_status == 0)
+ return;
+
+ mtx_init(&smp_rv_mtx, "smp rendezvous", NULL, MTX_SPIN);
+ cpu_mp_start();
+ printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
+ mp_ncpus);
+ cpu_mp_announce();
+}
+SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_SECOND, mp_start, NULL)
+
+void
+forward_signal(struct thread *td)
+{
+ int id;
+
+ /*
+ * signotify() has already set KEF_ASTPENDING and PS_NEEDSIGCHECK on
+ * this process, so all we need to do is poke it if it is currently
+ * executing so that it executes ast().
+ */
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(td->td_proc->p_stat == SRUN,
+ ("forward_signal: process is not SRUN"));
+
+ CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
+
+ if (!smp_started || cold || panicstr)
+ return;
+ if (!forward_signal_enabled)
+ return;
+
+ /* No need to IPI ourself. */
+ if (td == curthread)
+ return;
+
+ id = td->td_kse->ke_oncpu;
+ if (id == NOCPU)
+ return;
+ ipi_selected(1 << id, IPI_AST);
+}
+
+void
+forward_roundrobin(void)
+{
+ struct pcpu *pc;
+ struct thread *td;
+ u_int id, map;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ CTR0(KTR_SMP, "forward_roundrobin()");
+
+ if (!smp_started || cold || panicstr)
+ return;
+ if (!forward_roundrobin_enabled)
+ return;
+ map = 0;
+ SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+ td = pc->pc_curthread;
+ id = pc->pc_cpumask;
+ if (id != PCPU_GET(cpumask) && (id & stopped_cpus) == 0 &&
+ td != pc->pc_idlethread) {
+ td->td_kse->ke_flags |= KEF_NEEDRESCHED;
+ map |= id;
+ }
+ }
+ ipi_selected(map, IPI_AST);
+}
+
+/*
+ * When called the executing CPU will send an IPI to all other CPUs
+ * requesting that they halt execution.
+ *
+ * Usually (but not necessarily) called with 'other_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to stop.
+ * - Waits for each to stop.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ *
+ * XXX FIXME: this is not MP-safe; it needs a lock to prevent multiple CPUs
+ * from executing it at the same time.
+ */
+int
+stop_cpus(u_int map)
+{
+ int i;
+
+ if (!smp_started)
+ return 0;
+
+ CTR1(KTR_SMP, "stop_cpus(%x)", map);
+
+ /* send the stop IPI to all CPUs in map */
+ ipi_selected(map, IPI_STOP);
+
+ i = 0;
+ while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+ /* spin */
+ i++;
+#ifdef DIAGNOSTIC
+ if (i == 100000) {
+ printf("timeout stopping cpus\n");
+ break;
+ }
+#endif
+ }
+
+ return 1;
+}
+
+
+/*
+ * Called by a CPU to restart stopped CPUs.
+ *
+ * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to restart.
+ * - Waits for each to restart.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ */
+int
+restart_cpus(u_int map)
+{
+
+ if (!smp_started)
+ return 0;
+
+ CTR1(KTR_SMP, "restart_cpus(%x)", map);
+
+ /* signal other cpus to restart */
+ atomic_store_rel_int(&started_cpus, map);
+
+ /* wait for each to clear its bit */
+ while ((atomic_load_acq_int(&stopped_cpus) & map) != 0)
+ ; /* nothing */
+
+ return 1;
+}
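+
+/*
+ * Illustrative sketch (not compiled): halting the other CPUs around an
+ * operation that must not race with them, then releasing them.  The bitmask
+ * construction follows forward_roundrobin() above; the operation itself is
+ * hypothetical.
+ */
+#if 0
+static void
+example_stop_restart(void)
+{
+ u_int map;
+
+ map = all_cpus & ~PCPU_GET(cpumask);
+ if (stop_cpus(map)) {
+ /* ... touch state the other CPUs must not see half-done ... */
+ restart_cpus(stopped_cpus);
+ }
+}
+#endif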
+
+/*
+ * All-CPU rendezvous. CPUs are signalled, all execute the setup function
+ * (if specified), rendezvous, execute the action function (if specified),
+ * rendezvous again, execute the teardown function (if specified), and then
+ * resume.
+ *
+ * Note that the supplied external functions _must_ be reentrant and aware
+ * that they are running in parallel and in an unknown lock context.
+ */
+void
+smp_rendezvous_action(void)
+{
+
+ /* setup function */
+ if (smp_rv_setup_func != NULL)
+ smp_rv_setup_func(smp_rv_func_arg);
+ /* spin on entry rendezvous */
+ atomic_add_int(&smp_rv_waiters[0], 1);
+ while (atomic_load_acq_int(&smp_rv_waiters[0]) < mp_ncpus)
+ ; /* nothing */
+ /* action function */
+ if (smp_rv_action_func != NULL)
+ smp_rv_action_func(smp_rv_func_arg);
+ /* spin on exit rendezvous */
+ atomic_add_int(&smp_rv_waiters[1], 1);
+ while (atomic_load_acq_int(&smp_rv_waiters[1]) < mp_ncpus)
+ ; /* nothing */
+ /* teardown function */
+ if (smp_rv_teardown_func != NULL)
+ smp_rv_teardown_func(smp_rv_func_arg);
+}
+
+void
+smp_rendezvous(void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+
+ if (!smp_started) {
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ return;
+ }
+
+ /* obtain rendezvous lock */
+ mtx_lock_spin(&smp_rv_mtx);
+
+ /* set static function pointers */
+ smp_rv_setup_func = setup_func;
+ smp_rv_action_func = action_func;
+ smp_rv_teardown_func = teardown_func;
+ smp_rv_func_arg = arg;
+ smp_rv_waiters[0] = 0;
+ smp_rv_waiters[1] = 0;
+
+ /* signal other processors, which will enter the IPI with interrupts off */
+ ipi_all_but_self(IPI_RENDEZVOUS);
+
+ /* call executor function */
+ smp_rendezvous_action();
+
+ /* release lock */
+ mtx_unlock_spin(&smp_rv_mtx);
+}
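+
+/*
+ * Illustrative sketch (not compiled): using smp_rendezvous() to run an
+ * action once on every CPU.  The counter and action are made up; the final
+ * count equals mp_ncpus only once the system is past SMP startup.
+ */
+#if 0
+static volatile u_int example_visits;
+
+static void
+example_action(void *arg __unused)
+{
+
+ /* Runs on each CPU, with all CPUs synchronized around it. */
+ atomic_add_int(&example_visits, 1);
+}
+
+static void
+example_rendezvous(void)
+{
+
+ example_visits = 0;
+ smp_rendezvous(NULL, example_action, NULL, NULL);
+ /* example_visits == mp_ncpus once all CPUs have resumed. */
+}
+#endif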
diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c
new file mode 100644
index 0000000..19a93ad
--- /dev/null
+++ b/sys/kern/subr_taskqueue.c
@@ -0,0 +1,223 @@
+/*-
+ * Copyright (c) 2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/taskqueue.h>
+
+static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues");
+
+static STAILQ_HEAD(taskqueue_list, taskqueue) taskqueue_queues;
+
+static void *taskqueue_ih;
+static struct mtx taskqueue_queues_mutex;
+
+struct taskqueue {
+ STAILQ_ENTRY(taskqueue) tq_link;
+ STAILQ_HEAD(, task) tq_queue;
+ const char *tq_name;
+ taskqueue_enqueue_fn tq_enqueue;
+ void *tq_context;
+ int tq_draining;
+ struct mtx tq_mutex;
+};
+
+static void init_taskqueue_list(void *data);
+
+static void
+init_taskqueue_list(void *data __unused)
+{
+
+ mtx_init(&taskqueue_queues_mutex, "taskqueue list", NULL, MTX_DEF);
+ STAILQ_INIT(&taskqueue_queues);
+}
+SYSINIT(taskqueue_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_taskqueue_list,
+ NULL);
+
+struct taskqueue *
+taskqueue_create(const char *name, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context)
+{
+ struct taskqueue *queue;
+
+ queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO);
+ if (!queue)
+ return 0;
+
+ STAILQ_INIT(&queue->tq_queue);
+ queue->tq_name = name;
+ queue->tq_enqueue = enqueue;
+ queue->tq_context = context;
+ queue->tq_draining = 0;
+ mtx_init(&queue->tq_mutex, "taskqueue", NULL, MTX_DEF);
+
+ mtx_lock(&taskqueue_queues_mutex);
+ STAILQ_INSERT_TAIL(&taskqueue_queues, queue, tq_link);
+ mtx_unlock(&taskqueue_queues_mutex);
+
+ return queue;
+}
+
+void
+taskqueue_free(struct taskqueue *queue)
+{
+
+ mtx_lock(&queue->tq_mutex);
+ queue->tq_draining = 1;
+ mtx_unlock(&queue->tq_mutex);
+
+ taskqueue_run(queue);
+
+ mtx_lock(&taskqueue_queues_mutex);
+ STAILQ_REMOVE(&taskqueue_queues, queue, taskqueue, tq_link);
+ mtx_unlock(&taskqueue_queues_mutex);
+
+ mtx_destroy(&queue->tq_mutex);
+ free(queue, M_TASKQUEUE);
+}
+
+/*
+ * Returns with the taskqueue locked.
+ */
+struct taskqueue *
+taskqueue_find(const char *name)
+{
+ struct taskqueue *queue;
+
+ mtx_lock(&taskqueue_queues_mutex);
+ STAILQ_FOREACH(queue, &taskqueue_queues, tq_link) {
+ mtx_lock(&queue->tq_mutex);
+ if (!strcmp(queue->tq_name, name)) {
+ mtx_unlock(&taskqueue_queues_mutex);
+ return queue;
+ }
+ mtx_unlock(&queue->tq_mutex);
+ }
+ mtx_unlock(&taskqueue_queues_mutex);
+ return 0;
+}
+
+int
+taskqueue_enqueue(struct taskqueue *queue, struct task *task)
+{
+ struct task *ins;
+ struct task *prev;
+
+ mtx_lock(&queue->tq_mutex);
+
+ /*
+ * Don't allow new tasks on a queue which is being freed.
+ */
+ if (queue->tq_draining) {
+ mtx_unlock(&queue->tq_mutex);
+ return EPIPE;
+ }
+
+ /*
+ * Count multiple enqueues.
+ */
+ if (task->ta_pending) {
+ task->ta_pending++;
+ mtx_unlock(&queue->tq_mutex);
+ return 0;
+ }
+
+ /*
+ * Optimise the case when all tasks have the same priority.
+ */
+ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link);
+ if (!prev || prev->ta_priority >= task->ta_priority) {
+ STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link);
+ } else {
+ prev = 0;
+ for (ins = STAILQ_FIRST(&queue->tq_queue); ins;
+ prev = ins, ins = STAILQ_NEXT(ins, ta_link))
+ if (ins->ta_priority < task->ta_priority)
+ break;
+
+ if (prev)
+ STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link);
+ else
+ STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link);
+ }
+
+ task->ta_pending = 1;
+ if (queue->tq_enqueue)
+ queue->tq_enqueue(queue->tq_context);
+
+ mtx_unlock(&queue->tq_mutex);
+
+ return 0;
+}
+
+void
+taskqueue_run(struct taskqueue *queue)
+{
+ struct task *task;
+ int pending;
+
+ mtx_lock(&queue->tq_mutex);
+ while (STAILQ_FIRST(&queue->tq_queue)) {
+ /*
+ * Carefully remove the first task from the queue and
+ * zero its pending count.
+ */
+ task = STAILQ_FIRST(&queue->tq_queue);
+ STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
+ pending = task->ta_pending;
+ task->ta_pending = 0;
+ mtx_unlock(&queue->tq_mutex);
+
+ task->ta_func(task->ta_context, pending);
+
+ mtx_lock(&queue->tq_mutex);
+ }
+ mtx_unlock(&queue->tq_mutex);
+}
+
+static void
+taskqueue_swi_enqueue(void *context)
+{
+ swi_sched(taskqueue_ih, 0);
+}
+
+static void
+taskqueue_swi_run(void *dummy)
+{
+ taskqueue_run(taskqueue_swi);
+}
+
+TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, 0,
+ swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, 0,
+ &taskqueue_ih));
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
new file mode 100644
index 0000000..3b415de
--- /dev/null
+++ b/sys/kern/subr_trap.c
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ * $FreeBSD$
+ */
+
+#ifdef __i386__
+#include "opt_npx.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/systm.h>
+#include <sys/vmmeter.h>
+#include <machine/cpu.h>
+#include <machine/pcb.h>
+
+/*
+ * Define the code needed before returning to user mode, for
+ * trap and syscall.
+ *
+ * MPSAFE
+ */
+void
+userret(td, frame, oticks)
+ struct thread *td;
+ struct trapframe *frame;
+ u_int oticks;
+{
+ struct proc *p = td->td_proc;
+ struct kse *ke = td->td_kse;
+ struct ksegrp *kg = td->td_ksegrp;
+
+#ifdef INVARIANTS
+ /* Check that we called signotify() enough. */
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ mtx_lock_spin(&sched_lock);
+ if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 ||
+ (p->p_kse.ke_flags & KEF_ASTPENDING) == 0))
+ printf("failed to set signal flags proprly for ast()\n");
+ mtx_unlock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+#endif
+
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ if (td->td_priority != kg->kg_user_pri) {
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = kg->kg_user_pri;
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ /*
+ * Charge system time if profiling.
+ *
+ * XXX should move PS_PROFIL to a place that can obviously be
+ * accessed safely without sched_lock.
+ */
+ if (p->p_sflag & PS_PROFIL) {
+ quad_t ticks;
+
+ mtx_lock_spin(&sched_lock);
+ ticks = ke->ke_sticks - oticks;
+ mtx_unlock_spin(&sched_lock);
+ addupc_task(ke, TRAPF_PC(frame), (u_int)ticks * psratio);
+ }
+}
+
+/*
+ * Process an asynchronous software trap.
+ * This is relatively easy.
+ * This function will return with preemption disabled.
+ */
+void
+ast(framep)
+ struct trapframe *framep;
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ struct kse *ke = td->td_kse;
+ struct ksegrp *kg = td->td_ksegrp;
+ u_int prticks, sticks;
+ int sflag;
+ int flags;
+ int sig;
+#if defined(DEV_NPX) && !defined(SMP)
+ int ucode;
+#endif
+
+ KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
+#ifdef WITNESS
+ if (witness_list(td))
+ panic("Returning to user mode with mutex(s) held");
+#endif
+ mtx_assert(&Giant, MA_NOTOWNED);
+ mtx_assert(&sched_lock, MA_NOTOWNED);
+ prticks = 0; /* XXX: Quiet warning. */
+ td->td_frame = framep;
+ /*
+ * This updates the p_sflag's for the checks below in one
+ * "atomic" operation with turning off the astpending flag.
+ * If another AST is triggered while we are handling the
+ * AST's saved in sflag, the astpending flag will be set and
+ * ast() will be called again.
+ */
+ mtx_lock_spin(&sched_lock);
+ sticks = ke->ke_sticks;
+ sflag = p->p_sflag;
+ flags = ke->ke_flags;
+ p->p_sflag &= ~(PS_ALRMPEND | PS_NEEDSIGCHK | PS_PROFPEND);
+ ke->ke_flags &= ~(KEF_ASTPENDING | KEF_NEEDRESCHED | KEF_OWEUPC);
+ cnt.v_soft++;
+ if (flags & KEF_OWEUPC && sflag & PS_PROFIL) {
+ prticks = p->p_stats->p_prof.pr_ticks;
+ p->p_stats->p_prof.pr_ticks = 0;
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+ if (flags & KEF_OWEUPC && sflag & PS_PROFIL)
+ addupc_task(ke, p->p_stats->p_prof.pr_addr, prticks);
+ if (sflag & PS_ALRMPEND) {
+ PROC_LOCK(p);
+ psignal(p, SIGVTALRM);
+ PROC_UNLOCK(p);
+ }
+#if defined(DEV_NPX) && !defined(SMP)
+ if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) {
+ atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags,
+ PCB_NPXTRAP);
+ ucode = npxtrap();
+ if (ucode != -1) {
+ trapsignal(p, SIGFPE, ucode);
+ }
+ }
+#endif
+ if (sflag & PS_PROFPEND) {
+ PROC_LOCK(p);
+ psignal(p, SIGPROF);
+ PROC_UNLOCK(p);
+ }
+ if (flags & KEF_NEEDRESCHED) {
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = kg->kg_user_pri;
+ setrunqueue(td);
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ mtx_unlock_spin(&sched_lock);
+ }
+ if (sflag & PS_NEEDSIGCHK) {
+ PROC_LOCK(p);
+ while ((sig = cursig(p)) != 0)
+ postsig(sig);
+ PROC_UNLOCK(p);
+ }
+
+ userret(td, framep, sticks);
+ mtx_assert(&Giant, MA_NOTOWNED);
+}
diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c
new file mode 100644
index 0000000..08bca8d
--- /dev/null
+++ b/sys/kern/subr_turnstile.c
@@ -0,0 +1,986 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Machine independent bits of mutex implementation.
+ */
+
+#include "opt_adaptive_mutexes.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/stdint.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+/*
+ * Internal utility macros.
+ */
+#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED)
+
+#define mtx_owner(m) (mtx_unowned((m)) ? NULL \
+ : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))
+
+/* XXXKSE This test will change. */
+#define thread_running(td) \
+ ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU)
+
+/*
+ * Lock classes for sleep and spin mutexes.
+ */
+struct lock_class lock_class_mtx_sleep = {
+ "sleep mutex",
+ LC_SLEEPLOCK | LC_RECURSABLE
+};
+struct lock_class lock_class_mtx_spin = {
+ "spin mutex",
+ LC_SPINLOCK | LC_RECURSABLE
+};
+
+/*
+ * System-wide mutexes
+ */
+struct mtx sched_lock;
+struct mtx Giant;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static void propagate_priority(struct thread *);
+
+static void
+propagate_priority(struct thread *td)
+{
+ int pri = td->td_priority;
+ struct mtx *m = td->td_blocked;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ for (;;) {
+ struct thread *td1;
+
+ td = mtx_owner(m);
+
+ if (td == NULL) {
+ /*
+ * This really isn't quite right. Really
+ * ought to bump priority of thread that
+ * next acquires the mutex.
+ */
+ MPASS(m->mtx_lock == MTX_CONTESTED);
+ return;
+ }
+
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex"));
+ if (td->td_priority <= pri) /* lower is higher priority */
+ return;
+
+ /*
+ * Bump this thread's priority.
+ */
+ td->td_priority = pri;
+
+ /*
+ * If lock holder is actually running, just bump priority.
+ */
+ if (thread_running(td)) {
+ MPASS(td->td_proc->p_stat == SRUN
+ || td->td_proc->p_stat == SZOMB
+ || td->td_proc->p_stat == SSTOP);
+ return;
+ }
+
+#ifndef SMP
+ /*
+		 * For UP, we check to see if td is curthread (this shouldn't
+		 * ever happen, however, as it would mean we are in a deadlock).
+ */
+ KASSERT(td != curthread, ("Deadlock detected"));
+#endif
+
+ /*
+ * If on run queue move to new run queue, and quit.
+ * XXXKSE this gets a lot more complicated under threads
+ * but try anyhow.
+ */
+ if (td->td_proc->p_stat == SRUN) {
+ MPASS(td->td_blocked == NULL);
+ remrunqueue(td);
+ setrunqueue(td);
+ return;
+ }
+
+ /*
+ * If we aren't blocked on a mutex, we should be.
+ */
+ KASSERT(td->td_proc->p_stat == SMTX, (
+ "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat,
+ m->mtx_object.lo_name));
+
+ /*
+ * Pick up the mutex that td is blocked on.
+ */
+ m = td->td_blocked;
+ MPASS(m != NULL);
+
+ /*
+ * Check if the thread needs to be moved up on
+ * the blocked chain
+ */
+ if (td == TAILQ_FIRST(&m->mtx_blocked)) {
+ continue;
+ }
+
+ td1 = TAILQ_PREV(td, threadqueue, td_blkq);
+ if (td1->td_priority <= pri) {
+ continue;
+ }
+
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved up to. Since we know that td1 has
+ * a lower priority than td, we know that at least one
+ * thread in the chain has a lower priority and that
+ * td1 will thus not be NULL after the loop.
+ */
+ TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq);
+ TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) {
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (td1->td_priority > pri)
+ break;
+ }
+
+ MPASS(td1 != NULL);
+ TAILQ_INSERT_BEFORE(td1, td, td_blkq);
+ CTR4(KTR_LOCK,
+ "propagate_priority: p %p moved before %p on [%p] %s",
+ td, td1, m, m->mtx_object.lo_name);
+ }
+}
+
+#ifdef MUTEX_PROFILING
+SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
+SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
+static int mutex_prof_enable = 0;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
+ &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");
+
+struct mutex_prof {
+ const char *name;
+ const char *file;
+ int line;
+#define MPROF_MAX 0
+#define MPROF_TOT 1
+#define MPROF_CNT 2
+#define MPROF_AVG 3
+ uintmax_t counter[4];
+ struct mutex_prof *next;
+};
+
+/*
+ * mprof_buf is a static pool of profiling records to avoid possible
+ * reentrance of the memory allocation functions.
+ *
+ * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
+ */
+#define NUM_MPROF_BUFFERS 1000
+static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
+static int first_free_mprof_buf;
+#define MPROF_HASH_SIZE 1009
+static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];
+
+static int mutex_prof_acquisitions;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
+ &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded");
+static int mutex_prof_records;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
+ &mutex_prof_records, 0, "Number of profiling records");
+static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
+ &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
+static int mutex_prof_rejected;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
+ &mutex_prof_rejected, 0, "Number of rejected profiling records");
+static int mutex_prof_hashsize = MPROF_HASH_SIZE;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
+ &mutex_prof_hashsize, 0, "Hash size");
+static int mutex_prof_collisions = 0;
+SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
+ &mutex_prof_collisions, 0, "Number of hash collisions");
+
+/*
+ * mprof_mtx protects the profiling buffers and the hash.
+ */
+static struct mtx mprof_mtx;
+MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
+
+static u_int64_t
+nanoseconds(void)
+{
+ struct timespec tv;
+
+ nanotime(&tv);
+ return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
+}
+
+static int
+dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ int error, i;
+
+ if (first_free_mprof_buf == 0)
+ return SYSCTL_OUT(req, "No locking recorded",
+ sizeof("No locking recorded"));
+
+ sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND);
+ sbuf_printf(sb, "%12s %12s %12s %12s %s\n",
+ "max", "total", "count", "average", "name");
+ mtx_lock_spin(&mprof_mtx);
+ for (i = 0; i < first_free_mprof_buf; ++i)
+ sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n",
+ mprof_buf[i].counter[MPROF_MAX] / 1000,
+ mprof_buf[i].counter[MPROF_TOT] / 1000,
+ mprof_buf[i].counter[MPROF_CNT],
+ mprof_buf[i].counter[MPROF_AVG] / 1000,
+ mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
+ mtx_unlock_spin(&mprof_mtx);
+ sbuf_finish(sb);
+ error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+ sbuf_delete(sb);
+ return (error);
+}
+SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
+#endif
+
+/*
+ * Function versions of the inlined __mtx_* macros. These are used by
+ * modules and can also be called from assembly language if needed.
+ */
+void
+_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ _get_sleep_lock(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+#ifdef MUTEX_PROFILING
+ /* don't reset the timer when/if recursing */
+ if (m->acqtime == 0) {
+ m->file = file;
+ m->line = line;
+ m->acqtime = mutex_prof_enable ? nanoseconds() : 0;
+ ++mutex_prof_acquisitions;
+ }
+#endif
+}
+
+void
+_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ mtx_assert(m, MA_OWNED);
+#ifdef MUTEX_PROFILING
+ if (m->acqtime != 0) {
+ static const char *unknown = "(unknown)";
+ struct mutex_prof *mpp;
+ u_int64_t acqtime, now;
+ const char *p, *q;
+ volatile u_int hash;
+
+ now = nanoseconds();
+ acqtime = m->acqtime;
+ m->acqtime = 0;
+ if (now <= acqtime)
+ goto out;
+ for (p = file; strncmp(p, "../", 3) == 0; p += 3)
+ /* nothing */ ;
+ if (p == NULL || *p == '\0')
+ p = unknown;
+ for (hash = line, q = p; *q != '\0'; ++q)
+ hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
+ mtx_lock_spin(&mprof_mtx);
+ for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
+ if (mpp->line == line && strcmp(mpp->file, p) == 0)
+ break;
+ if (mpp == NULL) {
+ /* Just exit if we cannot get a trace buffer */
+ if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
+ ++mutex_prof_rejected;
+ goto unlock;
+ }
+ mpp = &mprof_buf[first_free_mprof_buf++];
+ mpp->name = mtx_name(m);
+ mpp->file = p;
+ mpp->line = line;
+ mpp->next = mprof_hash[hash];
+ if (mprof_hash[hash] != NULL)
+ ++mutex_prof_collisions;
+ mprof_hash[hash] = mpp;
+ ++mutex_prof_records;
+ }
+ /*
+ * Record if the mutex has been held longer now than ever
+ * before
+ */
+ if ((now - acqtime) > mpp->counter[MPROF_MAX])
+ mpp->counter[MPROF_MAX] = now - acqtime;
+ mpp->counter[MPROF_TOT] += now - acqtime;
+ mpp->counter[MPROF_CNT] += 1;
+ mpp->counter[MPROF_AVG] =
+ mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT];
+unlock:
+ mtx_unlock_spin(&mprof_mtx);
+ }
+out:
+#endif
+ WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ _rel_sleep_lock(m, curthread, opts, file, line);
+}
+
+void
+_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+#if defined(SMP) || LOCK_DEBUG > 0
+ _get_spin_lock(m, curthread, opts, file, line);
+#else
+ critical_enter();
+#endif
+ LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+void
+_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line)
+{
+
+ MPASS(curthread != NULL);
+ mtx_assert(m, MA_OWNED);
+ WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+ line);
+#if defined(SMP) || LOCK_DEBUG > 0
+ _rel_spin_lock(m);
+#else
+ critical_exit();
+#endif
+}
+
+/*
+ * The important part of mtx_trylock{,_flags}()
+ * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that
+ * if we're called, it's because we know we don't already own this lock.
+ */
+int
+_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
+{
+ int rval;
+
+ MPASS(curthread != NULL);
+
+ rval = _obtain_lock(m, curthread);
+
+ LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
+ if (rval) {
+ /*
+ * We do not handle recursion in _mtx_trylock; see the
+ * note at the top of the routine.
+ */
+ KASSERT(!mtx_recursed(m),
+ ("mtx_trylock() called on a recursed mutex"));
+ WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ }
+
+ return (rval);
+}
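+
+/*
+ * Illustrative usage sketch: a hypothetical caller uses mtx_trylock() when it
+ * must not block and defers the work if the lock is busy.  example_mtx and
+ * example_poll() are invented names; the fragment is disabled and only shows
+ * the calling convention.
+ */
+#if 0
+static struct mtx example_mtx;
+
+static int
+example_poll(void)
+{
+
+	if (!mtx_trylock(&example_mtx))
+		return (EBUSY);		/* Busy; the caller retries later. */
+	/* ... touch the state protected by example_mtx ... */
+	mtx_unlock(&example_mtx);
+	return (0);
+}
+#endif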
+
+/*
+ * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
+ *
+ * We call this if the lock is either contested (i.e. we need to go to
+ * sleep waiting for it), or if we need to recurse on it.
+ */
+void
+_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
+{
+ struct thread *td = curthread;
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ struct thread *owner;
+#endif
+
+ if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) {
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
+ return;
+ }
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR4(KTR_LOCK,
+ "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
+ m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);
+
+ while (!_obtain_lock(m, td)) {
+ uintptr_t v;
+ struct thread *td1;
+
+ mtx_lock_spin(&sched_lock);
+ /*
+ * Check if the lock has been released while spinning for
+ * the sched_lock.
+ */
+ if ((v = m->mtx_lock) == MTX_UNOWNED) {
+ mtx_unlock_spin(&sched_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+
+ /*
+ * The mutex was marked contested on release. This means that
+ * there are threads blocked on it.
+ */
+ if (v == MTX_CONTESTED) {
+ td1 = TAILQ_FIRST(&m->mtx_blocked);
+ MPASS(td1 != NULL);
+ m->mtx_lock = (uintptr_t)td | MTX_CONTESTED;
+
+ if (td1->td_priority < td->td_priority)
+ td->td_priority = td1->td_priority;
+ mtx_unlock_spin(&sched_lock);
+ return;
+ }
+
+ /*
+ * If the mutex isn't already contested and a failure occurs
+ * setting the contested bit, the mutex was either released
+ * or the state of the MTX_RECURSED bit changed.
+ */
+ if ((v & MTX_CONTESTED) == 0 &&
+ !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
+ (void *)(v | MTX_CONTESTED))) {
+ mtx_unlock_spin(&sched_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ /*
+ * If the current owner of the lock is executing on another
+ * CPU, spin instead of blocking.
+ */
+ owner = (struct thread *)(v & MTX_FLAGMASK);
+ if (m != &Giant && thread_running(owner)) {
+ mtx_unlock_spin(&sched_lock);
+ while (mtx_owner(m) == owner && thread_running(owner)) {
+#ifdef __i386__
+ ia32_pause();
+#endif
+ }
+ continue;
+ }
+#endif /* SMP && ADAPTIVE_MUTEXES */
+
+ /*
+ * We definitely must sleep for this lock.
+ */
+ mtx_assert(m, MA_NOTOWNED);
+
+#ifdef notyet
+ /*
+ * If we're borrowing an interrupted thread's VM context, we
+ * must clean up before going to sleep.
+ */
+ if (td->td_ithd != NULL) {
+ struct ithd *it = td->td_ithd;
+
+ if (it->it_interrupted) {
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_lock_sleep: %p interrupted %p",
+ it, it->it_interrupted);
+ intr_thd_fixup(it);
+ }
+ }
+#endif
+
+ /*
+ * Put us on the list of threads blocked on this mutex.
+ */
+ if (TAILQ_EMPTY(&m->mtx_blocked)) {
+ td1 = mtx_owner(m);
+ LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested);
+ TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
+ } else {
+ TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq)
+ if (td1->td_priority > td->td_priority)
+ break;
+ if (td1)
+ TAILQ_INSERT_BEFORE(td1, td, td_blkq);
+ else
+ TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
+ }
+
+ /*
+ * Save who we're blocked on.
+ */
+ td->td_blocked = m;
+ td->td_mtxname = m->mtx_object.lo_name;
+ td->td_proc->p_stat = SMTX;
+ propagate_priority(td);
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR3(KTR_LOCK,
+ "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m,
+ m->mtx_object.lo_name);
+
+ td->td_proc->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR3(KTR_LOCK,
+ "_mtx_lock_sleep: p %p free from blocked on [%p] %s",
+ td, m, m->mtx_object.lo_name);
+
+ mtx_unlock_spin(&sched_lock);
+ }
+
+ return;
+}
+
+/*
+ * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
+ *
+ * This is only called if we need to actually spin for the lock. Recursion
+ * is handled inline.
+ */
+void
+_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line)
+{
+ int i = 0;
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
+
+ for (;;) {
+ if (_obtain_lock(m, curthread))
+ break;
+
+ /* Give interrupts a chance while we spin. */
+ critical_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000) {
+#ifdef __i386__
+ ia32_pause();
+#endif
+ continue;
+ }
+ if (i < 60000000)
+ DELAY(1);
+#ifdef DDB
+ else if (!db_active)
+#else
+ else
+#endif
+ panic("spin lock %s held by %p for > 5 seconds",
+ m->mtx_object.lo_name, (void *)m->mtx_lock);
+#ifdef __i386__
+ ia32_pause();
+#endif
+ }
+ critical_enter();
+ }
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
+
+ return;
+}
+
+/*
+ * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
+ *
+ * We are only called here if the lock is recursed or contested (i.e. we
+ * need to wake up a blocked thread).
+ */
+void
+_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
+{
+ struct thread *td, *td1;
+ struct mtx *m1;
+ int pri;
+
+ td = curthread;
+
+ if (mtx_recursed(m)) {
+ if (--(m->mtx_recurse) == 0)
+ atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
+ return;
+ }
+
+ mtx_lock_spin(&sched_lock);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
+
+ td1 = TAILQ_FIRST(&m->mtx_blocked);
+#if defined(SMP) && defined(ADAPTIVE_MUTEXES)
+ if (td1 == NULL) {
+ _release_lock_quick(m);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
+ mtx_unlock_spin(&sched_lock);
+ return;
+ }
+#endif
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+
+ TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq);
+
+ if (TAILQ_EMPTY(&m->mtx_blocked)) {
+ LIST_REMOVE(m, mtx_contested);
+ _release_lock_quick(m);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
+ } else
+ atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
+
+ pri = PRI_MAX;
+ LIST_FOREACH(m1, &td->td_contested, mtx_contested) {
+ int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+
+ if (pri > td->td_base_pri)
+ pri = td->td_base_pri;
+ td->td_priority = pri;
+
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p",
+ m, td1);
+
+ td1->td_blocked = NULL;
+ td1->td_proc->p_stat = SRUN;
+ setrunqueue(td1);
+
+ if (td->td_critnest == 1 && td1->td_priority < pri) {
+#ifdef notyet
+ if (td->td_ithd != NULL) {
+ struct ithd *it = td->td_ithd;
+
+ if (it->it_interrupted) {
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_unlock_sleep: %p interrupted %p",
+ it, it->it_interrupted);
+ intr_thd_fixup(it);
+ }
+ }
+#endif
+ setrunqueue(td);
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK,
+ "_mtx_unlock_sleep: %p switching out lock=%p", m,
+ (void *)m->mtx_lock);
+
+ td->td_proc->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ if (LOCK_LOG_TEST(&m->mtx_object, opts))
+ CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
+ m, (void *)m->mtx_lock);
+ }
+
+ mtx_unlock_spin(&sched_lock);
+
+ return;
+}
+
+/*
+ * All the unlocking of MTX_SPIN locks is done inline.
+ * See the _rel_spin_lock() macro for the details.
+ */
+
+/*
+ * The backing function for the INVARIANTS-enabled mtx_assert()
+ */
+#ifdef INVARIANT_SUPPORT
+void
+_mtx_assert(struct mtx *m, int what, const char *file, int line)
+{
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case MA_OWNED:
+ case MA_OWNED | MA_RECURSED:
+ case MA_OWNED | MA_NOTRECURSED:
+ if (!mtx_owned(m))
+ panic("mutex %s not owned at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ if (mtx_recursed(m)) {
+ if ((what & MA_NOTRECURSED) != 0)
+ panic("mutex %s recursed at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ } else if ((what & MA_RECURSED) != 0) {
+ panic("mutex %s unrecursed at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ }
+ break;
+ case MA_NOTOWNED:
+ if (mtx_owned(m))
+ panic("mutex %s owned at %s:%d",
+ m->mtx_object.lo_name, file, line);
+ break;
+ default:
+ panic("unknown mtx_assert at %s:%d", file, line);
+ }
+}
+#endif
+
+/*
+ * The MUTEX_DEBUG-enabled mtx_validate()
+ *
+ * Most of these checks have been moved off into the LO_INITIALIZED flag
+ * maintained by the witness code.
+ */
+#ifdef MUTEX_DEBUG
+
+void mtx_validate(struct mtx *);
+
+void
+mtx_validate(struct mtx *m)
+{
+
+/*
+ * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
+ * we can re-enable the kernacc() checks.
+ */
+#ifndef __alpha__
+ /*
+ * Can't call kernacc() from early init386(), especially when
+ * initializing Giant mutex, because some stuff in kernacc()
+ * requires Giant itself.
+ */
+ if (!cold)
+ if (!kernacc((caddr_t)m, sizeof(m),
+ VM_PROT_READ | VM_PROT_WRITE))
+ panic("Can't read and write to mutex %p", m);
+#endif
+}
+#endif
+
+/*
+ * General init routine used by the MTX_SYSINIT() macro.
+ */
+void
+mtx_sysinit(void *arg)
+{
+ struct mtx_args *margs = arg;
+
+ mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts);
+}
+
+/*
+ * Mutex initialization routine; initialize lock `m' with options contained
+ * in `opts' and name `name.'  The optional
+ * lock type `type' is used as a general lock category name for use with
+ * witness.
+ */
+void
+mtx_init(struct mtx *m, const char *name, const char *type, int opts)
+{
+ struct lock_object *lock;
+
+ MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
+ MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0);
+
+#ifdef MUTEX_DEBUG
+ /* Diagnostic and error correction */
+ mtx_validate(m);
+#endif
+
+ lock = &m->mtx_object;
+ KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
+ ("mutex %s %p already initialized", name, m));
+ bzero(m, sizeof(*m));
+ if (opts & MTX_SPIN)
+ lock->lo_class = &lock_class_mtx_spin;
+ else
+ lock->lo_class = &lock_class_mtx_sleep;
+ lock->lo_name = name;
+ lock->lo_type = type != NULL ? type : name;
+ if (opts & MTX_QUIET)
+ lock->lo_flags = LO_QUIET;
+ if (opts & MTX_RECURSE)
+ lock->lo_flags |= LO_RECURSABLE;
+ if (opts & MTX_SLEEPABLE)
+ lock->lo_flags |= LO_SLEEPABLE;
+ if ((opts & MTX_NOWITNESS) == 0)
+ lock->lo_flags |= LO_WITNESS;
+ if (opts & MTX_DUPOK)
+ lock->lo_flags |= LO_DUPOK;
+
+ m->mtx_lock = MTX_UNOWNED;
+ TAILQ_INIT(&m->mtx_blocked);
+
+ LOCK_LOG_INIT(lock, opts);
+
+ WITNESS_INIT(lock);
+}
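+
+/*
+ * Illustrative usage sketch: a mutex is normally set up either with an
+ * explicit mtx_init() from subsystem initialization code or with
+ * MTX_SYSINIT(), which arranges for mtx_sysinit() above to run during boot.
+ * example_lock and example_modevent() are invented names; the fragment is
+ * disabled and only shows the calling convention.
+ */
+#if 0
+static struct mtx example_lock;
+MTX_SYSINIT(example_lock, &example_lock, "example lock", MTX_DEF);
+
+/* Equivalent explicit form, e.g. from a module event handler: */
+static void
+example_modevent(void)
+{
+
+	mtx_init(&example_lock, "example lock", NULL, MTX_DEF);
+}
+#endif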
+
+/*
+ * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be
+ * passed in as a flag here because if the corresponding mtx_init() was
+ * called with MTX_QUIET set, then it will already be set in the mutex's
+ * flags.
+ */
+void
+mtx_destroy(struct mtx *m)
+{
+
+ LOCK_LOG_DESTROY(&m->mtx_object, 0);
+
+ if (!mtx_owned(m))
+ MPASS(mtx_unowned(m));
+ else {
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+
+ /* Tell witness this isn't locked to make it happy. */
+ WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
+ __LINE__);
+ }
+
+ WITNESS_DESTROY(&m->mtx_object);
+}
+
+/*
+ * Initialize the mutex code and system mutexes.  This is called from the MD
+ * startup code prior to mi_startup().  The per-CPU data space needs to be
+ * set up before this is called.
+ */
+void
+mutex_init(void)
+{
+
+ /* Setup thread0 so that mutexes work. */
+ LIST_INIT(&thread0.td_contested);
+
+ /*
+ * Initialize mutexes.
+ */
+ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_lock(&Giant);
+}
+
+/*
+ * Encapsulated Giant mutex routines. These routines provide encapsulation
+ * control for the Giant mutex, allowing sysctls to be used to turn on and
+ * off Giant around certain subsystems.  The default values for the sysctls
+ * are set to what developers believe is stable and working in regards to
+ * the Giant pushdown. Developers should not turn off Giant via these
+ * sysctls unless they know what they are doing.
+ *
+ * Callers of mtx_lock_giant() are expected to pass the return value to an
+ * accompanying mtx_unlock_giant() later on. If multiple subsystems are
+ * affected by a Giant wrap, all related sysctl variables must be zero for
+ * the subsystem call to operate without Giant (as determined by the caller).
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation");
+
+static int kern_giant_all = 0;
+SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, "");
+
+int kern_giant_proc = 1; /* Giant around PROC locks */
+int kern_giant_file = 1; /* Giant around struct file & filedesc */
+int kern_giant_ucred = 1; /* Giant around ucred */
+SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, "");
+SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, "");
+SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, "");
+
+int
+mtx_lock_giant(int sysctlvar)
+{
+ if (sysctlvar || kern_giant_all) {
+ mtx_lock(&Giant);
+ return(1);
+ }
+ return(0);
+}
+
+void
+mtx_unlock_giant(int s)
+{
+ if (s)
+ mtx_unlock(&Giant);
+}
+
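+/*
+ * Illustrative usage sketch: as described above, a caller wraps a section
+ * that may still need Giant and passes the value returned by
+ * mtx_lock_giant() to the matching mtx_unlock_giant().  example_file_op()
+ * is an invented name; the fragment is disabled and only shows the calling
+ * convention.
+ */
+#if 0
+static void
+example_file_op(void)
+{
+	int giant;
+
+	giant = mtx_lock_giant(kern_giant_file);
+	/* ... operate on struct file / filedesc state ... */
+	mtx_unlock_giant(giant);
+}
+#endif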
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
new file mode 100644
index 0000000..182221d
--- /dev/null
+++ b/sys/kern/subr_witness.c
@@ -0,0 +1,1488 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Implementation of the `witness' lock verifier. Originally implemented for
+ * mutexes in BSD/OS. Extended to handle generic lock objects and lock
+ * classes in FreeBSD.
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+#include "opt_ddb.h"
+#include "opt_witness.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <ddb/ddb.h>
+
+#define WITNESS_COUNT 200
+#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4)
+/*
+ * XXX: This is somewhat bogus, as we assume here that at most 1024 threads
+ * will hold LOCK_NCHILDREN * 2 locks. We handle failure ok, and we should
+ * probably be safe for the most part, but it's still a SWAG.
+ */
+#define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2
+
+#define WITNESS_NCHILDREN 6
+
+struct witness_child_list_entry;
+
+struct witness {
+ const char *w_name;
+ struct lock_class *w_class;
+ STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
+ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
+ struct witness_child_list_entry *w_children; /* Great evilness... */
+ const char *w_file;
+ int w_line;
+ u_int w_level;
+ u_int w_refcount;
+ u_char w_Giant_squawked:1;
+ u_char w_other_squawked:1;
+ u_char w_same_squawked:1;
+};
+
+struct witness_child_list_entry {
+ struct witness_child_list_entry *wcl_next;
+ struct witness *wcl_children[WITNESS_NCHILDREN];
+ u_int wcl_count;
+};
+
+STAILQ_HEAD(witness_list, witness);
+
+struct witness_blessed {
+ const char *b_lock1;
+ const char *b_lock2;
+};
+
+struct witness_order_list_entry {
+ const char *w_name;
+ struct lock_class *w_class;
+};
+
+static struct witness *enroll(const char *description,
+ struct lock_class *lock_class);
+static int itismychild(struct witness *parent, struct witness *child);
+static void removechild(struct witness *parent, struct witness *child);
+static int isitmychild(struct witness *parent, struct witness *child);
+static int isitmydescendant(struct witness *parent, struct witness *child);
+static int blessed(struct witness *, struct witness *);
+static void witness_display_list(void(*prnt)(const char *fmt, ...),
+ struct witness_list *list);
+static void witness_displaydescendants(void(*)(const char *fmt, ...),
+ struct witness *);
+static void witness_leveldescendents(struct witness *parent, int level);
+static void witness_levelall(void);
+static struct witness *witness_get(void);
+static void witness_free(struct witness *m);
+static struct witness_child_list_entry *witness_child_get(void);
+static void witness_child_free(struct witness_child_list_entry *wcl);
+static struct lock_list_entry *witness_lock_list_get(void);
+static void witness_lock_list_free(struct lock_list_entry *lle);
+static void witness_display(void(*)(const char *fmt, ...));
+static struct lock_instance *find_instance(struct lock_list_entry *lock_list,
+ struct lock_object *lock);
+
+MALLOC_DEFINE(M_WITNESS, "witness", "witness structure");
+
+static int witness_watch = 1;
+TUNABLE_INT("debug.witness_watch", &witness_watch);
+SYSCTL_INT(_debug, OID_AUTO, witness_watch, CTLFLAG_RD, &witness_watch, 0, "");
+
+#ifdef DDB
+/*
+ * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
+ * drop into kdebug() when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifdef WITNESS_DDB
+int witness_ddb = 1;
+#else
+int witness_ddb = 0;
+#endif
+TUNABLE_INT("debug.witness_ddb", &witness_ddb);
+SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
+#endif /* DDB */
+
+#ifdef WITNESS_SKIPSPIN
+int witness_skipspin = 1;
+#else
+int witness_skipspin = 0;
+#endif
+TUNABLE_INT("debug.witness_skipspin", &witness_skipspin);
+SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
+ "");
+
+static struct mtx w_mtx;
+static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
+static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
+static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
+static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
+static struct witness_child_list_entry *w_child_free = NULL;
+static struct lock_list_entry *w_lock_list_free = NULL;
+static int witness_dead; /* fatal error, probably no memory */
+
+static struct witness w_data[WITNESS_COUNT];
+static struct witness_child_list_entry w_childdata[WITNESS_CHILDCOUNT];
+static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
+
+static struct witness_order_list_entry order_lists[] = {
+ { "Giant", &lock_class_mtx_sleep },
+ { "proctree", &lock_class_sx },
+ { "allproc", &lock_class_sx },
+ { "sigio lock", &lock_class_mtx_sleep },
+ { "process group", &lock_class_mtx_sleep },
+ { "process lock", &lock_class_mtx_sleep },
+ { "session", &lock_class_mtx_sleep },
+ { "uidinfo hash", &lock_class_mtx_sleep },
+ { "uidinfo struct", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * spin locks
+ */
+#ifdef SMP
+ { "ap boot", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "com", &lock_class_mtx_spin },
+#endif
+#endif
+ { "sio", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "cy", &lock_class_mtx_spin },
+#endif
+ { "ng_node", &lock_class_mtx_spin },
+ { "ng_worklist", &lock_class_mtx_spin },
+ { "ithread table lock", &lock_class_mtx_spin },
+ { "sched lock", &lock_class_mtx_spin },
+ { "callout", &lock_class_mtx_spin },
+ /*
+ * leaf locks
+ */
+ { "allpmaps", &lock_class_mtx_spin },
+ { "vm page buckets mutex", &lock_class_mtx_spin },
+ { "icu", &lock_class_mtx_spin },
+#ifdef SMP
+ { "smp rendezvous", &lock_class_mtx_spin },
+#endif
+ { "clk", &lock_class_mtx_spin },
+ { "mutex profiling lock", &lock_class_mtx_spin },
+ { NULL, NULL },
+ { NULL, NULL }
+};
+
+/*
+ * Pairs of locks which have been blessed.
+ * Don't complain about order problems with blessed locks.
+ */
+static struct witness_blessed blessed_list[] = {
+};
+static int blessed_count =
+ sizeof(blessed_list) / sizeof(struct witness_blessed);
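+
+/*
+ * Illustrative sketch: a blessed pair would be listed as below to silence a
+ * known, intentional ordering between two specific locks.  The lock names
+ * are invented; the fragment is disabled and only shows the table format.
+ */
+#if 0
+static struct witness_blessed blessed_list[] = {
+	{ "example lock A", "example lock B" },
+};
+#endif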
+
+/*
+ * List of all locks in the system.
+ */
+TAILQ_HEAD(, lock_object) all_locks = TAILQ_HEAD_INITIALIZER(all_locks);
+
+static struct mtx all_mtx = {
+ { &lock_class_mtx_sleep, /* mtx_object.lo_class */
+ "All locks list", /* mtx_object.lo_name */
+ "All locks list", /* mtx_object.lo_type */
+ LO_INITIALIZED, /* mtx_object.lo_flags */
+ { NULL, NULL }, /* mtx_object.lo_list */
+ NULL }, /* mtx_object.lo_witness */
+ MTX_UNOWNED, 0, /* mtx_lock, mtx_recurse */
+ TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
+ { NULL, NULL } /* mtx_contested */
+};
+
+/*
+ * This global is set to 0 once it becomes safe to use the witness code.
+ */
+static int witness_cold = 1;
+
+/*
+ * Global variables for book keeping.
+ */
+static int lock_cur_cnt;
+static int lock_max_cnt;
+
+/*
+ * The WITNESS-enabled diagnostic code.
+ */
+static void
+witness_initialize(void *dummy __unused)
+{
+ struct lock_object *lock;
+ struct witness_order_list_entry *order;
+ struct witness *w, *w1;
+ int i;
+
+ /*
+ * We have to release Giant before initializing its witness
+ * structure so that WITNESS doesn't get confused.
+ */
+ mtx_unlock(&Giant);
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
+ TAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list);
+ mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
+ MTX_NOWITNESS);
+ for (i = 0; i < WITNESS_COUNT; i++)
+ witness_free(&w_data[i]);
+ for (i = 0; i < WITNESS_CHILDCOUNT; i++)
+ witness_child_free(&w_childdata[i]);
+ for (i = 0; i < LOCK_CHILDCOUNT; i++)
+ witness_lock_list_free(&w_locklistdata[i]);
+
+ /* First add in all the specified order lists. */
+ for (order = order_lists; order->w_name != NULL; order++) {
+ w = enroll(order->w_name, order->w_class);
+ if (w == NULL)
+ continue;
+ w->w_file = "order list";
+ for (order++; order->w_name != NULL; order++) {
+ w1 = enroll(order->w_name, order->w_class);
+ if (w1 == NULL)
+ continue;
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+
+ /* Iterate through all locks and add them to witness. */
+ mtx_lock(&all_mtx);
+ TAILQ_FOREACH(lock, &all_locks, lo_list) {
+ if (lock->lo_flags & LO_WITNESS)
+ lock->lo_witness = enroll(lock->lo_type,
+ lock->lo_class);
+ else
+ lock->lo_witness = NULL;
+ }
+ mtx_unlock(&all_mtx);
+
+ /* Mark the witness code as being ready for use. */
+ atomic_store_rel_int(&witness_cold, 0);
+
+ mtx_lock(&Giant);
+}
+SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL)
+
+void
+witness_init(struct lock_object *lock)
+{
+ struct lock_class *class;
+
+ class = lock->lo_class;
+ if (lock->lo_flags & LO_INITIALIZED)
+ panic("%s: lock (%s) %s is already initialized", __func__,
+ class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
+ (class->lc_flags & LC_RECURSABLE) == 0)
+ panic("%s: lock (%s) %s can not be recursable", __func__,
+ class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (class->lc_flags & LC_SLEEPABLE) == 0)
+ panic("%s: lock (%s) %s can not be sleepable", __func__,
+ class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
+ (class->lc_flags & LC_UPGRADABLE) == 0)
+ panic("%s: lock (%s) %s can not be upgradable", __func__,
+ class->lc_name, lock->lo_name);
+
+ mtx_lock(&all_mtx);
+ TAILQ_INSERT_TAIL(&all_locks, lock, lo_list);
+ lock->lo_flags |= LO_INITIALIZED;
+ lock_cur_cnt++;
+ if (lock_cur_cnt > lock_max_cnt)
+ lock_max_cnt = lock_cur_cnt;
+ mtx_unlock(&all_mtx);
+ if (!witness_cold && !witness_dead && panicstr == NULL &&
+ (lock->lo_flags & LO_WITNESS) != 0)
+ lock->lo_witness = enroll(lock->lo_type, class);
+ else
+ lock->lo_witness = NULL;
+}
+
+void
+witness_destroy(struct lock_object *lock)
+{
+ struct witness *w;
+
+ if (witness_cold)
+ panic("lock (%s) %s destroyed while witness_cold",
+ lock->lo_class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_INITIALIZED) == 0)
+ panic("%s: lock (%s) %s is not initialized", __func__,
+ lock->lo_class->lc_name, lock->lo_name);
+
+ /* XXX: need to verify that no one holds the lock */
+ w = lock->lo_witness;
+ if (w != NULL) {
+ mtx_lock_spin(&w_mtx);
+ MPASS(w->w_refcount > 0);
+ w->w_refcount--;
+ mtx_unlock_spin(&w_mtx);
+ }
+
+ mtx_lock(&all_mtx);
+ lock_cur_cnt--;
+ TAILQ_REMOVE(&all_locks, lock, lo_list);
+ lock->lo_flags &= ~LO_INITIALIZED;
+ mtx_unlock(&all_mtx);
+}
+
+static void
+witness_display_list(void(*prnt)(const char *fmt, ...),
+ struct witness_list *list)
+{
+ struct witness *w, *w1;
+ int found;
+
+ STAILQ_FOREACH(w, list, w_typelist) {
+ if (w->w_file == NULL)
+ continue;
+ found = 0;
+ STAILQ_FOREACH(w1, list, w_typelist) {
+ if (isitmychild(w1, w)) {
+ found++;
+ break;
+ }
+ }
+ if (found)
+ continue;
+ /*
+		 * This lock has no ancestors; display its descendants.
+ */
+ witness_displaydescendants(prnt, w);
+ }
+}
+
+static void
+witness_display(void(*prnt)(const char *fmt, ...))
+{
+ struct witness *w;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+ witness_levelall();
+
+ /*
+ * First, handle sleep locks which have been acquired at least
+ * once.
+ */
+ prnt("Sleep locks:\n");
+ witness_display_list(prnt, &w_sleep);
+
+ /*
+ * Now do spin locks which have been acquired at least once.
+ */
+ prnt("\nSpin locks:\n");
+ witness_display_list(prnt, &w_spin);
+
+ /*
+ * Finally, any locks which have not been acquired yet.
+ */
+ prnt("\nLocks which were never acquired:\n");
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ if (w->w_file != NULL || w->w_refcount == 0)
+ continue;
+ prnt("%s\n", w->w_name);
+ }
+}
+
+void
+witness_lock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *lock1, *lock2;
+ struct lock_class *class;
+ struct witness *w, *w1;
+ struct thread *td;
+ int i, j;
+#ifdef DDB
+ int go_into_ddb = 0;
+#endif /* DDB */
+
+ if (witness_cold || witness_dead || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+ w = lock->lo_witness;
+ class = lock->lo_class;
+ td = curthread;
+
+ if (class->lc_flags & LC_SLEEPLOCK) {
+ /*
+ * Since spin locks include a critical section, this check
+		 * implicitly enforces a lock order of all sleep locks before
+ * all spin locks.
+ */
+ if (td->td_critnest != 0 && (flags & LOP_TRYLOCK) == 0)
+ panic("blockable sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ lock_list = &td->td_sleeplocks;
+ } else
+ lock_list = PCPU_PTR(spinlocks);
+
+ /*
+ * Try locks do not block if they fail to acquire the lock, thus
+ * there is no danger of deadlocks or of switching while holding a
+ * spin lock if we acquire a lock via a try operation.
+ */
+ if (flags & LOP_TRYLOCK)
+ goto out;
+
+ /*
+ * Is this the first lock acquired? If so, then no order checking
+ * is needed.
+ */
+ if (*lock_list == NULL)
+ goto out;
+
+ /*
+ * Check to see if we are recursing on a lock we already own.
+ */
+ lock1 = find_instance(*lock_list, lock);
+ if (lock1 != NULL) {
+ if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf("shared lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name, file, line);
+ printf("while exclusively locked from %s:%d\n",
+ lock1->li_file, lock1->li_line);
+ panic("share->excl");
+ }
+ if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf("exclusive lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name, file, line);
+ printf("while share locked from %s:%d\n",
+ lock1->li_file, lock1->li_line);
+ panic("excl->share");
+ }
+ lock1->li_flags++;
+ if ((lock->lo_flags & LO_RECURSABLE) == 0) {
+ printf(
+ "recursed on non-recursive lock (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name, file, line);
+ printf("first acquired @ %s:%d\n", lock1->li_file,
+ lock1->li_line);
+ panic("recurse");
+ }
+ CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
+ td->td_proc->p_pid, lock->lo_name,
+ lock1->li_flags & LI_RECURSEMASK);
+ lock1->li_file = file;
+ lock1->li_line = line;
+ return;
+ }
+
+ /*
+ * Check for duplicate locks of the same type. Note that we only
+ * have to check for this on the last lock we just acquired. Any
+ * other cases will be caught as lock order violations.
+ */
+ lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1];
+ w1 = lock1->li_lock->lo_witness;
+ if (w1 == w) {
+ if (w->w_same_squawked || (lock->lo_flags & LO_DUPOK))
+ goto out;
+ w->w_same_squawked = 1;
+ printf("acquiring duplicate lock of same type: \"%s\"\n",
+ lock->lo_type);
+ printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name,
+ lock1->li_file, lock1->li_line);
+ printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line);
+#ifdef DDB
+ go_into_ddb = 1;
+#endif /* DDB */
+ goto out;
+ }
+ MPASS(!mtx_owned(&w_mtx));
+ mtx_lock_spin(&w_mtx);
+ /*
+	 * If we have a known higher number, just say ok.
+ */
+ if (witness_watch > 1 && w->w_level > w1->w_level) {
+ mtx_unlock_spin(&w_mtx);
+ goto out;
+ }
+ if (isitmydescendant(w1, w)) {
+ mtx_unlock_spin(&w_mtx);
+ goto out;
+ }
+ for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) {
+ for (i = lle->ll_count - 1; i >= 0; i--, j++) {
+
+ MPASS(j < WITNESS_COUNT);
+ lock1 = &lle->ll_children[i];
+ w1 = lock1->li_lock->lo_witness;
+
+ /*
+ * If this lock doesn't undergo witness checking,
+ * then skip it.
+ */
+ if (w1 == NULL) {
+ KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
+ ("lock missing witness structure"));
+ continue;
+ }
+ /*
+ * If we are locking Giant and we slept with this
+ * lock, then skip it.
+ */
+ if ((lock1->li_flags & LI_SLEPT) != 0 &&
+ lock == &Giant.mtx_object)
+ continue;
+ /*
+ * If we are locking a sleepable lock and this lock
+ * isn't sleepable and isn't Giant, we want to treat
+			 * it as a lock order violation to enforce a general
+ * lock order of sleepable locks before non-sleepable
+ * locks. Thus, we only bother checking the lock
+ * order hierarchy if we pass the initial test.
+ */
+ if (!((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
+ lock1->li_lock != &Giant.mtx_object)) &&
+ !isitmydescendant(w, w1))
+ continue;
+ /*
+ * We have a lock order violation, check to see if it
+ * is allowed or has already been yelled about.
+ */
+ mtx_unlock_spin(&w_mtx);
+ if (blessed(w, w1))
+ goto out;
+ if (lock1->li_lock == &Giant.mtx_object) {
+ if (w1->w_Giant_squawked)
+ goto out;
+ else
+ w1->w_Giant_squawked = 1;
+ } else {
+ if (w1->w_other_squawked)
+ goto out;
+ else
+ w1->w_other_squawked = 1;
+ }
+ /*
+ * Ok, yell about it.
+ */
+ printf("lock order reversal\n");
+ /*
+ * Try to locate an earlier lock with
+ * witness w in our list.
+ */
+ do {
+ lock2 = &lle->ll_children[i];
+ MPASS(lock2->li_lock != NULL);
+ if (lock2->li_lock->lo_witness == w)
+ break;
+ i--;
+ if (i == 0 && lle->ll_next != NULL) {
+ lle = lle->ll_next;
+ i = lle->ll_count - 1;
+ MPASS(i != 0);
+ }
+ } while (i >= 0);
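+			/*
+			 * If no earlier lock with witness w was found (i < 0),
+			 * report just the held lock and the lock being
+			 * acquired; otherwise also report the earlier lock
+			 * with witness w so the already established order is
+			 * visible in the output.
+			 */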
+ if (i < 0) {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ lock1->li_lock->lo_type, lock1->li_file,
+ lock1->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, lock->lo_type, file, line);
+ } else {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock2->li_lock, lock2->li_lock->lo_name,
+ lock2->li_lock->lo_type, lock2->li_file,
+ lock2->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ lock1->li_lock->lo_type, lock1->li_file,
+ lock1->li_line);
+ printf(" 3rd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, lock->lo_type, file, line);
+ }
+#ifdef DDB
+ go_into_ddb = 1;
+#endif /* DDB */
+ goto out;
+ }
+ }
+ lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1];
+ /*
+ * Don't build a new relationship if we are locking Giant just
+ * after waking up and the previous lock in the list was acquired
+ * prior to blocking.
+ */
+ if (lock == &Giant.mtx_object && (lock1->li_flags & LI_SLEPT) != 0)
+ mtx_unlock_spin(&w_mtx);
+ else {
+ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
+ lock->lo_type, lock1->li_lock->lo_type);
+ if (!itismychild(lock1->li_lock->lo_witness, w))
+ mtx_unlock_spin(&w_mtx);
+ }
+
+out:
+#ifdef DDB
+ if (witness_ddb && go_into_ddb)
+ Debugger(__func__);
+#endif /* DDB */
+ w->w_file = file;
+ w->w_line = line;
+
+ lle = *lock_list;
+ if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
+ lle = witness_lock_list_get();
+ if (lle == NULL)
+ return;
+ lle->ll_next = *lock_list;
+ CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ *lock_list = lle;
+ }
+ lock1 = &lle->ll_children[lle->ll_count++];
+ lock1->li_lock = lock;
+ lock1->li_line = line;
+ lock1->li_file = file;
+ if ((flags & LOP_EXCLUSIVE) != 0)
+ lock1->li_flags = LI_EXCLUSIVE;
+ else
+ lock1->li_flags = 0;
+ CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
+ td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
+}
+
+void
+witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_dead || panicstr != NULL)
+ return;
+ class = lock->lo_class;
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ panic("upgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((flags & LOP_TRYLOCK) == 0)
+ panic("non-try upgrade of lock (%s) %s @ %s:%d", class->lc_name,
+ lock->lo_name, file, line);
+ if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ panic("upgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL)
+ panic("upgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0)
+ panic("upgrade of exclusive lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK, file, line);
+ instance->li_flags |= LI_EXCLUSIVE;
+}
+
+void
+witness_downgrade(struct lock_object *lock, int flags, const char *file,
+ int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_dead || panicstr != NULL)
+ return;
+ class = lock->lo_class;
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ panic("downgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ panic("downgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL)
+ panic("downgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0)
+ panic("downgrade of shared lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name, file, line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK, file, line);
+ instance->li_flags &= ~LI_EXCLUSIVE;
+}
+
+void
+witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *instance;
+ struct lock_class *class;
+ struct thread *td;
+ register_t s;
+ int i, j;
+
+ if (witness_cold || witness_dead || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+ td = curthread;
+ class = lock->lo_class;
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = &td->td_sleeplocks;
+ else
+ lock_list = PCPU_PTR(spinlocks);
+ for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
+ for (i = 0; i < (*lock_list)->ll_count; i++) {
+ instance = &(*lock_list)->ll_children[i];
+ if (instance->li_lock == lock) {
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf(
+ "shared unlock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ file, line);
+ printf(
+ "while exclusively locked from %s:%d\n",
+ instance->li_file,
+ instance->li_line);
+ panic("excl->ushare");
+ }
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf(
+ "exclusive unlock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ file, line);
+ printf(
+ "while share locked from %s:%d\n",
+ instance->li_file,
+ instance->li_line);
+ panic("share->uexcl");
+ }
+ /* If we are recursed, unrecurse. */
+ if ((instance->li_flags & LI_RECURSEMASK) > 0) {
+ CTR4(KTR_WITNESS,
+ "%s: pid %d unrecursed on %s r=%d", __func__,
+ td->td_proc->p_pid,
+ instance->li_lock->lo_name,
+ instance->li_flags);
+ instance->li_flags--;
+ return;
+ }
+ s = intr_disable();
+ CTR4(KTR_WITNESS,
+ "%s: pid %d removed %s from lle[%d]", __func__,
+ td->td_proc->p_pid,
+ instance->li_lock->lo_name,
+ (*lock_list)->ll_count - 1);
+ for (j = i; j < (*lock_list)->ll_count - 1; j++)
+ (*lock_list)->ll_children[j] =
+ (*lock_list)->ll_children[j + 1];
+ (*lock_list)->ll_count--;
+ intr_restore(s);
+ if ((*lock_list)->ll_count == 0) {
+ lle = *lock_list;
+ *lock_list = lle->ll_next;
+ CTR3(KTR_WITNESS,
+ "%s: pid %d removed lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ witness_lock_list_free(lle);
+ }
+ return;
+ }
+ }
+ panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name,
+ file, line);
+}
+
+/*
+ * Warn if any held locks are not sleepable. Note that Giant and the lock
+ * passed in are both special cases since they are both released during the
+ * sleep process and aren't actually held while the thread is asleep.
+ */
+int
+witness_sleep(int check_only, struct lock_object *lock, const char *file,
+ int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *lock1;
+ struct thread *td;
+ int i, n;
+
+ if (witness_cold || witness_dead || panicstr != NULL)
+ return (0);
+ n = 0;
+ td = curthread;
+ lock_list = &td->td_sleeplocks;
+again:
+ for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ lock1 = &lle->ll_children[i];
+ if (lock1->li_lock == lock ||
+ lock1->li_lock == &Giant.mtx_object)
+ continue;
+ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) {
+ if (check_only == 0) {
+ CTR3(KTR_WITNESS,
+ "pid %d: sleeping with lock (%s) %s held",
+ td->td_proc->p_pid,
+ lock1->li_lock->lo_class->lc_name,
+ lock1->li_lock->lo_name);
+ lock1->li_flags |= LI_SLEPT;
+ }
+ continue;
+ }
+ n++;
+ printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
+ file, line, check_only ? "could sleep" : "sleeping",
+ lock1->li_lock->lo_name, lock1->li_file,
+ lock1->li_line);
+ }
+ if (lock_list == &td->td_sleeplocks && PCPU_GET(spinlocks) != NULL) {
+ /*
+		 * Since we already hold a spinlock, preemption is
+ * already blocked.
+ */
+ lock_list = PCPU_PTR(spinlocks);
+ goto again;
+ }
+#ifdef DDB
+ if (witness_ddb && n)
+ Debugger(__func__);
+#endif /* DDB */
+ return (n);
+}
+
+static struct witness *
+enroll(const char *description, struct lock_class *lock_class)
+{
+ struct witness *w;
+
+ if (!witness_watch || witness_dead || panicstr != NULL)
+ return (NULL);
+ if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin)
+ return (NULL);
+ mtx_lock_spin(&w_mtx);
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ if (w->w_name == description || (w->w_refcount > 0 &&
+ strcmp(description, w->w_name) == 0)) {
+ w->w_refcount++;
+ mtx_unlock_spin(&w_mtx);
+ if (lock_class != w->w_class)
+ panic(
+ "lock (%s) %s does not match earlier (%s) lock",
+ description, lock_class->lc_name,
+ w->w_class->lc_name);
+ return (w);
+ }
+ }
+ /*
+ * This isn't quite right, as witness_cold is still 0 while we
+ * enroll all the locks initialized before witness_initialize().
+ */
+ if ((lock_class->lc_flags & LC_SPINLOCK) && !witness_cold) {
+ mtx_unlock_spin(&w_mtx);
+ panic("spin lock %s not in order list", description);
+ }
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ w->w_name = description;
+ w->w_class = lock_class;
+ w->w_refcount = 1;
+ STAILQ_INSERT_HEAD(&w_all, w, w_list);
+ if (lock_class->lc_flags & LC_SPINLOCK)
+ STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
+ else if (lock_class->lc_flags & LC_SLEEPLOCK)
+ STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
+ else {
+ mtx_unlock_spin(&w_mtx);
+ panic("lock class %s is not sleep or spin",
+ lock_class->lc_name);
+ }
+ mtx_unlock_spin(&w_mtx);
+ return (w);
+}
+
+static int
+itismychild(struct witness *parent, struct witness *child)
+{
+ static int recursed;
+ struct witness_child_list_entry **wcl;
+ struct witness_list *list;
+
+ MPASS(child != NULL && parent != NULL);
+ if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) !=
+ (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)))
+ panic(
+ "%s: parent (%s) and child (%s) are not the same lock type",
+ __func__, parent->w_class->lc_name,
+ child->w_class->lc_name);
+
+ /*
+ * Insert "child" after "parent"
+ */
+ wcl = &parent->w_children;
+ while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN)
+ wcl = &(*wcl)->wcl_next;
+ if (*wcl == NULL) {
+ *wcl = witness_child_get();
+ if (*wcl == NULL)
+ return (1);
+ }
+ (*wcl)->wcl_children[(*wcl)->wcl_count++] = child;
+
+ /*
+ * Now prune whole tree. We look for cases where a lock is now
+ * both a descendant and a direct child of a given lock. In that
+ * case, we want to remove the direct child link from the tree.
+ */
+ if (recursed)
+ return (0);
+ recursed = 1;
+ if (parent->w_class->lc_flags & LC_SLEEPLOCK)
+ list = &w_sleep;
+ else
+ list = &w_spin;
+ STAILQ_FOREACH(child, list, w_typelist) {
+ STAILQ_FOREACH(parent, list, w_typelist) {
+ if (!isitmychild(parent, child))
+ continue;
+ removechild(parent, child);
+ if (isitmydescendant(parent, child))
+ continue;
+ itismychild(parent, child);
+ }
+ }
+ recursed = 0;
+ witness_levelall();
+ return (0);
+}
+
+static void
+removechild(struct witness *parent, struct witness *child)
+{
+ struct witness_child_list_entry **wcl, *wcl1;
+ int i;
+
+ for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next)
+ for (i = 0; i < (*wcl)->wcl_count; i++)
+ if ((*wcl)->wcl_children[i] == child)
+ goto found;
+ return;
+found:
+ (*wcl)->wcl_count--;
+ if ((*wcl)->wcl_count > i)
+ (*wcl)->wcl_children[i] =
+ (*wcl)->wcl_children[(*wcl)->wcl_count];
+ MPASS((*wcl)->wcl_children[i] != NULL);
+ if ((*wcl)->wcl_count != 0)
+ return;
+ wcl1 = *wcl;
+ *wcl = wcl1->wcl_next;
+ witness_child_free(wcl1);
+}
+
+static int
+isitmychild(struct witness *parent, struct witness *child)
+{
+ struct witness_child_list_entry *wcl;
+ int i;
+
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) {
+ for (i = 0; i < wcl->wcl_count; i++) {
+ if (wcl->wcl_children[i] == child)
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
+isitmydescendant(struct witness *parent, struct witness *child)
+{
+ struct witness_child_list_entry *wcl;
+ int i, j;
+
+ if (isitmychild(parent, child))
+ return (1);
+ j = 0;
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) {
+ MPASS(j < 1000);
+ for (i = 0; i < wcl->wcl_count; i++) {
+ if (isitmydescendant(wcl->wcl_children[i], child))
+ return (1);
+ }
+ j++;
+ }
+ return (0);
+}
+
+void
+witness_levelall(void)
+{
+ struct witness_list *list;
+ struct witness *w, *w1;
+
+ /*
+ * First clear all levels.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ w->w_level = 0;
+ }
+
+ /*
+ * Look for locks with no parent and level all their descendants.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ /*
+		 * This is just an optimization; technically we could get
+		 * away with just walking the all list each time.
+ */
+ if (w->w_class->lc_flags & LC_SLEEPLOCK)
+ list = &w_sleep;
+ else
+ list = &w_spin;
+ STAILQ_FOREACH(w1, list, w_typelist) {
+ if (isitmychild(w1, w))
+ goto skip;
+ }
+ witness_leveldescendents(w, 0);
+ skip:
+ ; /* silence GCC 3.x */
+ }
+}
+
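+/*
+ * Recursively assign each descendant of "parent" a level at least one
+ * greater than the parent's, so that w_level reflects a witness's depth
+ * in the lock order tree.
+ */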
+static void
+witness_leveldescendents(struct witness *parent, int level)
+{
+ struct witness_child_list_entry *wcl;
+ int i;
+
+ if (parent->w_level < level)
+ parent->w_level = level;
+ level++;
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+ for (i = 0; i < wcl->wcl_count; i++)
+ witness_leveldescendents(wcl->wcl_children[i], level);
+}
+
+static void
+witness_displaydescendants(void(*prnt)(const char *fmt, ...),
+ struct witness *parent)
+{
+ struct witness_child_list_entry *wcl;
+ int i, level;
+
+ level = parent->w_level;
+ prnt("%-2d", level);
+ for (i = 0; i < level; i++)
+ prnt(" ");
+ if (parent->w_refcount > 0) {
+ prnt("%s", parent->w_name);
+ if (parent->w_file != NULL)
+ prnt(" -- last acquired @ %s:%d\n", parent->w_file,
+ parent->w_line);
+ } else
+ prnt("(dead)\n");
+ for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+ for (i = 0; i < wcl->wcl_count; i++)
+ witness_displaydescendants(prnt,
+ wcl->wcl_children[i]);
+}
+
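+/*
+ * Return non-zero if the pair (w1, w2) appears, in either order, in the
+ * static blessed_list, meaning this particular lock order reversal is
+ * known to be harmless and should not be reported.
+ */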
+static int
+blessed(struct witness *w1, struct witness *w2)
+{
+ int i;
+ struct witness_blessed *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_name, b->b_lock1) == 0) {
+ if (strcmp(w2->w_name, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_name, b->b_lock2) == 0)
+ if (strcmp(w2->w_name, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+
+static struct witness *
+witness_get(void)
+{
+ struct witness *w;
+
+ if (witness_dead) {
+ mtx_unlock_spin(&w_mtx);
+ return (NULL);
+ }
+ if (STAILQ_EMPTY(&w_free)) {
+ witness_dead = 1;
+ mtx_unlock_spin(&w_mtx);
+ printf("%s: witness exhausted\n", __func__);
+ return (NULL);
+ }
+ w = STAILQ_FIRST(&w_free);
+ STAILQ_REMOVE_HEAD(&w_free, w_list);
+ bzero(w, sizeof(*w));
+ return (w);
+}
+
+static void
+witness_free(struct witness *w)
+{
+
+ STAILQ_INSERT_HEAD(&w_free, w, w_list);
+}
+
+static struct witness_child_list_entry *
+witness_child_get(void)
+{
+ struct witness_child_list_entry *wcl;
+
+ if (witness_dead) {
+ mtx_unlock_spin(&w_mtx);
+ return (NULL);
+ }
+ wcl = w_child_free;
+ if (wcl == NULL) {
+ witness_dead = 1;
+ mtx_unlock_spin(&w_mtx);
+ printf("%s: witness exhausted\n", __func__);
+ return (NULL);
+ }
+ w_child_free = wcl->wcl_next;
+ bzero(wcl, sizeof(*wcl));
+ return (wcl);
+}
+
+static void
+witness_child_free(struct witness_child_list_entry *wcl)
+{
+
+ wcl->wcl_next = w_child_free;
+ w_child_free = wcl;
+}
+
+static struct lock_list_entry *
+witness_lock_list_get(void)
+{
+ struct lock_list_entry *lle;
+
+ if (witness_dead)
+ return (NULL);
+ mtx_lock_spin(&w_mtx);
+ lle = w_lock_list_free;
+ if (lle == NULL) {
+ witness_dead = 1;
+ mtx_unlock_spin(&w_mtx);
+ printf("%s: witness exhausted\n", __func__);
+ return (NULL);
+ }
+ w_lock_list_free = lle->ll_next;
+ mtx_unlock_spin(&w_mtx);
+ bzero(lle, sizeof(*lle));
+ return (lle);
+}
+
+static void
+witness_lock_list_free(struct lock_list_entry *lle)
+{
+
+ mtx_lock_spin(&w_mtx);
+ lle->ll_next = w_lock_list_free;
+ w_lock_list_free = lle;
+ mtx_unlock_spin(&w_mtx);
+}
+
+static struct lock_instance *
+find_instance(struct lock_list_entry *lock_list, struct lock_object *lock)
+{
+ struct lock_list_entry *lle;
+ struct lock_instance *instance;
+ int i;
+
+ for (lle = lock_list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ instance = &lle->ll_children[i];
+ if (instance->li_lock == lock)
+ return (instance);
+ }
+ return (NULL);
+}
+
+int
+witness_list_locks(struct lock_list_entry **lock_list)
+{
+ struct lock_list_entry *lle;
+ struct lock_instance *instance;
+ struct lock_object *lock;
+ int i, nheld;
+
+ nheld = 0;
+ for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ instance = &lle->ll_children[i];
+ lock = instance->li_lock;
+ printf("%s %s %s",
+ (instance->li_flags & LI_EXCLUSIVE) != 0 ?
+ "exclusive" : "shared",
+ lock->lo_class->lc_name, lock->lo_name);
+ if (lock->lo_type != lock->lo_name)
+ printf(" (%s)", lock->lo_type);
+ printf(" r = %d (%p) locked @ %s:%d\n",
+ instance->li_flags & LI_RECURSEMASK, lock,
+ instance->li_file, instance->li_line);
+ nheld++;
+ }
+ return (nheld);
+}
+
+/*
+ * Calling this on td != curthread is bad unless we are in ddb.
+ */
+int
+witness_list(struct thread *td)
+{
+ int nheld;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+#ifdef DDB
+ KASSERT(td == curthread || db_active,
+ ("%s: td != curthread and we aren't in the debugger", __func__));
+ if (!db_active && witness_dead)
+ return (0);
+#else
+	KASSERT(td == curthread, ("%s: td != curthread", __func__));
+ if (witness_dead)
+ return (0);
+#endif
+ nheld = witness_list_locks(&td->td_sleeplocks);
+
+ /*
+ * We only handle spinlocks if td == curthread. This is somewhat broken
+ * if td is currently executing on some other CPU and holds spin locks
+ * as we won't display those locks. If we had a MI way of getting
+ * the per-cpu data for a given cpu then we could use
+ * td->td_kse->ke_oncpu to get the list of spinlocks for this thread
+ * and "fix" this.
+ *
+ * That still wouldn't really fix this unless we locked sched_lock
+ * or stopped the other CPU to make sure it wasn't changing the list
+ * out from under us. It is probably best to just not try to handle
+ * threads on other CPU's for now.
+ */
+ if (td == curthread && PCPU_GET(spinlocks) != NULL)
+ nheld += witness_list_locks(PCPU_PTR(spinlocks));
+
+ return (nheld);
+}
+
+void
+witness_save(struct lock_object *lock, const char **filep, int *linep)
+{
+ struct lock_instance *instance;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_dead || panicstr != NULL)
+ return;
+ if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ panic("%s: lock (%s) %s is not a sleep lock", __func__,
+ lock->lo_class->lc_name, lock->lo_name);
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL)
+ panic("%s: lock (%s) %s not locked", __func__,
+ lock->lo_class->lc_name, lock->lo_name);
+ *filep = instance->li_file;
+ *linep = instance->li_line;
+}
+
+void
+witness_restore(struct lock_object *lock, const char *file, int line)
+{
+ struct lock_instance *instance;
+
+ KASSERT(!witness_cold, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_dead || panicstr != NULL)
+ return;
+ if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+ panic("%s: lock (%s) %s is not a sleep lock", __func__,
+ lock->lo_class->lc_name, lock->lo_name);
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL)
+ panic("%s: lock (%s) %s not locked", __func__,
+ lock->lo_class->lc_name, lock->lo_name);
+ lock->lo_witness->w_file = file;
+ lock->lo_witness->w_line = line;
+ instance->li_file = file;
+ instance->li_line = line;
+}
+
+void
+witness_assert(struct lock_object *lock, int flags, const char *file, int line)
+{
+#ifdef INVARIANT_SUPPORT
+ struct lock_instance *instance;
+
+ if (lock->lo_witness == NULL || witness_dead || panicstr != NULL)
+ return;
+ if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) != 0)
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ else if ((lock->lo_class->lc_flags & LC_SPINLOCK) != 0)
+ instance = find_instance(PCPU_GET(spinlocks), lock);
+ else {
+ panic("Lock (%s) %s is not sleep or spin!",
+ lock->lo_class->lc_name, lock->lo_name);
+ return;
+ }
+ switch (flags) {
+ case LA_UNLOCKED:
+ if (instance != NULL)
+ panic("Lock (%s) %s locked @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ break;
+ case LA_LOCKED:
+ case LA_LOCKED | LA_RECURSED:
+ case LA_LOCKED | LA_NOTRECURSED:
+ case LA_SLOCKED:
+ case LA_SLOCKED | LA_RECURSED:
+ case LA_SLOCKED | LA_NOTRECURSED:
+ case LA_XLOCKED:
+ case LA_XLOCKED | LA_RECURSED:
+ case LA_XLOCKED | LA_NOTRECURSED:
+ if (instance == NULL) {
+ panic("Lock (%s) %s not locked @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ break;
+ }
+ if ((flags & LA_XLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) == 0)
+ panic("Lock (%s) %s not exclusively locked @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ if ((flags & LA_SLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) != 0)
+ panic("Lock (%s) %s exclusively locked @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ if ((flags & LA_RECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) == 0)
+ panic("Lock (%s) %s not recursed @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ if ((flags & LA_NOTRECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) != 0)
+ panic("Lock (%s) %s recursed @ %s:%d.",
+ lock->lo_class->lc_name, lock->lo_name, file, line);
+ break;
+ default:
+ panic("Invalid lock assertion at %s:%d.", file, line);
+
+ }
+#endif /* INVARIANT_SUPPORT */
+}
+
+#ifdef DDB
+
+DB_SHOW_COMMAND(locks, db_witness_list)
+{
+ struct thread *td;
+ pid_t pid;
+ struct proc *p;
+
+ if (have_addr) {
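+		/*
+		 * Interpret the address typed by the user as a decimal pid:
+		 * each hex nibble of the ddb address is treated as one
+		 * decimal digit.
+		 */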
+ pid = (addr % 16) + ((addr >> 4) % 16) * 10 +
+ ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 +
+ ((addr >> 16) % 16) * 10000;
+ /* sx_slock(&allproc_lock); */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_pid == pid)
+ break;
+ }
+ /* sx_sunlock(&allproc_lock); */
+ if (p == NULL) {
+ db_printf("pid %d not found\n", pid);
+ return;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ witness_list(td);
+ }
+ } else {
+ td = curthread;
+ witness_list(td);
+ }
+}
+
+DB_SHOW_COMMAND(witness, db_witness_display)
+{
+
+ witness_display(db_printf);
+}
+#endif
diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c
new file mode 100644
index 0000000..c9d2676
--- /dev/null
+++ b/sys/kern/subr_xxx.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+/*
+ * Miscellaneous trivial functions.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * Return error for operation not supported
+ * on a specific object or file type.
+ */
+int
+eopnotsupp()
+{
+
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Generic null operation, always returns success.
+ */
+int
+nullop()
+{
+
+ return (0);
+}
+
+#include <sys/conf.h>
+
+/*
+ * Unsupported devswitch functions (e.g., for writing to a read-only device).
+ * XXX may belong elsewhere.
+ */
+
+int
+noopen(dev, flags, fmt, td)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct thread *td;
+{
+
+ return (ENODEV);
+}
+
+int
+noclose(dev, flags, fmt, td)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct thread *td;
+{
+
+ return (ENODEV);
+}
+
+int
+noread(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+nowrite(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+noioctl(dev, cmd, data, flags, td)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flags;
+ struct thread *td;
+{
+
+ return (ENODEV);
+}
+
+int
+nokqfilter(dev, kn)
+ dev_t dev;
+ struct knote *kn;
+{
+
+ return (ENODEV);
+}
+
+int
+nommap(dev, offset, nprot)
+ dev_t dev;
+ vm_offset_t offset;
+ int nprot;
+{
+
+ /* Don't return ENODEV. That would allow mapping address ENODEV! */
+ return (-1);
+}
+
+int
+nodump(dev_t dev, void *virtual __unused, vm_offset_t physical __unused, off_t offset __unused, size_t length __unused)
+{
+
+ return (ENODEV);
+}
+
+/*
+ * Null devswitch functions (for when the operation always succeeds).
+ * XXX may belong elsewhere.
+ * XXX not all are here (e.g., seltrue() isn't).
+ */
+
+/*
+ * XXX this is probably bogus. Any device that uses it isn't checking the
+ * minor number.
+ */
+int
+nullopen(dev, flags, fmt, td)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct thread *td;
+{
+
+ return (0);
+}
+
+int
+nullclose(dev, flags, fmt, td)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct thread *td;
+{
+
+ return (0);
+}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 0000000..1bdd913
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#ifdef __alpha__
+#include <sys/disklabel.h>
+#endif
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/limits.h>
+
+static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
+static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
+MALLOC_DEFINE(M_IOV, "iov", "large iov's");
+
+static int pollscan(struct thread *, struct pollfd *, u_int);
+static int selscan(struct thread *, fd_mask **, fd_mask **, int);
+static int dofileread(struct thread *, struct file *, int, void *,
+ size_t, off_t, int);
+static int dofilewrite(struct thread *, struct file *, int,
+ const void *, size_t, off_t, int);
+
+/*
+ * Read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+read(td, uap)
+ struct thread *td;
+ struct read_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_read(td, uap->fd, &fp)) == 0) {
+ error = dofileread(td, fp, uap->fd, uap->buf,
+ uap->nbyte, (off_t)-1, 0);
+ fdrop(fp, td);
+ }
+ return(error);
+}
+
+/*
+ * Pread system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pread_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+pread(td, uap)
+ struct thread *td;
+ struct pread_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_read(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ error = ESPIPE;
+ } else {
+ error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
+ uap->offset, FOF_OFFSET);
+ }
+ fdrop(fp, td);
+ return(error);
+}
+
+/*
+ * Code common for read and pread
+ */
+int
+dofileread(td, fp, fd, buf, nbyte, offset, flags)
+ struct thread *td;
+ struct file *fp;
+ int fd, flags;
+ void *buf;
+ size_t nbyte;
+ off_t offset;
+{
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+ struct uio ktruio;
+ int didktr = 0;
+#endif
+
+ aiov.iov_base = (caddr_t)buf;
+ aiov.iov_len = nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ if (nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = nbyte;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov = aiov;
+ ktruio = auio;
+ didktr = 1;
+ }
+#endif
+ cnt = nbyte;
+
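+	/*
+	 * If the read is interrupted after transferring some data, clear
+	 * the error and report the partial count instead.
+	 */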
+ if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (didktr && error == 0) {
+ ktruio.uio_iov = &ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(fd, UIO_READ, &ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Scatter read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+readv(td, uap)
+ struct thread *td;
+ struct readv_args *uap;
+{
+ struct file *fp;
+ struct uio auio;
+ struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt;
+ int error;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+#endif
+
+ if ((error = fget_read(td, uap->fd, &fp)) != 0)
+ return (error);
+ needfree = NULL;
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV) {
+ error = EINVAL;
+ goto done;
+ }
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else
+ iov = aiov;
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ cnt = auio.uio_resid;
+ if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(uap->fd, UIO_READ, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ td->td_retval[0] = cnt;
+done:
+ fdrop(fp, td);
+ if (needfree)
+ FREE(needfree, M_IOV);
+ return (error);
+}
+
+/*
+ * Write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+write(td, uap)
+ struct thread *td;
+ struct write_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_write(td, uap->fd, &fp)) == 0) {
+ error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
+ (off_t)-1, 0);
+ fdrop(fp, td);
+ } else {
+ error = EBADF; /* XXX this can't be right */
+ }
+ return(error);
+}
+
+/*
+ * Pwrite system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwrite_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+pwrite(td, uap)
+ struct thread *td;
+ struct pwrite_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_write(td, uap->fd, &fp)) == 0) {
+ if (fp->f_type == DTYPE_VNODE) {
+ error = dofilewrite(td, fp, uap->fd, uap->buf,
+ uap->nbyte, uap->offset, FOF_OFFSET);
+ } else {
+ error = ESPIPE;
+ }
+ fdrop(fp, td);
+ } else {
+ error = EBADF; /* this can't be right */
+ }
+ return(error);
+}
+
+static int
+dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
+ struct thread *td;
+ struct file *fp;
+ int fd, flags;
+ const void *buf;
+ size_t nbyte;
+ off_t offset;
+{
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+ struct uio ktruio;
+ int didktr = 0;
+#endif
+
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ if (nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = nbyte;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec and uio
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov = aiov;
+ ktruio = auio;
+ didktr = 1;
+ }
+#endif
+ cnt = nbyte;
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Socket layer is responsible for issuing SIGPIPE. */
+ if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (didktr && error == 0) {
+ ktruio.uio_iov = &ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(fd, UIO_WRITE, &ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Gather write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+writev(td, uap)
+ struct thread *td;
+ register struct writev_args *uap;
+{
+ struct file *fp;
+ struct uio auio;
+ register struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt, error = 0;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+#endif
+
+ mtx_lock(&Giant);
+ if ((error = fget_write(td, uap->fd, &fp)) != 0) {
+ error = EBADF;
+ goto done2;
+ }
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV) {
+ needfree = NULL;
+ error = EINVAL;
+ goto done;
+ }
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec and uio
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ cnt = auio.uio_resid;
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (error == EPIPE) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ td->td_retval[0] = cnt;
+done:
+ fdrop(fp, td);
+ if (needfree)
+ FREE(needfree, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Ioctl system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ u_long com;
+ caddr_t data;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ioctl(td, uap)
+ struct thread *td;
+ register struct ioctl_args *uap;
+{
+ struct file *fp;
+ register struct filedesc *fdp;
+ register u_long com;
+ int error = 0;
+ register u_int size;
+ caddr_t data, memp;
+ int tmp;
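+	/*
+	 * Small ioctl argument data is staged in this aligned on-stack
+	 * buffer; anything larger (up to IOCPARM_MAX) is malloc'd below.
+	 */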
+#define STK_PARAMS 128
+ union {
+ char stkbuf[STK_PARAMS];
+ long align;
+ } ubuf;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ mtx_lock(&Giant);
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (EBADF);
+ }
+ fdp = td->td_proc->p_fd;
+ switch (com = uap->com) {
+ case FIONCLEX:
+ FILEDESC_LOCK(fdp);
+ fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (0);
+ case FIOCLEX:
+ FILEDESC_LOCK(fdp);
+ fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (0);
+ }
+
+ /*
+ * Interpret high order word to find amount of data to be
+ * copied to/from the user's address space.
+ */
+ size = IOCPARM_LEN(com);
+ if (size > IOCPARM_MAX) {
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (ENOTTY);
+ }
+
+ memp = NULL;
+ if (size > sizeof (ubuf.stkbuf)) {
+ memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ data = memp;
+ } else {
+ data = ubuf.stkbuf;
+ }
+ if (com&IOC_IN) {
+ if (size) {
+ error = copyin(uap->data, data, (u_int)size);
+ if (error) {
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ fdrop(fp, td);
+ goto done;
+ }
+ } else {
+ *(caddr_t *)data = uap->data;
+ }
+ } else if ((com&IOC_OUT) && size) {
+ /*
+ * Zero the buffer so the user always
+ * gets back something deterministic.
+ */
+ bzero(data, size);
+ } else if (com&IOC_VOID) {
+ *(caddr_t *)data = uap->data;
+ }
+
+ switch (com) {
+
+ case FIONBIO:
+ FILE_LOCK(fp);
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FNONBLOCK;
+ else
+ fp->f_flag &= ~FNONBLOCK;
+ FILE_UNLOCK(fp);
+ error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
+ break;
+
+ case FIOASYNC:
+ FILE_LOCK(fp);
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FASYNC;
+ else
+ fp->f_flag &= ~FASYNC;
+ FILE_UNLOCK(fp);
+ error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
+ break;
+
+ default:
+ error = fo_ioctl(fp, com, data, td);
+ /*
+ * Copy any data to user, size was
+ * already set and checked above.
+ */
+ if (error == 0 && (com&IOC_OUT) && size)
+ error = copyout(data, uap->data, (u_int)size);
+ break;
+ }
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ fdrop(fp, td);
+done:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * sellock and selwait are initialized in selectinit() via SYSINIT.
+ */
+struct mtx sellock;
+struct cv selwait;
+u_int nselcoll; /* Select collisions since boot */
+SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
+
+/*
+ * Select system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+select(td, uap)
+ register struct thread *td;
+ register struct select_args *uap;
+{
+ struct filedesc *fdp;
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
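+	/*
+	 * (Sizing check: 2048 bits is 256 bytes.  One non-null fd_set of 1024
+	 * descriptors needs ncpbytes = 1024 / NBBY = 128 bytes below, and each
+	 * set gets both an input and an output copy, so a single set uses
+	 * 2 * 128 = 256 bytes; with the old FD_SETSIZE of 256, all three sets
+	 * use 3 * 2 * 32 = 192 bytes.)
+	 */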
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
+ struct timeval atv, rtv, ttv;
+ int error, timo;
+ u_int ncoll, nbufbytes, ncpbytes, nfdbits;
+
+ if (uap->nd < 0)
+ return (EINVAL);
+ fdp = td->td_proc->p_fd;
+ mtx_lock(&Giant);
+ FILEDESC_LOCK(fdp);
+
+ if (uap->nd > td->td_proc->p_fd->fd_nfiles)
+ uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
+ FILEDESC_UNLOCK(fdp);
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(uap->nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ nbufbytes = 0;
+ if (uap->in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
+
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
+ sbp = selbits;
+#define getbits(name, x) \
+ do { \
+ if (uap->name == NULL) \
+ ibits[x] = NULL; \
+ else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(uap->name, ibits[x], ncpbytes); \
+ if (error != 0) \
+ goto done_nosellock; \
+ } \
+ } while (0)
+ getbits(in, 0);
+ getbits(ou, 1);
+ getbits(ex, 2);
+#undef getbits
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
+
+ if (uap->tv) {
+ error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
+ sizeof (atv));
+ if (error)
+ goto done_nosellock;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done_nosellock;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else {
+ atv.tv_sec = 0;
+ atv.tv_usec = 0;
+ }
+ timo = 0;
+ mtx_lock(&sellock);
+retry:
+ ncoll = nselcoll;
+ mtx_lock_spin(&sched_lock);
+ td->td_flags |= TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+ /* XXX Is there a better place for this? */
+ TAILQ_INIT(&td->td_selq);
+ error = selscan(td, ibits, obits, uap->nd);
+ mtx_lock(&sellock);
+ if (error || td->td_retval[0])
+ goto done;
+ if (atv.tv_sec || atv.tv_usec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+
+ /*
+ * An event of interest may occur while we do not hold
+ * sellock, so check TDF_SELECT and the number of
+ * collisions and rescan the file descriptors if
+ * necessary.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
+ mtx_unlock_spin(&sched_lock);
+ goto retry;
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (timo > 0)
+ error = cv_timedwait_sig(&selwait, &sellock, timo);
+ else
+ error = cv_wait_sig(&selwait, &sellock);
+
+ if (error == 0)
+ goto retry;
+
+done:
+ clear_selinfo_list(td);
+ mtx_lock_spin(&sched_lock);
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+done_nosellock:
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+#define putbits(name, x) \
+ if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
+ error = error2;
+ if (error == 0) {
+ int error2;
+
+ putbits(in, 0);
+ putbits(ou, 1);
+ putbits(ex, 2);
+#undef putbits
+ }
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
+
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+selscan(td, ibits, obits, nfd)
+ struct thread *td;
+ fd_mask **ibits, **obits;
+ int nfd;
+{
+ int msk, i, fd;
+ fd_mask bits;
+ struct file *fp;
+ int n = 0;
+ /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
+ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
+ struct filedesc *fdp = td->td_proc->p_fd;
+
+ FILEDESC_LOCK(fdp);
+ for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
+ for (i = 0; i < nfd; i += NFDBITS) {
+ bits = ibits[msk][i/NFDBITS];
+ /* ffs(int mask) not portable, fd_mask is long */
+ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
+ if (!(bits & 1))
+ continue;
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+ if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
+ obits[msk][(fd)/NFDBITS] |=
+ ((fd_mask)1 << ((fd) % NFDBITS));
+ n++;
+ }
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * Poll system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+poll(td, uap)
+ struct thread *td;
+ struct poll_args *uap;
+{
+ caddr_t bits;
+ char smallbits[32 * sizeof(struct pollfd)];
+ struct timeval atv, rtv, ttv;
+ int error = 0, timo;
+ u_int ncoll, nfds;
+ size_t ni;
+
+ nfds = SCARG(uap, nfds);
+
+ mtx_lock(&Giant);
+ /*
+ * This is kinda bogus. We have fd limits, but that is not
+ * really related to the size of the pollfd array. Make sure
+ * we let the process use at least FD_SETSIZE entries and at
+ * least enough for the current limits. We want to be reasonably
+ * safe, but not overly restrictive.
+ */
+ if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
+ (nfds > FD_SETSIZE)) {
+ error = EINVAL;
+ goto done2;
+ }
+ ni = nfds * sizeof(struct pollfd);
+ if (ni > sizeof(smallbits))
+ bits = malloc(ni, M_TEMP, M_WAITOK);
+ else
+ bits = smallbits;
+ error = copyin(SCARG(uap, fds), bits, ni);
+ if (error)
+ goto done_nosellock;
+ if (SCARG(uap, timeout) != INFTIM) {
+ atv.tv_sec = SCARG(uap, timeout) / 1000;
+ atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done_nosellock;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else {
+ atv.tv_sec = 0;
+ atv.tv_usec = 0;
+ }
+ timo = 0;
+ mtx_lock(&sellock);
+retry:
+ ncoll = nselcoll;
+ mtx_lock_spin(&sched_lock);
+ td->td_flags |= TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+ /* XXX Is there a better place for this? */
+ TAILQ_INIT(&td->td_selq);
+ error = pollscan(td, (struct pollfd *)bits, nfds);
+ mtx_lock(&sellock);
+ if (error || td->td_retval[0])
+ goto done;
+ if (atv.tv_sec || atv.tv_usec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+ /*
+ * An event of interest may occur while we do not hold
+ * sellock, so check TDF_SELECT and the number of collisions
+ * and rescan the file descriptors if necessary.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
+ mtx_unlock_spin(&sched_lock);
+ goto retry;
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (timo > 0)
+ error = cv_timedwait_sig(&selwait, &sellock, timo);
+ else
+ error = cv_wait_sig(&selwait, &sellock);
+
+ if (error == 0)
+ goto retry;
+
+done:
+ clear_selinfo_list(td);
+ mtx_lock_spin(&sched_lock);
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+done_nosellock:
+ /* poll is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ if (error == 0) {
+ error = copyout(bits, SCARG(uap, fds), ni);
+ if (error)
+ goto out;
+ }
+out:
+ if (ni > sizeof(smallbits))
+ free(bits, M_TEMP);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+pollscan(td, fds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ u_int nfd;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int i;
+ struct file *fp;
+ int n = 0;
+
+ FILEDESC_LOCK(fdp);
+ for (i = 0; i < nfd; i++, fds++) {
+ if (fds->fd >= fdp->fd_nfiles) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else if (fds->fd < 0) {
+ fds->revents = 0;
+ } else {
+ fp = fdp->fd_ofiles[fds->fd];
+ if (fp == NULL) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else {
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ fds->revents = fo_poll(fp, fds->events,
+ fp->f_cred, td);
+ if (fds->revents != 0)
+ n++;
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ * XXX this isn't quite a true representation; OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+openbsd_poll(td, uap)
+ register struct thread *td;
+ register struct openbsd_poll_args *uap;
+{
+ return (poll(td, (struct poll_args *)uap));
+}
+
+/*
+ * Remove the references to the thread from all of the objects
+ * we were polling.
+ *
+ * This code assumes that the underlying owner of the selinfo
+ * structure will hold sellock before it changes it, and that
+ * it will unlink itself from our list if it goes away.
+ */
+void
+clear_selinfo_list(td)
+ struct thread *td;
+{
+ struct selinfo *si;
+
+ mtx_assert(&sellock, MA_OWNED);
+ TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
+ si->si_thread = NULL;
+ TAILQ_INIT(&td->td_selq);
+}
+
+/*ARGSUSED*/
+int
+seltrue(dev, events, td)
+ dev_t dev;
+ int events;
+ struct thread *td;
+{
+
+ return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Record a select request.
+ */
+void
+selrecord(selector, sip)
+ struct thread *selector;
+ struct selinfo *sip;
+{
+
+ mtx_lock(&sellock);
+ /*
+	 * If the selinfo is not yet owned by a thread, take ownership of it.
+	 * If it is owned by some other thread, we have a collision; otherwise
+	 * we already own it from a previous selrecord() on this selinfo, so
+	 * leave it alone.
+ */
+ if (sip->si_thread == NULL) {
+ sip->si_thread = selector;
+ TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
+ } else if (sip->si_thread != selector) {
+ sip->si_flags |= SI_COLL;
+ }
+
+ mtx_unlock(&sellock);
+}
+
+/*
+ * Do a wakeup when a selectable event occurs.
+ */
+void
+selwakeup(sip)
+ struct selinfo *sip;
+{
+ struct thread *td;
+
+ mtx_lock(&sellock);
+ td = sip->si_thread;
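+	/*
+	 * If more than one thread selected on this selinfo, we cannot tell
+	 * which one the event is for, so bump the collision count and wake
+	 * all waiters; each will rescan its descriptors.
+	 */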
+ if ((sip->si_flags & SI_COLL) != 0) {
+ nselcoll++;
+ sip->si_flags &= ~SI_COLL;
+ cv_broadcast(&selwait);
+ }
+ if (td == NULL) {
+ mtx_unlock(&sellock);
+ return;
+ }
+ TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
+ sip->si_thread = NULL;
+ mtx_lock_spin(&sched_lock);
+ if (td->td_wchan == (caddr_t)&selwait) {
+ if (td->td_proc->p_stat == SSLEEP)
+ setrunnable(td);
+ else
+ cv_waitq_remove(td);
+ } else
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+}
+
+static void selectinit(void *);
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
+
+/* ARGSUSED*/
+static void
+selectinit(dummy)
+ void *dummy;
+{
+ cv_init(&selwait, "select");
+ mtx_init(&sellock, "sellck", NULL, MTX_DEF);
+}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
new file mode 100644
index 0000000..11ab6d1
--- /dev/null
+++ b/sys/kern/sys_pipe.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 1996 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This file contains a high-performance replacement for the socket-based
+ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
+ * all features of sockets, but does do everything that pipes normally
+ * do.
+ */
+
+/*
+ * This code has two modes of operation, a small write mode and a large
+ * write mode. The small write mode acts like conventional pipes with
+ * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
+ * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
+ * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
+ * the receiving process can copy it directly from the pages in the sending
+ * process.
+ *
+ * If the sending process receives a signal, it is possible that it will
+ * go away, and certainly its address space can change, because control
+ * is returned back to the user-mode side. In that case, the pipe code
+ * arranges to copy the buffer supplied by the user process, to a pageable
+ * kernel buffer, and the receiving process will grab the data from the
+ * pageable kernel buffer. Since signals don't happen all that often,
+ * the copy operation is normally eliminated.
+ *
+ * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
+ * happen for small transfers so that the system will not spend all of
+ * its time context switching. PIPE_SIZE is constrained by the
+ * amount of kernel virtual memory.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ttycom.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/event.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * Use this define if you want to disable *fancy* VM things. Expect an
+ * approx 30% decrease in transfer rate. This could be useful for
+ * NetBSD or OpenBSD.
+ */
+/* #define PIPE_NODIRECT */
+
+/*
+ * interfaces to the outside world
+ */
+static int pipe_read(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int pipe_write(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int pipe_close(struct file *fp, struct thread *td);
+static int pipe_poll(struct file *fp, int events, struct ucred *cred,
+ struct thread *td);
+static int pipe_kqfilter(struct file *fp, struct knote *kn);
+static int pipe_stat(struct file *fp, struct stat *sb, struct thread *td);
+static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td);
+
+static struct fileops pipeops = {
+ pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
+ pipe_stat, pipe_close
+};
+
+static void filt_pipedetach(struct knote *kn);
+static int filt_piperead(struct knote *kn, long hint);
+static int filt_pipewrite(struct knote *kn, long hint);
+
+static struct filterops pipe_rfiltops =
+ { 1, NULL, filt_pipedetach, filt_piperead };
+static struct filterops pipe_wfiltops =
+ { 1, NULL, filt_pipedetach, filt_pipewrite };
+
+#define PIPE_GET_GIANT(pipe) \
+ do { \
+ KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0, \
+ ("%s:%d PIPE_GET_GIANT: line pipe not locked", \
+ __FILE__, __LINE__)); \
+ PIPE_UNLOCK(pipe); \
+ mtx_lock(&Giant); \
+ } while (0)
+
+#define PIPE_DROP_GIANT(pipe) \
+ do { \
+ mtx_unlock(&Giant); \
+ PIPE_LOCK(pipe); \
+ } while (0)
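+
+/*
+ * Usage sketch (editorial; mirrors pipe_direct_write() below): callers
+ * take the pipe I/O lock, then trade the pipe mutex for Giant around
+ * VM operations, e.g.
+ *
+ *	pipelock(wpipe, 0);
+ *	PIPE_GET_GIANT(wpipe);
+ *	error = pipe_build_write_buffer(wpipe, uio);
+ *	PIPE_DROP_GIANT(wpipe);
+ *	pipeunlock(wpipe);
+ */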
+
+/*
+ * Default pipe buffer size(s); these can be kind-of large now because pipe
+ * space is pageable. The pipe code will try to maintain locality of
+ * reference for performance reasons, so small amounts of outstanding I/O
+ * will not wipe the cache.
+ */
+#define MINPIPESIZE (PIPE_SIZE/3)
+#define MAXPIPESIZE (2*PIPE_SIZE/3)
+
+/*
+ * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
+ * is there so that on large systems, we don't exhaust it.
+ */
+#define MAXPIPEKVA (8*1024*1024)
+
+/*
+ * Limit for direct transfers; we cannot, of course, limit
+ * the amount of kva for pipes in general.
+ */
+#define LIMITPIPEKVA (16*1024*1024)
+
+/*
+ * Limit the number of "big" pipes
+ */
+#define LIMITBIGPIPES 32
+static int nbigpipe;
+
+static int amountpipekva;
+
+static void pipeinit(void *dummy __unused);
+static void pipeclose(struct pipe *cpipe);
+static void pipe_free_kmem(struct pipe *cpipe);
+static int pipe_create(struct pipe **cpipep);
+static __inline int pipelock(struct pipe *cpipe, int catch);
+static __inline void pipeunlock(struct pipe *cpipe);
+static __inline void pipeselwakeup(struct pipe *cpipe);
+#ifndef PIPE_NODIRECT
+static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
+static void pipe_destroy_write_buffer(struct pipe *wpipe);
+static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
+static void pipe_clone_write_buffer(struct pipe *wpipe);
+#endif
+static int pipespace(struct pipe *cpipe, int size);
+
+static uma_zone_t pipe_zone;
+
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
+
+static void
+pipeinit(void *dummy __unused)
+{
+ pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes
+ */
+
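+/*
+ * Illustrative only (editor's sketch, not kernel code): from userland
+ * the libc wrapper places the two returned descriptors in an int
+ * array, read end first:
+ *
+ *	int fd[2];
+ *	char c;
+ *
+ *	if (pipe(fd) == 0) {
+ *		(void)write(fd[1], "x", 1);
+ *		(void)read(fd[0], &c, 1);
+ *	}
+ */
+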
+/* ARGSUSED */
+int
+pipe(td, uap)
+ struct thread *td;
+ struct pipe_args /* {
+ int dummy;
+ } */ *uap;
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *rf, *wf;
+ struct pipe *rpipe, *wpipe;
+ struct mtx *pmtx;
+ int fd, error;
+
+ KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
+
+ pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
+
+ rpipe = wpipe = NULL;
+ if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ free(pmtx, M_TEMP);
+ return (ENFILE);
+ }
+
+ rpipe->pipe_state |= PIPE_DIRECTOK;
+ wpipe->pipe_state |= PIPE_DIRECTOK;
+
+ error = falloc(td, &rf, &fd);
+ if (error) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ free(pmtx, M_TEMP);
+ return (error);
+ }
+ fhold(rf);
+ td->td_retval[0] = fd;
+
+ /*
+ * Warning: once we've gotten past allocation of the fd for the
+ * read-side, we can only drop the read side via fdrop() in order
+ * to avoid races against processes which manage to dup() the read
+ * side while we are blocked trying to allocate the write side.
+ */
+ FILE_LOCK(rf);
+ rf->f_flag = FREAD | FWRITE;
+ rf->f_type = DTYPE_PIPE;
+ rf->f_data = (caddr_t)rpipe;
+ rf->f_ops = &pipeops;
+ FILE_UNLOCK(rf);
+ error = falloc(td, &wf, &fd);
+ if (error) {
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
+ fdp->fd_ofiles[td->td_retval[0]] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(rf, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ fdrop(rf, td);
+ /* rpipe has been closed by fdrop(). */
+ pipeclose(wpipe);
+ free(pmtx, M_TEMP);
+ return (error);
+ }
+ FILE_LOCK(wf);
+ wf->f_flag = FREAD | FWRITE;
+ wf->f_type = DTYPE_PIPE;
+ wf->f_data = (caddr_t)wpipe;
+ wf->f_ops = &pipeops;
+ FILE_UNLOCK(wf);
+ td->td_retval[1] = fd;
+ rpipe->pipe_peer = wpipe;
+ wpipe->pipe_peer = rpipe;
+ mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
+ rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
+ fdrop(rf, td);
+
+ return (0);
+}
+
+/*
+ * Allocate kva for the pipe circular buffer; the space is pageable.
+ * This routine will 'realloc' the size of a pipe safely: if it fails,
+ * it will retain the old buffer and return ENOMEM.
+ */
+static int
+pipespace(cpipe, size)
+ struct pipe *cpipe;
+ int size;
+{
+ struct vm_object *object;
+ caddr_t buffer;
+ int npages, error;
+
+ GIANT_REQUIRED;
+ KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
+ ("pipespace: pipe mutex locked"));
+
+ npages = round_page(size)/PAGE_SIZE;
+ /*
+ * Create an object, I don't like the idea of paging to/from
+ * kernel_object.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ object = vm_object_allocate(OBJT_DEFAULT, npages);
+ buffer = (caddr_t) vm_map_min(kernel_map);
+
+ /*
+ * Insert the object into the kernel map, and allocate kva for it.
+ * The map entry is, by default, pageable.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ error = vm_map_find(kernel_map, object, 0,
+ (vm_offset_t *) &buffer, size, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(object);
+ return (ENOMEM);
+ }
+
+ /* free old resources if we're resizing */
+ pipe_free_kmem(cpipe);
+ cpipe->pipe_buffer.object = object;
+ cpipe->pipe_buffer.buffer = buffer;
+ cpipe->pipe_buffer.size = size;
+ cpipe->pipe_buffer.in = 0;
+ cpipe->pipe_buffer.out = 0;
+ cpipe->pipe_buffer.cnt = 0;
+ amountpipekva += cpipe->pipe_buffer.size;
+ return (0);
+}
+
+/*
+ * initialize and allocate VM and memory for pipe
+ */
+static int
+pipe_create(cpipep)
+ struct pipe **cpipep;
+{
+ struct pipe *cpipe;
+ int error;
+
+ *cpipep = uma_zalloc(pipe_zone, M_WAITOK);
+ if (*cpipep == NULL)
+ return (ENOMEM);
+
+ cpipe = *cpipep;
+
+ /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
+ cpipe->pipe_buffer.object = NULL;
+#ifndef PIPE_NODIRECT
+ cpipe->pipe_map.kva = NULL;
+#endif
+ /*
+ * protect so pipeclose() doesn't follow a junk pointer
+ * if pipespace() fails.
+ */
+ bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
+ cpipe->pipe_state = 0;
+ cpipe->pipe_peer = NULL;
+ cpipe->pipe_busy = 0;
+
+#ifndef PIPE_NODIRECT
+ /*
+ * pipe data structure initializations to support direct pipe I/O
+ */
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.kva = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ /* cpipe->pipe_map.ms[] = invalid */
+#endif
+
+ cpipe->pipe_mtxp = NULL; /* avoid pipespace assertion */
+ error = pipespace(cpipe, PIPE_SIZE);
+ if (error)
+ return (error);
+
+ vfs_timestamp(&cpipe->pipe_ctime);
+ cpipe->pipe_atime = cpipe->pipe_ctime;
+ cpipe->pipe_mtime = cpipe->pipe_ctime;
+
+ return (0);
+}
+
+
+/*
+ * lock a pipe for I/O, blocking other access
+ */
+static __inline int
+pipelock(cpipe, catch)
+ struct pipe *cpipe;
+ int catch;
+{
+ int error;
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ while (cpipe->pipe_state & PIPE_LOCKFL) {
+ cpipe->pipe_state |= PIPE_LWANT;
+ error = msleep(cpipe, PIPE_MTX(cpipe),
+ catch ? (PRIBIO | PCATCH) : PRIBIO,
+ "pipelk", 0);
+ if (error != 0)
+ return (error);
+ }
+ cpipe->pipe_state |= PIPE_LOCKFL;
+ return (0);
+}
+
+/*
+ * unlock a pipe I/O lock
+ */
+static __inline void
+pipeunlock(cpipe)
+ struct pipe *cpipe;
+{
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ cpipe->pipe_state &= ~PIPE_LOCKFL;
+ if (cpipe->pipe_state & PIPE_LWANT) {
+ cpipe->pipe_state &= ~PIPE_LWANT;
+ wakeup(cpipe);
+ }
+}
+
+static __inline void
+pipeselwakeup(cpipe)
+ struct pipe *cpipe;
+{
+
+ if (cpipe->pipe_state & PIPE_SEL) {
+ cpipe->pipe_state &= ~PIPE_SEL;
+ selwakeup(&cpipe->pipe_sel);
+ }
+ if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
+ pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
+ KNOTE(&cpipe->pipe_sel.si_note, 0);
+}
+
+/* ARGSUSED */
+static int
+pipe_read(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ struct pipe *rpipe = (struct pipe *) fp->f_data;
+ int error;
+ int nread = 0;
+ u_int size;
+
+ PIPE_LOCK(rpipe);
+ ++rpipe->pipe_busy;
+ error = pipelock(rpipe, 1);
+ if (error)
+ goto unlocked_error;
+
+ while (uio->uio_resid) {
+ /*
+ * normal pipe buffer receive
+ */
+ if (rpipe->pipe_buffer.cnt > 0) {
+ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
+ if (size > rpipe->pipe_buffer.cnt)
+ size = rpipe->pipe_buffer.cnt;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
+ size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+
+ rpipe->pipe_buffer.out += size;
+ if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
+ rpipe->pipe_buffer.out = 0;
+
+ rpipe->pipe_buffer.cnt -= size;
+
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+ if (rpipe->pipe_buffer.cnt == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ }
+ nread += size;
+#ifndef PIPE_NODIRECT
+ /*
+ * Direct copy, bypassing a kernel buffer.
+ */
+ } else if ((size = rpipe->pipe_map.cnt) &&
+ (rpipe->pipe_state & PIPE_DIRECTW)) {
+ caddr_t va;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+
+ va = (caddr_t) rpipe->pipe_map.kva +
+ rpipe->pipe_map.pos;
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(va, size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+ nread += size;
+ rpipe->pipe_map.pos += size;
+ rpipe->pipe_map.cnt -= size;
+ if (rpipe->pipe_map.cnt == 0) {
+ rpipe->pipe_state &= ~PIPE_DIRECTW;
+ wakeup(rpipe);
+ }
+#endif
+ } else {
+ /*
+ * detect EOF condition
+ * read returns 0 on EOF, no need to set error
+ */
+ if (rpipe->pipe_state & PIPE_EOF)
+ break;
+
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+
+ /*
+ * Break if some data was read.
+ */
+ if (nread > 0)
+ break;
+
+ /*
+ * Unlock the pipe buffer for our remaining processing. We
+ * will either break out with an error or we will sleep and
+ * relock to loop.
+ */
+ pipeunlock(rpipe);
+
+ /*
+ * Handle non-blocking mode operation or
+ * wait for more data.
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ } else {
+ rpipe->pipe_state |= PIPE_WANTR;
+ if ((error = msleep(rpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH,
+ "piperd", 0)) == 0)
+ error = pipelock(rpipe, 1);
+ }
+ if (error)
+ goto unlocked_error;
+ }
+ }
+ pipeunlock(rpipe);
+
+ /* XXX: should probably do this before getting any locks. */
+ if (error == 0)
+ vfs_timestamp(&rpipe->pipe_atime);
+unlocked_error:
+ --rpipe->pipe_busy;
+
+ /*
+ * PIPE_WANT processing only makes sense if pipe_busy is 0.
+ */
+ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
+ rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
+ wakeup(rpipe);
+ } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
+ /*
+ * Handle write blocking hysteresis.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ }
+
+ if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
+ pipeselwakeup(rpipe);
+
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+#ifndef PIPE_NODIRECT
+/*
+ * Map the sending process's buffer into kernel space and wire it.
+ * This is similar to a physical write operation.
+ */
+static int
+pipe_build_write_buffer(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ u_int size;
+ int i;
+ vm_offset_t addr, endaddr, paddr;
+
+ GIANT_REQUIRED;
+ PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
+
+ size = (u_int) uio->uio_iov->iov_len;
+ if (size > wpipe->pipe_buffer.size)
+ size = wpipe->pipe_buffer.size;
+
+ endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
+ addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
+ for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
+ vm_page_t m;
+
+ if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
+ (paddr = pmap_extract(vmspace_pmap(curproc->p_vmspace),
+ addr)) == 0) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ vm_page_unwire(wpipe->pipe_map.ms[j], 1);
+ return (EFAULT);
+ }
+
+ m = PHYS_TO_VM_PAGE(paddr);
+ vm_page_wire(m);
+ wpipe->pipe_map.ms[i] = m;
+ }
+
+/*
+ * set up the control block
+ */
+ wpipe->pipe_map.npages = i;
+ wpipe->pipe_map.pos =
+ ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
+ wpipe->pipe_map.cnt = size;
+
+/*
+ * and map the buffer
+ */
+ if (wpipe->pipe_map.kva == 0) {
+ /*
+ * We need to allocate space for an extra page because the
+ * address range might (will) span pages at times.
+ */
+ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
+ wpipe->pipe_map.npages);
+
+/*
+ * and update the uio data
+ */
+
+ uio->uio_iov->iov_len -= size;
+ uio->uio_iov->iov_base += size;
+ if (uio->uio_iov->iov_len == 0)
+ uio->uio_iov++;
+ uio->uio_resid -= size;
+ uio->uio_offset += size;
+ return (0);
+}
+
+/*
+ * unmap and unwire the process buffer
+ */
+static void
+pipe_destroy_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+ int i;
+
+ GIANT_REQUIRED;
+ PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
+
+ if (wpipe->pipe_map.kva) {
+ pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
+
+ if (amountpipekva > MAXPIPEKVA) {
+ vm_offset_t kva = wpipe->pipe_map.kva;
+ wpipe->pipe_map.kva = 0;
+ kmem_free(kernel_map, kva,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ }
+ for (i = 0; i < wpipe->pipe_map.npages; i++)
+ vm_page_unwire(wpipe->pipe_map.ms[i], 1);
+ wpipe->pipe_map.npages = 0;
+}
+
+/*
+ * In the case of a signal, the writing process might go away. This
+ * code copies the data into the circular buffer so that the source
+ * pages can be freed without loss of data.
+ */
+static void
+pipe_clone_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+ int size;
+ int pos;
+
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ size = wpipe->pipe_map.cnt;
+ pos = wpipe->pipe_map.pos;
+
+ wpipe->pipe_buffer.in = size;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = size;
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+
+ PIPE_GET_GIANT(wpipe);
+ bcopy((caddr_t) wpipe->pipe_map.kva + pos,
+ (caddr_t) wpipe->pipe_buffer.buffer, size);
+ pipe_destroy_write_buffer(wpipe);
+ PIPE_DROP_GIANT(wpipe);
+}
+
+/*
+ * This implements the pipe buffer write mechanism. Note that only
+ * a direct write OR a normal pipe write can be pending at any given time.
+ * If there are any characters in the pipe buffer, the direct write will
+ * be deferred until the receiving process grabs all of the bytes from
+ * the pipe buffer. Then the direct mapping write is set-up.
+ */
+static int
+pipe_direct_write(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ int error;
+
+retry:
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdww", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ }
+ wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
+ if (wpipe->pipe_buffer.cnt > 0) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdwc", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ goto retry;
+ }
+
+ wpipe->pipe_state |= PIPE_DIRECTW;
+
+ pipelock(wpipe, 0);
+ PIPE_GET_GIANT(wpipe);
+ error = pipe_build_write_buffer(wpipe, uio);
+ PIPE_DROP_GIANT(wpipe);
+ pipeunlock(wpipe);
+ if (error) {
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+ goto error1;
+ }
+
+ error = 0;
+ while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipelock(wpipe, 0);
+ PIPE_GET_GIANT(wpipe);
+ pipe_destroy_write_buffer(wpipe);
+ PIPE_DROP_GIANT(wpipe);
+ pipeunlock(wpipe);
+ pipeselwakeup(wpipe);
+ error = EPIPE;
+ goto error1;
+ }
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
+ "pipdwt", 0);
+ }
+
+ pipelock(wpipe,0);
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ /*
+ * this bit of trickery substitutes a kernel buffer for
+ * the process that might be going away.
+ */
+ pipe_clone_write_buffer(wpipe);
+ } else {
+ PIPE_GET_GIANT(wpipe);
+ pipe_destroy_write_buffer(wpipe);
+ PIPE_DROP_GIANT(wpipe);
+ }
+ pipeunlock(wpipe);
+ return (error);
+
+error1:
+ wakeup(wpipe);
+ return (error);
+}
+#endif
+
+static int
+pipe_write(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ int error = 0;
+ int orig_resid;
+ struct pipe *wpipe, *rpipe;
+
+ rpipe = (struct pipe *) fp->f_data;
+ wpipe = rpipe->pipe_peer;
+
+ PIPE_LOCK(rpipe);
+ /*
+ * detect loss of pipe read side, issue SIGPIPE if lost.
+ */
+ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ PIPE_UNLOCK(rpipe);
+ return (EPIPE);
+ }
+ ++wpipe->pipe_busy;
+
+ /*
+ * If it is advantageous to resize the pipe buffer, do
+ * so.
+ */
+ if ((uio->uio_resid > PIPE_SIZE) &&
+ (nbigpipe < LIMITBIGPIPES) &&
+ (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
+ (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
+ (wpipe->pipe_buffer.cnt == 0)) {
+
+ if ((error = pipelock(wpipe,1)) == 0) {
+ PIPE_GET_GIANT(wpipe);
+ if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
+ nbigpipe++;
+ PIPE_DROP_GIANT(wpipe);
+ pipeunlock(wpipe);
+ }
+ }
+
+ /*
+	 * If an early error occurred, unbusy and return, waking up any pending
+ * readers.
+ */
+ if (error) {
+ --wpipe->pipe_busy;
+ if ((wpipe->pipe_busy == 0) &&
+ (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
+ wakeup(wpipe);
+ }
+ PIPE_UNLOCK(rpipe);
+ return(error);
+ }
+
+ KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
+
+ orig_resid = uio->uio_resid;
+
+ while (uio->uio_resid) {
+ int space;
+
+#ifndef PIPE_NODIRECT
+ /*
+ * If the transfer is large, we can gain performance if
+ * we do process-to-process copies directly.
+ * If the write is non-blocking, we don't use the
+ * direct write mechanism.
+ *
+ * The direct write mechanism will detect the reader going
+ * away on us.
+ */
+		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
+		    (fp->f_flag & FNONBLOCK) == 0 &&
+		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
+ error = pipe_direct_write( wpipe, uio);
+ if (error)
+ break;
+ continue;
+ }
+#endif
+
+ /*
+		 * Pipe buffered writes cannot be coincident with
+ * direct writes. We wait until the currently executing
+ * direct write is completed before we start filling the
+ * pipe buffer. We break out if a signal occurs or the
+ * reader goes away.
+ */
+ retrywrite:
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
+ "pipbww", 0);
+ if (wpipe->pipe_state & PIPE_EOF)
+ break;
+ if (error)
+ break;
+ }
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ break;
+ }
+
+ space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+
+ /* Writes of size <= PIPE_BUF must be atomic. */
+ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
+ space = 0;
+
+ if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
+ if ((error = pipelock(wpipe,1)) == 0) {
+ int size; /* Transfer size */
+ int segsize; /* first segment to transfer */
+
+ /*
+ * It is possible for a direct write to
+ * slip in on us... handle it here...
+ */
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ pipeunlock(wpipe);
+ goto retrywrite;
+ }
+ /*
+ * If a process blocked in uiomove, our
+ * value for space might be bad.
+ *
+ * XXX will we be ok if the reader has gone
+ * away here?
+ */
+ if (space > wpipe->pipe_buffer.size -
+ wpipe->pipe_buffer.cnt) {
+ pipeunlock(wpipe);
+ goto retrywrite;
+ }
+
+ /*
+ * Transfer size is minimum of uio transfer
+ * and free space in pipe buffer.
+ */
+ if (space > uio->uio_resid)
+ size = uio->uio_resid;
+ else
+ size = space;
+ /*
+ * First segment to transfer is minimum of
+ * transfer size and contiguous space in
+ * pipe buffer. If first segment to transfer
+ * is less than the transfer size, we've got
+ * a wraparound in the buffer.
+ */
+ segsize = wpipe->pipe_buffer.size -
+ wpipe->pipe_buffer.in;
+ if (segsize > size)
+ segsize = size;
+
+ /* Transfer first segment */
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
+ segsize, uio);
+ PIPE_LOCK(rpipe);
+
+ if (error == 0 && segsize < size) {
+ /*
+ * Transfer remaining part now, to
+ * support atomic writes. Wraparound
+ * happened.
+ */
+ if (wpipe->pipe_buffer.in + segsize !=
+ wpipe->pipe_buffer.size)
+ panic("Expected pipe buffer wraparound disappeared");
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(&wpipe->pipe_buffer.buffer[0],
+ size - segsize, uio);
+ PIPE_LOCK(rpipe);
+ }
+ if (error == 0) {
+ wpipe->pipe_buffer.in += size;
+ if (wpipe->pipe_buffer.in >=
+ wpipe->pipe_buffer.size) {
+ if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
+ panic("Expected wraparound bad");
+ wpipe->pipe_buffer.in = size - segsize;
+ }
+
+ wpipe->pipe_buffer.cnt += size;
+ if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
+ panic("Pipe buffer overflow");
+
+ }
+ pipeunlock(wpipe);
+ }
+ if (error)
+ break;
+
+ } else {
+ /*
+ * If the "read-side" has been blocked, wake it up now.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ /*
+ * don't block on non-blocking I/O
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ break;
+ }
+
+ /*
+ * We have no more space and have something to offer,
+ * wake up select/poll.
+ */
+ pipeselwakeup(wpipe);
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = msleep(wpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH, "pipewr", 0);
+ if (error != 0)
+ break;
+ /*
+ * If read side wants to go away, we just issue a signal
+ * to ourselves.
+ */
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ break;
+ }
+ }
+ }
+
+ --wpipe->pipe_busy;
+
+ if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
+ wakeup(wpipe);
+ } else if (wpipe->pipe_buffer.cnt > 0) {
+ /*
+ * If we have put any characters in the buffer, we wake up
+ * the reader.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ }
+
+ /*
+ * Don't return EPIPE if I/O was successful
+ */
+ if ((wpipe->pipe_buffer.cnt == 0) &&
+ (uio->uio_resid == 0) &&
+ (error == EPIPE)) {
+ error = 0;
+ }
+
+ if (error == 0)
+ vfs_timestamp(&wpipe->pipe_mtime);
+
+ /*
+ * We have something to offer,
+ * wake up select/poll.
+ */
+ if (wpipe->pipe_buffer.cnt)
+ pipeselwakeup(wpipe);
+
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+/*
+ * we implement a very minimal set of ioctls for compatibility with sockets.
+ */
+int
+pipe_ioctl(fp, cmd, data, td)
+ struct file *fp;
+ u_long cmd;
+ caddr_t data;
+ struct thread *td;
+{
+ struct pipe *mpipe = (struct pipe *)fp->f_data;
+
+ switch (cmd) {
+
+ case FIONBIO:
+ return (0);
+
+ case FIOASYNC:
+ PIPE_LOCK(mpipe);
+ if (*(int *)data) {
+ mpipe->pipe_state |= PIPE_ASYNC;
+ } else {
+ mpipe->pipe_state &= ~PIPE_ASYNC;
+ }
+ PIPE_UNLOCK(mpipe);
+ return (0);
+
+ case FIONREAD:
+ PIPE_LOCK(mpipe);
+ if (mpipe->pipe_state & PIPE_DIRECTW)
+ *(int *)data = mpipe->pipe_map.cnt;
+ else
+ *(int *)data = mpipe->pipe_buffer.cnt;
+ PIPE_UNLOCK(mpipe);
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &mpipe->pipe_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(mpipe->pipe_sigio);
+ return (0);
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead. */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(mpipe->pipe_sigio);
+ return (0);
+
+ }
+ return (ENOTTY);
+}
+
+int
+pipe_poll(fp, events, cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct thread *td;
+{
+ struct pipe *rpipe = (struct pipe *)fp->f_data;
+ struct pipe *wpipe;
+ int revents = 0;
+
+ wpipe = rpipe->pipe_peer;
+ PIPE_LOCK(rpipe);
+ if (events & (POLLIN | POLLRDNORM))
+ if ((rpipe->pipe_state & PIPE_DIRECTW) ||
+ (rpipe->pipe_buffer.cnt > 0) ||
+ (rpipe->pipe_state & PIPE_EOF))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
+ (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
+ (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ (wpipe == NULL) ||
+ (wpipe->pipe_state & PIPE_EOF))
+ revents |= POLLHUP;
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLRDNORM)) {
+ selrecord(td, &rpipe->pipe_sel);
+ rpipe->pipe_state |= PIPE_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &wpipe->pipe_sel);
+ wpipe->pipe_state |= PIPE_SEL;
+ }
+ }
+ PIPE_UNLOCK(rpipe);
+
+ return (revents);
+}
+
+/*
+ * We shouldn't need locks here as we're doing a read and this should
+ * be a natural race.
+ */
+static int
+pipe_stat(fp, ub, td)
+ struct file *fp;
+ struct stat *ub;
+ struct thread *td;
+{
+ struct pipe *pipe = (struct pipe *)fp->f_data;
+
+ bzero((caddr_t)ub, sizeof(*ub));
+ ub->st_mode = S_IFIFO;
+ ub->st_blksize = pipe->pipe_buffer.size;
+ ub->st_size = pipe->pipe_buffer.cnt;
+ ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
+ ub->st_atimespec = pipe->pipe_atime;
+ ub->st_mtimespec = pipe->pipe_mtime;
+ ub->st_ctimespec = pipe->pipe_ctime;
+ ub->st_uid = fp->f_cred->cr_uid;
+ ub->st_gid = fp->f_cred->cr_gid;
+ /*
+ * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
+ * XXX (st_dev, st_ino) should be unique.
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pipe_close(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+ struct pipe *cpipe = (struct pipe *)fp->f_data;
+
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ funsetown(&cpipe->pipe_sigio);
+ pipeclose(cpipe);
+ return (0);
+}
+
+static void
+pipe_free_kmem(cpipe)
+ struct pipe *cpipe;
+{
+
+ GIANT_REQUIRED;
+ KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
+ ("pipespace: pipe mutex locked"));
+
+ if (cpipe->pipe_buffer.buffer != NULL) {
+ if (cpipe->pipe_buffer.size > PIPE_SIZE)
+ --nbigpipe;
+ amountpipekva -= cpipe->pipe_buffer.size;
+ kmem_free(kernel_map,
+ (vm_offset_t)cpipe->pipe_buffer.buffer,
+ cpipe->pipe_buffer.size);
+ cpipe->pipe_buffer.buffer = NULL;
+ }
+#ifndef PIPE_NODIRECT
+ if (cpipe->pipe_map.kva != NULL) {
+ amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
+ kmem_free(kernel_map,
+ cpipe->pipe_map.kva,
+ cpipe->pipe_buffer.size + PAGE_SIZE);
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.kva = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ }
+#endif
+}
+
+/*
+ * shutdown the pipe
+ */
+static void
+pipeclose(cpipe)
+ struct pipe *cpipe;
+{
+ struct pipe *ppipe;
+ int hadpeer;
+
+ if (cpipe == NULL)
+ return;
+
+ hadpeer = 0;
+
+ /* partially created pipes won't have a valid mutex. */
+ if (PIPE_MTX(cpipe) != NULL)
+ PIPE_LOCK(cpipe);
+
+ pipeselwakeup(cpipe);
+
+ /*
+ * If the other side is blocked, wake it up saying that
+ * we want to close it down.
+ */
+ while (cpipe->pipe_busy) {
+ wakeup(cpipe);
+ cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
+ msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
+ }
+
+ /*
+ * Disconnect from peer
+ */
+ if ((ppipe = cpipe->pipe_peer) != NULL) {
+ hadpeer++;
+ pipeselwakeup(ppipe);
+
+ ppipe->pipe_state |= PIPE_EOF;
+ wakeup(ppipe);
+ KNOTE(&ppipe->pipe_sel.si_note, 0);
+ ppipe->pipe_peer = NULL;
+ }
+ /*
+ * free resources
+ */
+ if (PIPE_MTX(cpipe) != NULL) {
+ PIPE_UNLOCK(cpipe);
+ if (!hadpeer) {
+ mtx_destroy(PIPE_MTX(cpipe));
+ free(PIPE_MTX(cpipe), M_TEMP);
+ }
+ }
+ mtx_lock(&Giant);
+ pipe_free_kmem(cpipe);
+ uma_zfree(pipe_zone, cpipe);
+ mtx_unlock(&Giant);
+}
+
+/*ARGSUSED*/
+static int
+pipe_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct pipe *cpipe;
+
+ cpipe = (struct pipe *)kn->kn_fp->f_data;
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &pipe_rfiltops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &pipe_wfiltops;
+ cpipe = cpipe->pipe_peer;
+ break;
+ default:
+ return (1);
+ }
+ kn->kn_hook = (caddr_t)cpipe;
+
+ PIPE_LOCK(cpipe);
+ SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
+ PIPE_UNLOCK(cpipe);
+ return (0);
+}
+
+static void
+filt_pipedetach(struct knote *kn)
+{
+ struct pipe *cpipe = (struct pipe *)kn->kn_hook;
+
+ PIPE_LOCK(cpipe);
+ SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
+ PIPE_UNLOCK(cpipe);
+}
+
+/*ARGSUSED*/
+static int
+filt_piperead(struct knote *kn, long hint)
+{
+ struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+ struct pipe *wpipe = rpipe->pipe_peer;
+
+ PIPE_LOCK(rpipe);
+ kn->kn_data = rpipe->pipe_buffer.cnt;
+ if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
+ kn->kn_data = rpipe->pipe_map.cnt;
+
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(rpipe);
+ return (1);
+ }
+ PIPE_UNLOCK(rpipe);
+ return (kn->kn_data > 0);
+}
+
+/*ARGSUSED*/
+static int
+filt_pipewrite(struct knote *kn, long hint)
+{
+ struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
+ struct pipe *wpipe = rpipe->pipe_peer;
+
+ PIPE_LOCK(rpipe);
+ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_data = 0;
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(rpipe);
+ return (1);
+ }
+ kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+ if (wpipe->pipe_state & PIPE_DIRECTW)
+ kn->kn_data = 0;
+
+ PIPE_UNLOCK(rpipe);
+ return (kn->kn_data >= PIPE_BUF);
+}
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
new file mode 100644
index 0000000..dacb9d9
--- /dev/null
+++ b/sys/kern/sys_process.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 1994, Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Sean Eric Fagan.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/ptrace.h>
+#include <sys/sx.h>
+#include <sys/user.h>
+
+#include <machine/reg.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+
+/*
+ * Functions implemented using PROC_ACTION():
+ *
+ * proc_read_regs(proc, regs)
+ * Get the current user-visible register set from the process
+ * and copy it into the regs structure (<machine/reg.h>).
+ * The process is stopped at the time read_regs is called.
+ *
+ * proc_write_regs(proc, regs)
+ * Update the current register set from the passed in regs
+ * structure. Take care to avoid clobbering special CPU
+ * registers or privileged bits in the PSL.
+ * Depending on the architecture this may have fix-up work to do,
+ * especially if the IAR or PCW are modified.
+ * The process is stopped at the time write_regs is called.
+ *
+ * proc_read_fpregs, proc_write_fpregs
+ * deal with the floating point register set, otherwise as above.
+ *
+ * proc_read_dbregs, proc_write_dbregs
+ * deal with the processor debug register set, otherwise as above.
+ *
+ * proc_sstep(proc)
+ * Arrange for the process to trap after executing a single instruction.
+ */
+
+#define PROC_ACTION(action) do { \
+ int error; \
+ \
+ mtx_lock_spin(&sched_lock); \
+ if ((td->td_proc->p_sflag & PS_INMEM) == 0) \
+ error = EIO; \
+ else \
+ error = (action); \
+ mtx_unlock_spin(&sched_lock); \
+ return (error); \
+} while(0)
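+
+/*
+ * Expansion sketch (editorial): proc_read_regs(td, regs) below boils
+ * down to roughly
+ *
+ *	mtx_lock_spin(&sched_lock);
+ *	error = (td->td_proc->p_sflag & PS_INMEM) == 0 ?
+ *	    EIO : fill_regs(td, regs);
+ *	mtx_unlock_spin(&sched_lock);
+ *	return (error);
+ */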
+
+int
+proc_read_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(fill_regs(td, regs));
+}
+
+int
+proc_write_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(set_regs(td, regs));
+}
+
+int
+proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(fill_dbregs(td, dbregs));
+}
+
+int
+proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(set_dbregs(td, dbregs));
+}
+
+/*
+ * Ptrace doesn't support fpregs at all, and there are no security holes
+ * or translations for fpregs, so we can just copy them.
+ */
+int
+proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(fill_fpregs(td, fpregs));
+}
+
+int
+proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(set_fpregs(td, fpregs));
+}
+
+int
+proc_sstep(struct thread *td)
+{
+
+ PROC_ACTION(ptrace_single_step(td));
+}
+
+int
+proc_rwmem(struct proc *p, struct uio *uio)
+{
+ struct vmspace *vm;
+ vm_map_t map;
+ vm_object_t object = NULL;
+ vm_offset_t pageno = 0; /* page number */
+ vm_prot_t reqprot;
+ vm_offset_t kva;
+ int error, writing;
+
+ GIANT_REQUIRED;
+
+ /*
+ * if the vmspace is in the midst of being deallocated or the
+ * process is exiting, don't try to grab anything. The page table
+ * usage in that process can be messed up.
+ */
+ vm = p->p_vmspace;
+ if ((p->p_flag & P_WEXIT))
+ return (EFAULT);
+ if (vm->vm_refcnt < 1)
+ return (EFAULT);
+ ++vm->vm_refcnt;
+ /*
+ * The map we want...
+ */
+ map = &vm->vm_map;
+
+ writing = uio->uio_rw == UIO_WRITE;
+ reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) :
+ VM_PROT_READ;
+
+ kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
+
+ /*
+ * Only map in one page at a time. We don't have to, but it
+ * makes things easier. This way is trivial - right?
+ */
+ do {
+ vm_map_t tmap;
+ vm_offset_t uva;
+ int page_offset; /* offset into page */
+ vm_map_entry_t out_entry;
+ vm_prot_t out_prot;
+ boolean_t wired;
+ vm_pindex_t pindex;
+ u_int len;
+ vm_page_t m;
+
+ object = NULL;
+
+ uva = (vm_offset_t)uio->uio_offset;
+
+ /*
+ * Get the page number of this segment.
+ */
+ pageno = trunc_page(uva);
+ page_offset = uva - pageno;
+
+ /*
+ * How many bytes to copy
+ */
+ len = min(PAGE_SIZE - page_offset, uio->uio_resid);
+
+ /*
+ * Fault the page on behalf of the process
+ */
+ error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
+ if (error) {
+ error = EFAULT;
+ break;
+ }
+
+ /*
+ * Now we need to get the page. out_entry, out_prot, wired,
+ * and single_use aren't used. One would think the vm code
+ * would be a *bit* nicer... We use tmap because
+ * vm_map_lookup() can change the map argument.
+ */
+ tmap = map;
+ error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
+ &object, &pindex, &out_prot, &wired);
+
+ if (error) {
+ error = EFAULT;
+
+ /*
+ * Make sure that there is no residue in 'object' from
+ * an error return on vm_map_lookup.
+ */
+ object = NULL;
+
+ break;
+ }
+
+ m = vm_page_lookup(object, pindex);
+
+ /* Allow fallback to backing objects if we are reading */
+
+ while (m == NULL && !writing && object->backing_object) {
+
+ pindex += OFF_TO_IDX(object->backing_object_offset);
+ object = object->backing_object;
+
+ m = vm_page_lookup(object, pindex);
+ }
+
+ if (m == NULL) {
+ error = EFAULT;
+
+ /*
+ * Make sure that there is no residue in 'object' from
+ * an error return on vm_map_lookup.
+ */
+ object = NULL;
+
+ vm_map_lookup_done(tmap, out_entry);
+
+ break;
+ }
+
+ /*
+ * Wire the page into memory
+ */
+ vm_page_wire(m);
+
+ /*
+ * We're done with tmap now.
+		 * But reference the object first, so that we won't lose
+ * it.
+ */
+ vm_object_reference(object);
+ vm_map_lookup_done(tmap, out_entry);
+
+ pmap_qenter(kva, &m, 1);
+
+ /*
+ * Now do the i/o move.
+ */
+ error = uiomove((caddr_t)(kva + page_offset), len, uio);
+
+ pmap_qremove(kva, 1);
+
+ /*
+ * release the page and the object
+ */
+ vm_page_unwire(m, 1);
+ vm_object_deallocate(object);
+
+ object = NULL;
+
+ } while (error == 0 && uio->uio_resid > 0);
+
+ if (object)
+ vm_object_deallocate(object);
+
+ kmem_free(kernel_map, kva, PAGE_SIZE);
+ vmspace_free(vm);
+ return (error);
+}
+
+/*
+ * Process debugging system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ptrace_args {
+ int req;
+ pid_t pid;
+ caddr_t addr;
+ int data;
+};
+#endif
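+
+/*
+ * Illustrative only (editor's sketch, not kernel code): a debugger
+ * typically attaches, waits for the SIGSTOP that PT_ATTACH delivers,
+ * and later detaches, continuing from the current PC (addr == 1):
+ *
+ *	if (ptrace(PT_ATTACH, pid, (caddr_t)0, 0) == 0) {
+ *		waitpid(pid, &status, 0);
+ *		...
+ *		ptrace(PT_DETACH, pid, (caddr_t)1, 0);
+ *	}
+ */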
+
+int
+ptrace(struct thread *td, struct ptrace_args *uap)
+{
+ struct iovec iov;
+ struct uio uio;
+ /*
+ * XXX this obfuscation is to reduce stack usage, but the register
+ * structs may be too large to put on the stack anyway.
+ */
+ union {
+ struct ptrace_io_desc piod;
+ struct dbreg dbreg;
+ struct fpreg fpreg;
+ struct reg reg;
+ } r;
+ struct proc *curp, *p, *pp;
+ struct thread *td2;
+ int error, write;
+ int proctree_locked = 0;
+
+ curp = td->td_proc;
+
+ /*
+ * Do copyin() early before getting locks and lock proctree before
+ * locking the process.
+ */
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ case PT_ATTACH:
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_DETACH:
+ sx_xlock(&proctree_lock);
+ proctree_locked = 1;
+ break;
+#ifdef PT_SETREGS
+ case PT_SETREGS:
+ error = copyin(uap->addr, &r.reg, sizeof r.reg);
+ if (error)
+ return (error);
+ break;
+#endif /* PT_SETREGS */
+#ifdef PT_SETFPREGS
+ case PT_SETFPREGS:
+ error = copyin(uap->addr, &r.fpreg, sizeof r.fpreg);
+ if (error)
+ return (error);
+ break;
+#endif /* PT_SETFPREGS */
+#ifdef PT_SETDBREGS
+ case PT_SETDBREGS:
+ error = copyin(uap->addr, &r.dbreg, sizeof r.dbreg);
+ if (error)
+ return (error);
+ break;
+#endif /* PT_SETDBREGS */
+ default:
+ break;
+ }
+
+ write = 0;
+ if (uap->req == PT_TRACE_ME) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ if ((p = pfind(uap->pid)) == NULL) {
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ }
+ if (p_cansee(td, p)) {
+ error = ESRCH;
+ goto fail;
+ }
+
+ if ((error = p_candebug(td, p)) != 0)
+ goto fail;
+
+ /*
+ * System processes can't be debugged.
+ */
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ /*
+ * Permissions check
+ */
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* Always legal. */
+ break;
+
+ case PT_ATTACH:
+ /* Self */
+ if (p->p_pid == td->td_proc->p_pid) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ /* Already traced */
+ if (p->p_flag & P_TRACED) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* Can't trace an ancestor if you're being traced. */
+ if (curp->p_flag & P_TRACED) {
+ for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
+ if (pp == p) {
+ error = EINVAL;
+ goto fail;
+ }
+ }
+ }
+
+
+ /* OK */
+ break;
+
+ case PT_READ_I:
+ case PT_READ_D:
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ case PT_IO:
+ case PT_CONTINUE:
+ case PT_KILL:
+ case PT_STEP:
+ case PT_DETACH:
+ case PT_GETREGS:
+ case PT_SETREGS:
+ case PT_GETFPREGS:
+ case PT_SETFPREGS:
+ case PT_GETDBREGS:
+ case PT_SETDBREGS:
+ /* not being traced... */
+ if ((p->p_flag & P_TRACED) == 0) {
+ error = EPERM;
+ goto fail;
+ }
+
+ /* not being traced by YOU */
+ if (p->p_pptr != td->td_proc) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* not currently stopped */
+ if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* OK */
+ break;
+
+ default:
+ error = EINVAL;
+ goto fail;
+ }
+
+ td2 = FIRST_THREAD_IN_PROC(p);
+#ifdef FIX_SSTEP
+ /*
+ * Single step fixup ala procfs
+ */
+ FIX_SSTEP(td2); /* XXXKSE */
+#endif
+
+ /*
+ * Actually do the requests
+ */
+
+ td->td_retval[0] = 0;
+
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* set my trace flag and "owner" so it can read/write me */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ return (0);
+
+ case PT_ATTACH:
+ /* security check done above */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ if (p->p_pptr != td->td_proc)
+ proc_reparent(p, td->td_proc);
+ uap->data = SIGSTOP;
+ goto sendsig; /* in PT_CONTINUE below */
+
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_DETACH:
+ /* XXX uap->data is used even in the PT_STEP case. */
+ if (uap->req != PT_STEP && (unsigned)uap->data > _SIG_MAXSIG) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ _PHOLD(p);
+
+ if (uap->req == PT_STEP) {
+ error = ptrace_single_step(td2);
+ if (error) {
+ _PRELE(p);
+ goto fail;
+ }
+ }
+
+ if (uap->addr != (caddr_t)1) {
+ fill_kinfo_proc(p, &p->p_uarea->u_kproc);
+ error = ptrace_set_pc(td2,
+ (u_long)(uintfptr_t)uap->addr);
+ if (error) {
+ _PRELE(p);
+ goto fail;
+ }
+ }
+ _PRELE(p);
+
+ if (uap->req == PT_DETACH) {
+ /* reset process parent */
+ if (p->p_oppid != p->p_pptr->p_pid) {
+ struct proc *pp;
+
+ PROC_UNLOCK(p);
+ pp = pfind(p->p_oppid);
+ if (pp == NULL)
+ pp = initproc;
+ else
+ PROC_UNLOCK(pp);
+ PROC_LOCK(p);
+ proc_reparent(p, pp);
+ }
+ p->p_flag &= ~(P_TRACED | P_WAITED);
+ p->p_oppid = 0;
+
+ /* should we send SIGCHLD? */
+ }
+
+ sendsig:
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ /* deliver or queue signal */
+ if (p->p_stat == SSTOP) {
+ p->p_xstat = uap->data;
+ mtx_lock_spin(&sched_lock);
+ setrunnable(td2); /* XXXKSE */
+ mtx_unlock_spin(&sched_lock);
+ } else if (uap->data)
+ psignal(p, uap->data);
+ PROC_UNLOCK(p);
+
+ return (0);
+
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ write = 1;
+ /* fallthrough */
+ case PT_READ_I:
+ case PT_READ_D:
+ PROC_UNLOCK(p);
+ /* write = 0 set above */
+ iov.iov_base = write ? (caddr_t)&uap->data :
+ (caddr_t)td->td_retval;
+ iov.iov_len = sizeof(int);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(uintptr_t)uap->addr;
+ uio.uio_resid = sizeof(int);
+ uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_td = td;
+ error = proc_rwmem(p, &uio);
+ if (uio.uio_resid != 0) {
+ /*
+ * XXX proc_rwmem() doesn't currently return ENOSPC,
+ * so I think write() can bogusly return 0.
+ * XXX what happens for short writes? We don't want
+ * to write partial data.
+ * XXX proc_rwmem() returns EPERM for other invalid
+ * addresses. Convert this to EINVAL. Does this
+ * clobber returns of EPERM for other reasons?
+ */
+ if (error == 0 || error == ENOSPC || error == EPERM)
+ error = EINVAL; /* EOF */
+ }
+ return (error);
+
+ case PT_IO:
+ error = copyin(uap->addr, &r.piod, sizeof r.piod);
+ if (error)
+ return (error);
+ iov.iov_base = r.piod.piod_addr;
+ iov.iov_len = r.piod.piod_len;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(uintptr_t)r.piod.piod_offs;
+ uio.uio_resid = r.piod.piod_len;
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_td = td;
+ switch (r.piod.piod_op) {
+ case PIOD_READ_D:
+ case PIOD_READ_I:
+ uio.uio_rw = UIO_READ;
+ break;
+ case PIOD_WRITE_D:
+ case PIOD_WRITE_I:
+ uio.uio_rw = UIO_WRITE;
+ break;
+ default:
+ return (EINVAL);
+ }
+ error = proc_rwmem(p, &uio);
+ r.piod.piod_len -= uio.uio_resid;
+ (void)copyout(&r.piod, uap->addr, sizeof r.piod);
+ return (error);
+
+ case PT_KILL:
+ uap->data = SIGKILL;
+ goto sendsig; /* in PT_CONTINUE above */
+
+ case PT_SETREGS:
+ _PHOLD(p);
+ error = proc_write_regs(td2, &r.reg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ return (error);
+
+ case PT_GETREGS:
+ _PHOLD(p);
+ error = proc_read_regs(td2, &r.reg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ if (error == 0)
+ error = copyout(&r.reg, uap->addr, sizeof r.reg);
+ return (error);
+
+ case PT_SETFPREGS:
+ _PHOLD(p);
+ error = proc_write_fpregs(td2, &r.fpreg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ return (error);
+
+ case PT_GETFPREGS:
+ _PHOLD(p);
+ error = proc_read_fpregs(td2, &r.fpreg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ if (error == 0)
+ error = copyout(&r.fpreg, uap->addr, sizeof r.fpreg);
+ return (error);
+
+ case PT_SETDBREGS:
+ _PHOLD(p);
+ error = proc_write_dbregs(td2, &r.dbreg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ return (error);
+
+ case PT_GETDBREGS:
+ _PHOLD(p);
+ error = proc_read_dbregs(td2, &r.dbreg);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ if (error == 0)
+ error = copyout(&r.dbreg, uap->addr, sizeof r.dbreg);
+ return (error);
+
+ default:
+ KASSERT(0, ("unreachable code\n"));
+ break;
+ }
+
+ KASSERT(0, ("unreachable code\n"));
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (error);
+}
+
+/*
+ * Stop a process because of a debugging event;
+ * stay stopped until p->p_step is cleared
+ * (cleared by PIOCCONT in procfs).
+ */
+void
+stopevent(struct proc *p, unsigned int event, unsigned int val)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED);
+ p->p_step = 1;
+
+ do {
+ p->p_xstat = val;
+ p->p_stype = event; /* Which event caused the stop? */
+ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
+ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
+ } while (p->p_step);
+}
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
new file mode 100644
index 0000000..c8a6198
--- /dev/null
+++ b/sys/kern/sys_socket.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/filio.h> /* XXX */
+#include <sys/sockio.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/filedesc.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+struct fileops socketops = {
+ soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter,
+ soo_stat, soo_close
+};
+
+/* ARGSUSED */
+int
+soo_read(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ int error;
+
+ mtx_lock(&Giant);
+ error = so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+soo_write(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ int error;
+
+ mtx_lock(&Giant);
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0,
+ uio->uio_td);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+soo_ioctl(fp, cmd, data, td)
+ struct file *fp;
+ u_long cmd;
+ register caddr_t data;
+ struct thread *td;
+{
+ register struct socket *so = (struct socket *)fp->f_data;
+
+ switch (cmd) {
+
+ case FIONBIO:
+ if (*(int *)data)
+ so->so_state |= SS_NBIO;
+ else
+ so->so_state &= ~SS_NBIO;
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ so->so_state |= SS_ASYNC;
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ so->so_snd.sb_flags |= SB_ASYNC;
+ } else {
+ so->so_state &= ~SS_ASYNC;
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ }
+ return (0);
+
+ case FIONREAD:
+ *(int *)data = so->so_rcv.sb_cc;
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &so->so_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(so->so_sigio);
+ return (0);
+
+ case SIOCSPGRP:
+ return (fsetown(-(*(int *)data), &so->so_sigio));
+
+ case SIOCGPGRP:
+ *(int *)data = -fgetown(so->so_sigio);
+ return (0);
+
+ case SIOCATMARK:
+ *(int *)data = (so->so_state&SS_RCVATMARK) != 0;
+ return (0);
+ }
+ /*
+ * Interface/routing/protocol specific ioctls:
+	 * interface and routing ioctls should have a
+	 * different entry point since a socket is unnecessary.
+ */
+ if (IOCGROUP(cmd) == 'i')
+ return (ifioctl(so, cmd, data, td));
+ if (IOCGROUP(cmd) == 'r')
+ return (rtioctl(cmd, data));
+ return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, td));
+}
+
+int
+soo_poll(fp, events, cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct thread *td;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, td);
+}
+
+int
+soo_stat(fp, ub, td)
+ struct file *fp;
+ struct stat *ub;
+ struct thread *td;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFSOCK;
+ /*
+ * If SS_CANTRCVMORE is set, but there's still data left in the
+ * receive buffer, the socket is still readable.
+ */
+ if ((so->so_state & SS_CANTRCVMORE) == 0 ||
+ so->so_rcv.sb_cc != 0)
+ ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+ if ((so->so_state & SS_CANTSENDMORE) == 0)
+ ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+ ub->st_size = so->so_rcv.sb_cc;
+ ub->st_uid = so->so_cred->cr_uid;
+ ub->st_gid = so->so_cred->cr_gid;
+ return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub));
+}
+
+/*
+ * API socket close on file pointer. We call soclose() to close the
+ * socket (including initiating closing protocols). soclose() will
+ * sorele() the file reference but the actual socket will not go away
+ * until the socket's ref count hits 0.
+ */
+/* ARGSUSED */
+int
+soo_close(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+ int error = 0;
+ struct socket *so;
+
+ so = (struct socket *)fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = 0;
+
+ if (so)
+ error = soclose(so);
+ return (error);
+}
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
new file mode 100644
index 0000000..8b092fc
--- /dev/null
+++ b/sys/kern/syscalls.c
@@ -0,0 +1,403 @@
+/*
+ * System call names.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp
+ */
+
+char *syscallnames[] = {
+ "syscall", /* 0 = syscall */
+ "exit", /* 1 = exit */
+ "fork", /* 2 = fork */
+ "read", /* 3 = read */
+ "write", /* 4 = write */
+ "open", /* 5 = open */
+ "close", /* 6 = close */
+ "wait4", /* 7 = wait4 */
+ "old.creat", /* 8 = old creat */
+ "link", /* 9 = link */
+ "unlink", /* 10 = unlink */
+ "obs_execv", /* 11 = obsolete execv */
+ "chdir", /* 12 = chdir */
+ "fchdir", /* 13 = fchdir */
+ "mknod", /* 14 = mknod */
+ "chmod", /* 15 = chmod */
+ "chown", /* 16 = chown */
+ "break", /* 17 = break */
+ "getfsstat", /* 18 = getfsstat */
+ "old.lseek", /* 19 = old lseek */
+ "getpid", /* 20 = getpid */
+ "mount", /* 21 = mount */
+ "unmount", /* 22 = unmount */
+ "setuid", /* 23 = setuid */
+ "getuid", /* 24 = getuid */
+ "geteuid", /* 25 = geteuid */
+ "ptrace", /* 26 = ptrace */
+ "recvmsg", /* 27 = recvmsg */
+ "sendmsg", /* 28 = sendmsg */
+ "recvfrom", /* 29 = recvfrom */
+ "accept", /* 30 = accept */
+ "getpeername", /* 31 = getpeername */
+ "getsockname", /* 32 = getsockname */
+ "access", /* 33 = access */
+ "chflags", /* 34 = chflags */
+ "fchflags", /* 35 = fchflags */
+ "sync", /* 36 = sync */
+ "kill", /* 37 = kill */
+ "old.stat", /* 38 = old stat */
+ "getppid", /* 39 = getppid */
+ "old.lstat", /* 40 = old lstat */
+ "dup", /* 41 = dup */
+ "pipe", /* 42 = pipe */
+ "getegid", /* 43 = getegid */
+ "profil", /* 44 = profil */
+ "ktrace", /* 45 = ktrace */
+ "old.sigaction", /* 46 = old sigaction */
+ "getgid", /* 47 = getgid */
+ "old.sigprocmask", /* 48 = old sigprocmask */
+ "getlogin", /* 49 = getlogin */
+ "setlogin", /* 50 = setlogin */
+ "acct", /* 51 = acct */
+ "old.sigpending", /* 52 = old sigpending */
+ "sigaltstack", /* 53 = sigaltstack */
+ "ioctl", /* 54 = ioctl */
+ "reboot", /* 55 = reboot */
+ "revoke", /* 56 = revoke */
+ "symlink", /* 57 = symlink */
+ "readlink", /* 58 = readlink */
+ "execve", /* 59 = execve */
+ "umask", /* 60 = umask */
+ "chroot", /* 61 = chroot */
+ "old.fstat", /* 62 = old fstat */
+ "old.getkerninfo", /* 63 = old getkerninfo */
+ "old.getpagesize", /* 64 = old getpagesize */
+ "msync", /* 65 = msync */
+ "vfork", /* 66 = vfork */
+ "obs_vread", /* 67 = obsolete vread */
+ "obs_vwrite", /* 68 = obsolete vwrite */
+ "sbrk", /* 69 = sbrk */
+ "sstk", /* 70 = sstk */
+ "old.mmap", /* 71 = old mmap */
+ "vadvise", /* 72 = vadvise */
+ "munmap", /* 73 = munmap */
+ "mprotect", /* 74 = mprotect */
+ "madvise", /* 75 = madvise */
+ "obs_vhangup", /* 76 = obsolete vhangup */
+ "obs_vlimit", /* 77 = obsolete vlimit */
+ "mincore", /* 78 = mincore */
+ "getgroups", /* 79 = getgroups */
+ "setgroups", /* 80 = setgroups */
+ "getpgrp", /* 81 = getpgrp */
+ "setpgid", /* 82 = setpgid */
+ "setitimer", /* 83 = setitimer */
+ "old.wait", /* 84 = old wait */
+ "swapon", /* 85 = swapon */
+ "getitimer", /* 86 = getitimer */
+ "old.gethostname", /* 87 = old gethostname */
+ "old.sethostname", /* 88 = old sethostname */
+ "getdtablesize", /* 89 = getdtablesize */
+ "dup2", /* 90 = dup2 */
+ "#91", /* 91 = getdopt */
+ "fcntl", /* 92 = fcntl */
+ "select", /* 93 = select */
+ "#94", /* 94 = setdopt */
+ "fsync", /* 95 = fsync */
+ "setpriority", /* 96 = setpriority */
+ "socket", /* 97 = socket */
+ "connect", /* 98 = connect */
+ "old.accept", /* 99 = old accept */
+ "getpriority", /* 100 = getpriority */
+ "old.send", /* 101 = old send */
+ "old.recv", /* 102 = old recv */
+ "osigreturn", /* 103 = osigreturn */
+ "bind", /* 104 = bind */
+ "setsockopt", /* 105 = setsockopt */
+ "listen", /* 106 = listen */
+ "obs_vtimes", /* 107 = obsolete vtimes */
+ "old.sigvec", /* 108 = old sigvec */
+ "old.sigblock", /* 109 = old sigblock */
+ "old.sigsetmask", /* 110 = old sigsetmask */
+ "old.sigsuspend", /* 111 = old sigsuspend */
+ "old.sigstack", /* 112 = old sigstack */
+ "old.recvmsg", /* 113 = old recvmsg */
+ "old.sendmsg", /* 114 = old sendmsg */
+ "obs_vtrace", /* 115 = obsolete vtrace */
+ "gettimeofday", /* 116 = gettimeofday */
+ "getrusage", /* 117 = getrusage */
+ "getsockopt", /* 118 = getsockopt */
+ "#119", /* 119 = resuba */
+ "readv", /* 120 = readv */
+ "writev", /* 121 = writev */
+ "settimeofday", /* 122 = settimeofday */
+ "fchown", /* 123 = fchown */
+ "fchmod", /* 124 = fchmod */
+ "old.recvfrom", /* 125 = old recvfrom */
+ "setreuid", /* 126 = setreuid */
+ "setregid", /* 127 = setregid */
+ "rename", /* 128 = rename */
+ "old.truncate", /* 129 = old truncate */
+ "old.ftruncate", /* 130 = old ftruncate */
+ "flock", /* 131 = flock */
+ "mkfifo", /* 132 = mkfifo */
+ "sendto", /* 133 = sendto */
+ "shutdown", /* 134 = shutdown */
+ "socketpair", /* 135 = socketpair */
+ "mkdir", /* 136 = mkdir */
+ "rmdir", /* 137 = rmdir */
+ "utimes", /* 138 = utimes */
+ "obs_4.2", /* 139 = obsolete 4.2 sigreturn */
+ "adjtime", /* 140 = adjtime */
+ "old.getpeername", /* 141 = old getpeername */
+ "old.gethostid", /* 142 = old gethostid */
+ "old.sethostid", /* 143 = old sethostid */
+ "old.getrlimit", /* 144 = old getrlimit */
+ "old.setrlimit", /* 145 = old setrlimit */
+ "old.killpg", /* 146 = old killpg */
+ "setsid", /* 147 = setsid */
+ "quotactl", /* 148 = quotactl */
+ "old.quota", /* 149 = old quota */
+ "old.getsockname", /* 150 = old getsockname */
+ "#151", /* 151 = sem_lock */
+ "#152", /* 152 = sem_wakeup */
+ "#153", /* 153 = asyncdaemon */
+ "#154", /* 154 = nosys */
+ "nfssvc", /* 155 = nfssvc */
+ "old.getdirentries", /* 156 = old getdirentries */
+ "statfs", /* 157 = statfs */
+ "fstatfs", /* 158 = fstatfs */
+ "#159", /* 159 = nosys */
+ "#160", /* 160 = nosys */
+ "getfh", /* 161 = getfh */
+ "getdomainname", /* 162 = getdomainname */
+ "setdomainname", /* 163 = setdomainname */
+ "uname", /* 164 = uname */
+ "sysarch", /* 165 = sysarch */
+ "rtprio", /* 166 = rtprio */
+ "#167", /* 167 = nosys */
+ "#168", /* 168 = nosys */
+ "semsys", /* 169 = semsys */
+ "msgsys", /* 170 = msgsys */
+ "shmsys", /* 171 = shmsys */
+ "#172", /* 172 = nosys */
+ "pread", /* 173 = pread */
+ "pwrite", /* 174 = pwrite */
+ "#175", /* 175 = nosys */
+ "ntp_adjtime", /* 176 = ntp_adjtime */
+ "#177", /* 177 = sfork */
+ "#178", /* 178 = getdescriptor */
+ "#179", /* 179 = setdescriptor */
+ "#180", /* 180 = nosys */
+ "setgid", /* 181 = setgid */
+ "setegid", /* 182 = setegid */
+ "seteuid", /* 183 = seteuid */
+ "#184", /* 184 = lfs_bmapv */
+ "#185", /* 185 = lfs_markv */
+ "#186", /* 186 = lfs_segclean */
+ "#187", /* 187 = lfs_segwait */
+ "stat", /* 188 = stat */
+ "fstat", /* 189 = fstat */
+ "lstat", /* 190 = lstat */
+ "pathconf", /* 191 = pathconf */
+ "fpathconf", /* 192 = fpathconf */
+ "#193", /* 193 = nosys */
+ "getrlimit", /* 194 = getrlimit */
+ "setrlimit", /* 195 = setrlimit */
+ "getdirentries", /* 196 = getdirentries */
+ "mmap", /* 197 = mmap */
+ "__syscall", /* 198 = __syscall */
+ "lseek", /* 199 = lseek */
+ "truncate", /* 200 = truncate */
+ "ftruncate", /* 201 = ftruncate */
+ "__sysctl", /* 202 = __sysctl */
+ "mlock", /* 203 = mlock */
+ "munlock", /* 204 = munlock */
+ "undelete", /* 205 = undelete */
+ "futimes", /* 206 = futimes */
+ "getpgid", /* 207 = getpgid */
+ "#208", /* 208 = newreboot */
+ "poll", /* 209 = poll */
+ "lkmnosys", /* 210 = lkmnosys */
+ "lkmnosys", /* 211 = lkmnosys */
+ "lkmnosys", /* 212 = lkmnosys */
+ "lkmnosys", /* 213 = lkmnosys */
+ "lkmnosys", /* 214 = lkmnosys */
+ "lkmnosys", /* 215 = lkmnosys */
+ "lkmnosys", /* 216 = lkmnosys */
+ "lkmnosys", /* 217 = lkmnosys */
+ "lkmnosys", /* 218 = lkmnosys */
+ "lkmnosys", /* 219 = lkmnosys */
+ "__semctl", /* 220 = __semctl */
+ "semget", /* 221 = semget */
+ "semop", /* 222 = semop */
+ "#223", /* 223 = semconfig */
+ "msgctl", /* 224 = msgctl */
+ "msgget", /* 225 = msgget */
+ "msgsnd", /* 226 = msgsnd */
+ "msgrcv", /* 227 = msgrcv */
+ "shmat", /* 228 = shmat */
+ "shmctl", /* 229 = shmctl */
+ "shmdt", /* 230 = shmdt */
+ "shmget", /* 231 = shmget */
+ "clock_gettime", /* 232 = clock_gettime */
+ "clock_settime", /* 233 = clock_settime */
+ "clock_getres", /* 234 = clock_getres */
+ "#235", /* 235 = timer_create */
+ "#236", /* 236 = timer_delete */
+ "#237", /* 237 = timer_settime */
+ "#238", /* 238 = timer_gettime */
+ "#239", /* 239 = timer_getoverrun */
+ "nanosleep", /* 240 = nanosleep */
+ "#241", /* 241 = nosys */
+ "#242", /* 242 = nosys */
+ "#243", /* 243 = nosys */
+ "#244", /* 244 = nosys */
+ "#245", /* 245 = nosys */
+ "#246", /* 246 = nosys */
+ "#247", /* 247 = nosys */
+ "#248", /* 248 = nosys */
+ "#249", /* 249 = nosys */
+ "minherit", /* 250 = minherit */
+ "rfork", /* 251 = rfork */
+ "openbsd_poll", /* 252 = openbsd_poll */
+ "issetugid", /* 253 = issetugid */
+ "lchown", /* 254 = lchown */
+ "#255", /* 255 = nosys */
+ "#256", /* 256 = nosys */
+ "#257", /* 257 = nosys */
+ "#258", /* 258 = nosys */
+ "#259", /* 259 = nosys */
+ "#260", /* 260 = nosys */
+ "#261", /* 261 = nosys */
+ "#262", /* 262 = nosys */
+ "#263", /* 263 = nosys */
+ "#264", /* 264 = nosys */
+ "#265", /* 265 = nosys */
+ "#266", /* 266 = nosys */
+ "#267", /* 267 = nosys */
+ "#268", /* 268 = nosys */
+ "#269", /* 269 = nosys */
+ "#270", /* 270 = nosys */
+ "#271", /* 271 = nosys */
+ "getdents", /* 272 = getdents */
+ "#273", /* 273 = nosys */
+ "lchmod", /* 274 = lchmod */
+ "netbsd_lchown", /* 275 = netbsd_lchown */
+ "lutimes", /* 276 = lutimes */
+ "netbsd_msync", /* 277 = netbsd_msync */
+ "nstat", /* 278 = nstat */
+ "nfstat", /* 279 = nfstat */
+ "nlstat", /* 280 = nlstat */
+ "#281", /* 281 = nosys */
+ "#282", /* 282 = nosys */
+ "#283", /* 283 = nosys */
+ "#284", /* 284 = nosys */
+ "#285", /* 285 = nosys */
+ "#286", /* 286 = nosys */
+ "#287", /* 287 = nosys */
+ "#288", /* 288 = nosys */
+ "#289", /* 289 = nosys */
+ "#290", /* 290 = nosys */
+ "#291", /* 291 = nosys */
+ "#292", /* 292 = nosys */
+ "#293", /* 293 = nosys */
+ "#294", /* 294 = nosys */
+ "#295", /* 295 = nosys */
+ "#296", /* 296 = nosys */
+ "fhstatfs", /* 297 = fhstatfs */
+ "fhopen", /* 298 = fhopen */
+ "fhstat", /* 299 = fhstat */
+ "modnext", /* 300 = modnext */
+ "modstat", /* 301 = modstat */
+ "modfnext", /* 302 = modfnext */
+ "modfind", /* 303 = modfind */
+ "kldload", /* 304 = kldload */
+ "kldunload", /* 305 = kldunload */
+ "kldfind", /* 306 = kldfind */
+ "kldnext", /* 307 = kldnext */
+ "kldstat", /* 308 = kldstat */
+ "kldfirstmod", /* 309 = kldfirstmod */
+ "getsid", /* 310 = getsid */
+ "setresuid", /* 311 = setresuid */
+ "setresgid", /* 312 = setresgid */
+ "obs_signanosleep", /* 313 = obsolete signanosleep */
+ "aio_return", /* 314 = aio_return */
+ "aio_suspend", /* 315 = aio_suspend */
+ "aio_cancel", /* 316 = aio_cancel */
+ "aio_error", /* 317 = aio_error */
+ "aio_read", /* 318 = aio_read */
+ "aio_write", /* 319 = aio_write */
+ "lio_listio", /* 320 = lio_listio */
+ "yield", /* 321 = yield */
+ "obs_thr_sleep", /* 322 = obsolete thr_sleep */
+ "obs_thr_wakeup", /* 323 = obsolete thr_wakeup */
+ "mlockall", /* 324 = mlockall */
+ "munlockall", /* 325 = munlockall */
+ "__getcwd", /* 326 = __getcwd */
+ "sched_setparam", /* 327 = sched_setparam */
+ "sched_getparam", /* 328 = sched_getparam */
+ "sched_setscheduler", /* 329 = sched_setscheduler */
+ "sched_getscheduler", /* 330 = sched_getscheduler */
+ "sched_yield", /* 331 = sched_yield */
+ "sched_get_priority_max", /* 332 = sched_get_priority_max */
+ "sched_get_priority_min", /* 333 = sched_get_priority_min */
+ "sched_rr_get_interval", /* 334 = sched_rr_get_interval */
+ "utrace", /* 335 = utrace */
+ "sendfile", /* 336 = sendfile */
+ "kldsym", /* 337 = kldsym */
+ "jail", /* 338 = jail */
+ "#339", /* 339 = pioctl */
+ "sigprocmask", /* 340 = sigprocmask */
+ "sigsuspend", /* 341 = sigsuspend */
+ "sigaction", /* 342 = sigaction */
+ "sigpending", /* 343 = sigpending */
+ "sigreturn", /* 344 = sigreturn */
+ "#345", /* 345 = sigtimedwait */
+ "#346", /* 346 = sigwaitinfo */
+ "__acl_get_file", /* 347 = __acl_get_file */
+ "__acl_set_file", /* 348 = __acl_set_file */
+ "__acl_get_fd", /* 349 = __acl_get_fd */
+ "__acl_set_fd", /* 350 = __acl_set_fd */
+ "__acl_delete_file", /* 351 = __acl_delete_file */
+ "__acl_delete_fd", /* 352 = __acl_delete_fd */
+ "__acl_aclcheck_file", /* 353 = __acl_aclcheck_file */
+ "__acl_aclcheck_fd", /* 354 = __acl_aclcheck_fd */
+ "extattrctl", /* 355 = extattrctl */
+ "extattr_set_file", /* 356 = extattr_set_file */
+ "extattr_get_file", /* 357 = extattr_get_file */
+ "extattr_delete_file", /* 358 = extattr_delete_file */
+ "aio_waitcomplete", /* 359 = aio_waitcomplete */
+ "getresuid", /* 360 = getresuid */
+ "getresgid", /* 361 = getresgid */
+ "kqueue", /* 362 = kqueue */
+ "kevent", /* 363 = kevent */
+ "#364", /* 364 = __cap_get_proc */
+ "#365", /* 365 = __cap_set_proc */
+ "#366", /* 366 = __cap_get_fd */
+ "#367", /* 367 = __cap_get_file */
+ "#368", /* 368 = __cap_set_fd */
+ "#369", /* 369 = __cap_set_file */
+ "lkmressys", /* 370 = lkmressys */
+ "extattr_set_fd", /* 371 = extattr_set_fd */
+ "extattr_get_fd", /* 372 = extattr_get_fd */
+ "extattr_delete_fd", /* 373 = extattr_delete_fd */
+ "__setugid", /* 374 = __setugid */
+ "nfsclnt", /* 375 = nfsclnt */
+ "eaccess", /* 376 = eaccess */
+ "#377", /* 377 = afs_syscall */
+ "nmount", /* 378 = nmount */
+ "kse_exit", /* 379 = kse_exit */
+ "kse_wakeup", /* 380 = kse_wakeup */
+ "kse_new", /* 381 = kse_new */
+ "thread_wakeup", /* 382 = thread_wakeup */
+ "kse_yield", /* 383 = kse_yield */
+ "#384", /* 384 = __mac_get_proc */
+ "#385", /* 385 = __mac_set_proc */
+ "#386", /* 386 = __mac_get_fd */
+ "#387", /* 387 = __mac_get_file */
+ "#388", /* 388 = __mac_set_fd */
+ "#389", /* 389 = __mac_set_file */
+ "kenv", /* 390 = kenv */
+ "lchflags", /* 391 = lchflags */
+ "uuidgen", /* 392 = uuidgen */
+};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
new file mode 100644
index 0000000..d8115fb
--- /dev/null
+++ b/sys/kern/syscalls.master
@@ -0,0 +1,565 @@
+ $FreeBSD$
+; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
+;
+; System call name/number master file.
+; Processed to create init_sysent.c, syscalls.c and syscall.h.
+
+; Columns: number [M]type nargs namespc name alt{name,tag,rtyp}/comments
+; number system call number, must be in order
+; type one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA,
+; [M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL,
+; [M]NOSTD
+; namespc one of POSIX, BSD, NOHIDE
+; name pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
+; types:
+; [M] e.g. like MSTD -- means the system call is MP-safe. If no
+; M prefix is used, the syscall wrapper will obtain the Giant
+; lock for the syscall.
+; STD always included
+; COMPAT included on COMPAT #ifdef
+; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+; NOSTD implemented but as an lkm that can be statically
+; compiled in; the sysent entry will be filled with lkmsys
+; so that the SYSCALL_MODULE macro works
+
+; #ifdef's, etc. may be included, and are copied to the output files.
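+;
+; For illustration only (not part of the original file), a hypothetical
+; entry such as:
+;	401	MSTD	BSD	{ int example_call(int fd, size_t len); }
+; would declare MP-safe syscall number 401: it yields an "example_call"
+; slot in syscallnames[] (syscalls.c), a SYS_example_call constant in
+; syscall.h, and a sysent[] entry in init_sysent.c; because of the M
+; prefix the wrapper does not take Giant around the call.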
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+; Reserved/unimplemented system calls in the range 0-150 inclusive
+; are reserved for use in future Berkeley releases.
+; Additional system calls implemented in vendor and other
+; redistributions should be placed in the reserved range at the end
+; of the current calls.
+
+0 STD NOHIDE { int nosys(void); } syscall nosys_args int
+1 MSTD NOHIDE { void sys_exit(int rval); } exit sys_exit_args void
+2 MSTD POSIX { int fork(void); }
+3 MSTD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); }
+4 MSTD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); }
+5 STD POSIX { int open(char *path, int flags, int mode); }
+; XXX should be { int open(const char *path, int flags, ...); }
+; but we're not ready for `const' or varargs.
+; XXX man page says `mode_t mode'.
+6 MSTD POSIX { int close(int fd); }
+7 MSTD BSD { int wait4(int pid, int *status, int options, \
+ struct rusage *rusage); } wait4 wait_args int
+8 COMPAT BSD { int creat(char *path, int mode); }
+9 STD POSIX { int link(char *path, char *link); }
+10 STD POSIX { int unlink(char *path); }
+11 OBSOL NOHIDE execv
+12 STD POSIX { int chdir(char *path); }
+13 STD BSD { int fchdir(int fd); }
+14 STD POSIX { int mknod(char *path, int mode, int dev); }
+15 STD POSIX { int chmod(char *path, int mode); }
+16 STD POSIX { int chown(char *path, int uid, int gid); }
+17 MSTD BSD { int obreak(char *nsize); } break obreak_args int
+18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \
+ int flags); }
+19 COMPAT POSIX { long lseek(int fd, long offset, int whence); }
+20 MSTD POSIX { pid_t getpid(void); }
+21 STD BSD { int mount(char *type, char *path, int flags, \
+ caddr_t data); }
+; XXX `path' should have type `const char *' but we're not ready for that.
+22 STD BSD { int unmount(char *path, int flags); }
+23 MSTD POSIX { int setuid(uid_t uid); }
+24 MSTD POSIX { uid_t getuid(void); }
+25 MSTD POSIX { uid_t geteuid(void); }
+26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \
+ int data); }
+27 MSTD BSD { int recvmsg(int s, struct msghdr *msg, int flags); }
+28 MSTD BSD { int sendmsg(int s, caddr_t msg, int flags); }
+29 MSTD BSD { int recvfrom(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t from, int *fromlenaddr); }
+30 MSTD BSD { int accept(int s, caddr_t name, int *anamelen); }
+31 MSTD BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+32 MSTD BSD { int getsockname(int fdes, caddr_t asa, int *alen); }
+33 STD POSIX { int access(char *path, int flags); }
+34 STD BSD { int chflags(char *path, int flags); }
+35 STD BSD { int fchflags(int fd, int flags); }
+36 STD BSD { int sync(void); }
+37 MSTD POSIX { int kill(int pid, int signum); }
+38 COMPAT POSIX { int stat(char *path, struct ostat *ub); }
+39 MSTD POSIX { pid_t getppid(void); }
+40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); }
+41 STD POSIX { int dup(u_int fd); }
+42 STD POSIX { int pipe(void); }
+43 MSTD POSIX { gid_t getegid(void); }
+44 MSTD BSD { int profil(caddr_t samples, size_t size, \
+ size_t offset, u_int scale); }
+45 STD BSD { int ktrace(const char *fname, int ops, int facs, \
+ int pid); }
+46 MCOMPAT POSIX { int sigaction(int signum, struct osigaction *nsa, \
+ struct osigaction *osa); }
+47 MSTD POSIX { gid_t getgid(void); }
+48 MCOMPAT POSIX { int sigprocmask(int how, osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it, and we return the old mask as the
+; (int) return value.
+49 MSTD BSD { int getlogin(char *namebuf, u_int namelen); }
+50 MSTD BSD { int setlogin(char *namebuf); }
+51 MSTD BSD { int acct(char *path); }
+52 MCOMPAT POSIX { int sigpending(void); }
+53 MSTD BSD { int sigaltstack(stack_t *ss, stack_t *oss); }
+54 MSTD POSIX { int ioctl(int fd, u_long com, caddr_t data); }
+55 MSTD BSD { int reboot(int opt); }
+56 STD POSIX { int revoke(char *path); }
+57 STD POSIX { int symlink(char *path, char *link); }
+58 STD POSIX { int readlink(char *path, char *buf, int count); }
+59 MSTD POSIX { int execve(char *fname, char **argv, char **envv); }
+60 MSTD POSIX { int umask(int newmask); } umask umask_args int
+61 STD BSD { int chroot(char *path); }
+62 MCOMPAT POSIX { int fstat(int fd, struct ostat *sb); }
+63 MCOMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \
+ int arg); } getkerninfo getkerninfo_args int
+64 MCOMPAT BSD { int getpagesize(void); } \
+ getpagesize getpagesize_args int
+65 STD BSD { int msync(void *addr, size_t len, int flags); }
+66 MSTD BSD { int vfork(void); }
+67 OBSOL NOHIDE vread
+68 OBSOL NOHIDE vwrite
+69 MSTD BSD { int sbrk(int incr); }
+70 MSTD BSD { int sstk(int incr); }
+71 MCOMPAT BSD { int mmap(void *addr, int len, int prot, \
+ int flags, int fd, long pos); }
+72 MSTD BSD { int ovadvise(int anom); } vadvise ovadvise_args int
+73 MSTD BSD { int munmap(void *addr, size_t len); }
+74 MSTD BSD { int mprotect(const void *addr, size_t len, int prot); }
+75 MSTD BSD { int madvise(void *addr, size_t len, int behav); }
+76 OBSOL NOHIDE vhangup
+77 OBSOL NOHIDE vlimit
+78 MSTD BSD { int mincore(const void *addr, size_t len, \
+ char *vec); }
+79 MSTD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); }
+80 MSTD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); }
+81 MSTD POSIX { int getpgrp(void); }
+82 MSTD POSIX { int setpgid(int pid, int pgid); }
+83 MSTD BSD { int setitimer(u_int which, struct itimerval *itv, \
+ struct itimerval *oitv); }
+84 MCOMPAT BSD { int wait(void); }
+85 MSTD BSD { int swapon(char *name); }
+86 MSTD BSD { int getitimer(u_int which, struct itimerval *itv); }
+87 MCOMPAT BSD { int gethostname(char *hostname, u_int len); } \
+ gethostname gethostname_args int
+88 MCOMPAT BSD { int sethostname(char *hostname, u_int len); } \
+ sethostname sethostname_args int
+89 MSTD BSD { int getdtablesize(void); }
+90 MSTD POSIX { int dup2(u_int from, u_int to); }
+91 UNIMPL BSD getdopt
+92 MSTD POSIX { int fcntl(int fd, int cmd, long arg); }
+; XXX should be { int fcntl(int fd, int cmd, ...); }
+; but we're not ready for varargs.
+; XXX man page says `int arg' too.
+93 MSTD BSD { int select(int nd, fd_set *in, fd_set *ou, \
+ fd_set *ex, struct timeval *tv); }
+94 UNIMPL BSD setdopt
+95 STD POSIX { int fsync(int fd); }
+96 MSTD BSD { int setpriority(int which, int who, int prio); }
+97 MSTD BSD { int socket(int domain, int type, int protocol); }
+98 MSTD BSD { int connect(int s, caddr_t name, int namelen); }
+99 MCPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \
+ accept accept_args int
+100 MSTD BSD { int getpriority(int which, int who); }
+101 MCOMPAT BSD { int send(int s, caddr_t buf, int len, int flags); }
+102 MCOMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); }
+103 MSTD BSD { int osigreturn(struct osigcontext *sigcntxp); }
+104 MSTD BSD { int bind(int s, caddr_t name, int namelen); }
+105 MSTD BSD { int setsockopt(int s, int level, int name, \
+ caddr_t val, int valsize); }
+106 MSTD BSD { int listen(int s, int backlog); }
+107 OBSOL NOHIDE vtimes
+108 MCOMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \
+ struct sigvec *osv); }
+109 MCOMPAT BSD { int sigblock(int mask); }
+110 MCOMPAT BSD { int sigsetmask(int mask); }
+111 MCOMPAT POSIX { int sigsuspend(osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it.
+112 MCOMPAT BSD { int sigstack(struct sigstack *nss, \
+ struct sigstack *oss); }
+113 MCOMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); }
+114 MCOMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); }
+115 OBSOL NOHIDE vtrace
+116 MSTD BSD { int gettimeofday(struct timeval *tp, \
+ struct timezone *tzp); }
+117 MSTD BSD { int getrusage(int who, struct rusage *rusage); }
+118 MSTD BSD { int getsockopt(int s, int level, int name, \
+ caddr_t val, int *avalsize); }
+119 UNIMPL NOHIDE resuba (BSD/OS 2.x)
+120 MSTD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); }
+121 MSTD BSD { int writev(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+122 MSTD BSD { int settimeofday(struct timeval *tv, \
+ struct timezone *tzp); }
+123 STD BSD { int fchown(int fd, int uid, int gid); }
+124 STD BSD { int fchmod(int fd, int mode); }
+125 MCPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t from, int *fromlenaddr); } \
+ recvfrom recvfrom_args int
+126 MSTD BSD { int setreuid(int ruid, int euid); }
+127 MSTD BSD { int setregid(int rgid, int egid); }
+128 STD POSIX { int rename(char *from, char *to); }
+129 COMPAT BSD { int truncate(char *path, long length); }
+130 COMPAT BSD { int ftruncate(int fd, long length); }
+131 MSTD BSD { int flock(int fd, int how); }
+132 STD POSIX { int mkfifo(char *path, int mode); }
+133 MSTD BSD { int sendto(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t to, int tolen); }
+134 MSTD BSD { int shutdown(int s, int how); }
+135 MSTD BSD { int socketpair(int domain, int type, int protocol, \
+ int *rsv); }
+136 STD POSIX { int mkdir(char *path, int mode); }
+137 STD POSIX { int rmdir(char *path); }
+138 STD BSD { int utimes(char *path, struct timeval *tptr); }
+139 OBSOL NOHIDE 4.2 sigreturn
+140 MSTD BSD { int adjtime(struct timeval *delta, \
+ struct timeval *olddelta); }
+141 MCOMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+142 MCOMPAT BSD { long gethostid(void); }
+143 MCOMPAT BSD { int sethostid(long hostid); }
+144 MCOMPAT BSD { int getrlimit(u_int which, struct orlimit *rlp); }
+145 MCOMPAT BSD { int setrlimit(u_int which, struct orlimit *rlp); }
+146 MCOMPAT BSD { int killpg(int pgid, int signum); }
+147 MSTD POSIX { int setsid(void); }
+148 STD BSD { int quotactl(char *path, int cmd, int uid, \
+ caddr_t arg); }
+149 MCOMPAT BSD { int quota(void); }
+150 MCPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\
+ getsockname getsockname_args int
+
+; Syscalls 151-180 inclusive are reserved for vendor-specific
+; system calls. (This includes various calls added for compatibility
+; with other Unix variants.)
+; Some of these calls are now supported by BSD...
+151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x)
+152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x)
+153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x)
+154 UNIMPL NOHIDE nosys
+; 155 is initialized by the NFS code, if present.
+155 MNOIMPL BSD { int nfssvc(int flag, caddr_t argp); }
+156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \
+ long *basep); }
+157 STD BSD { int statfs(char *path, struct statfs *buf); }
+158 STD BSD { int fstatfs(int fd, struct statfs *buf); }
+159 UNIMPL NOHIDE nosys
+160 UNIMPL NOHIDE nosys
+161 STD BSD { int getfh(char *fname, struct fhandle *fhp); }
+162 MSTD BSD { int getdomainname(char *domainname, int len); }
+163 MSTD BSD { int setdomainname(char *domainname, int len); }
+164 MSTD BSD { int uname(struct utsname *name); }
+165 STD BSD { int sysarch(int op, char *parms); }
+166 MSTD BSD { int rtprio(int function, pid_t pid, \
+ struct rtprio *rtp); }
+167 UNIMPL NOHIDE nosys
+168 UNIMPL NOHIDE nosys
+; 169 is initialized by the SYSVSEM code if present or loaded
+169 MNOSTD BSD { int semsys(int which, int a2, int a3, int a4, \
+ int a5); }
+; 170 is initialized by the SYSVMSG code if present or loaded
+; XXX should be { int semsys(int which, ...); }
+170 MNOSTD BSD { int msgsys(int which, int a2, int a3, int a4, \
+ int a5, int a6); }
+; 171 is initialized by the SYSVSHM code if present or loaded
+; XXX should be { int msgsys(int which, ...); }
+171 MNOSTD BSD { int shmsys(int which, int a2, int a3, int a4); }
+; XXX should be { int shmsys(int which, ...); }
+172 UNIMPL NOHIDE nosys
+173 MSTD POSIX { ssize_t pread(int fd, void *buf, size_t nbyte, \
+ int pad, off_t offset); }
+174 MSTD POSIX { ssize_t pwrite(int fd, const void *buf, \
+ size_t nbyte, int pad, off_t offset); }
+175 UNIMPL NOHIDE nosys
+176 MSTD BSD { int ntp_adjtime(struct timex *tp); }
+177 UNIMPL NOHIDE sfork (BSD/OS 2.x)
+178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x)
+179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x)
+180 UNIMPL NOHIDE nosys
+
+; Syscalls 181-199 are used by/reserved for BSD
+181 MSTD POSIX { int setgid(gid_t gid); }
+182 MSTD BSD { int setegid(gid_t egid); }
+183 MSTD BSD { int seteuid(uid_t euid); }
+184 UNIMPL BSD lfs_bmapv
+185 UNIMPL BSD lfs_markv
+186 UNIMPL BSD lfs_segclean
+187 UNIMPL BSD lfs_segwait
+188 STD POSIX { int stat(char *path, struct stat *ub); }
+189 MSTD POSIX { int fstat(int fd, struct stat *sb); }
+190 STD POSIX { int lstat(char *path, struct stat *ub); }
+191 STD POSIX { int pathconf(char *path, int name); }
+192 MSTD POSIX { int fpathconf(int fd, int name); }
+193 UNIMPL NOHIDE nosys
+194 MSTD BSD { int getrlimit(u_int which, \
+ struct rlimit *rlp); } \
+ getrlimit __getrlimit_args int
+195 MSTD BSD { int setrlimit(u_int which, \
+ struct rlimit *rlp); } \
+ setrlimit __setrlimit_args int
+196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \
+ long *basep); }
+197 MSTD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \
+ int flags, int fd, int pad, off_t pos); }
+198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int
+199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \
+ int whence); }
+200 STD BSD { int truncate(char *path, int pad, off_t length); }
+201 STD BSD { int ftruncate(int fd, int pad, off_t length); }
+202 MSTD BSD { int __sysctl(int *name, u_int namelen, void *old, \
+ size_t *oldlenp, void *new, size_t newlen); } \
+ __sysctl sysctl_args int
+; properly, __sysctl should be a NOHIDE, but making an exception
+; here allows us to avoid one in libc/sys/Makefile.inc.
+203 MSTD BSD { int mlock(const void *addr, size_t len); }
+204 MSTD BSD { int munlock(const void *addr, size_t len); }
+205 STD BSD { int undelete(char *path); }
+206 STD BSD { int futimes(int fd, struct timeval *tptr); }
+207 MSTD BSD { int getpgid(pid_t pid); }
+208 UNIMPL NOHIDE newreboot (NetBSD)
+209 MSTD BSD { int poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+
+;
+; The following are reserved for loadable syscalls
+;
+210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+
+;
+; The following were introduced with NetBSD/4.4Lite-2
+; They are initialized by their respective modules/sysinits
+220 MNOSTD BSD { int __semctl(int semid, int semnum, int cmd, \
+ union semun *arg); }
+221 MNOSTD BSD { int semget(key_t key, int nsems, int semflg); }
+222 MNOSTD BSD { int semop(int semid, struct sembuf *sops, \
+ u_int nsops); }
+223 UNIMPL NOHIDE semconfig
+224 MNOSTD BSD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds *buf); }
+225 MNOSTD BSD { int msgget(key_t key, int msgflg); }
+226 MNOSTD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \
+ int msgflg); }
+227 MNOSTD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \
+ long msgtyp, int msgflg); }
+228 MNOSTD BSD { int shmat(int shmid, void *shmaddr, int shmflg); }
+229 MNOSTD BSD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds *buf); }
+230 MNOSTD BSD { int shmdt(void *shmaddr); }
+231 MNOSTD BSD { int shmget(key_t key, int size, int shmflg); }
+;
+232 MSTD POSIX { int clock_gettime(clockid_t clock_id, \
+ struct timespec *tp); }
+233 MSTD POSIX { int clock_settime(clockid_t clock_id, \
+ const struct timespec *tp); }
+234 MSTD POSIX { int clock_getres(clockid_t clock_id, \
+ struct timespec *tp); }
+235 UNIMPL NOHIDE timer_create
+236 UNIMPL NOHIDE timer_delete
+237 UNIMPL NOHIDE timer_settime
+238 UNIMPL NOHIDE timer_gettime
+239 UNIMPL NOHIDE timer_getoverrun
+240 MSTD POSIX { int nanosleep(const struct timespec *rqtp, \
+ struct timespec *rmtp); }
+241 UNIMPL NOHIDE nosys
+242 UNIMPL NOHIDE nosys
+243 UNIMPL NOHIDE nosys
+244 UNIMPL NOHIDE nosys
+245 UNIMPL NOHIDE nosys
+246 UNIMPL NOHIDE nosys
+247 UNIMPL NOHIDE nosys
+248 UNIMPL NOHIDE nosys
+249 UNIMPL NOHIDE nosys
+; syscall numbers initially used in OpenBSD
+250 MSTD BSD { int minherit(void *addr, size_t len, int inherit); }
+251 MSTD BSD { int rfork(int flags); }
+252 MSTD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+253 STD BSD { int issetugid(void); }
+254 STD BSD { int lchown(char *path, int uid, int gid); }
+255 UNIMPL NOHIDE nosys
+256 UNIMPL NOHIDE nosys
+257 UNIMPL NOHIDE nosys
+258 UNIMPL NOHIDE nosys
+259 UNIMPL NOHIDE nosys
+260 UNIMPL NOHIDE nosys
+261 UNIMPL NOHIDE nosys
+262 UNIMPL NOHIDE nosys
+263 UNIMPL NOHIDE nosys
+264 UNIMPL NOHIDE nosys
+265 UNIMPL NOHIDE nosys
+266 UNIMPL NOHIDE nosys
+267 UNIMPL NOHIDE nosys
+268 UNIMPL NOHIDE nosys
+269 UNIMPL NOHIDE nosys
+270 UNIMPL NOHIDE nosys
+271 UNIMPL NOHIDE nosys
+272 STD BSD { int getdents(int fd, char *buf, size_t count); }
+273 UNIMPL NOHIDE nosys
+274 STD BSD { int lchmod(char *path, mode_t mode); }
+275 NOPROTO BSD { int lchown(char *path, uid_t uid, gid_t gid); } netbsd_lchown lchown_args int
+276 STD BSD { int lutimes(char *path, struct timeval *tptr); }
+277 MNOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync msync_args int
+278 STD BSD { int nstat(char *path, struct nstat *ub); }
+279 MSTD BSD { int nfstat(int fd, struct nstat *sb); }
+280 STD BSD { int nlstat(char *path, struct nstat *ub); }
+281 UNIMPL NOHIDE nosys
+282 UNIMPL NOHIDE nosys
+283 UNIMPL NOHIDE nosys
+284 UNIMPL NOHIDE nosys
+285 UNIMPL NOHIDE nosys
+286 UNIMPL NOHIDE nosys
+287 UNIMPL NOHIDE nosys
+288 UNIMPL NOHIDE nosys
+289 UNIMPL NOHIDE nosys
+290 UNIMPL NOHIDE nosys
+291 UNIMPL NOHIDE nosys
+292 UNIMPL NOHIDE nosys
+293 UNIMPL NOHIDE nosys
+294 UNIMPL NOHIDE nosys
+295 UNIMPL NOHIDE nosys
+296 UNIMPL NOHIDE nosys
+; XXX 297 is 300 in NetBSD
+297 STD BSD { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); }
+298 STD BSD { int fhopen(const struct fhandle *u_fhp, int flags); }
+299 STD BSD { int fhstat(const struct fhandle *u_fhp, struct stat *sb); }
+; syscall numbers for FreeBSD
+300 MSTD BSD { int modnext(int modid); }
+301 MSTD BSD { int modstat(int modid, struct module_stat* stat); }
+302 MSTD BSD { int modfnext(int modid); }
+303 MSTD BSD { int modfind(const char *name); }
+304 MSTD BSD { int kldload(const char *file); }
+305 MSTD BSD { int kldunload(int fileid); }
+306 MSTD BSD { int kldfind(const char *file); }
+307 MSTD BSD { int kldnext(int fileid); }
+308 MSTD BSD { int kldstat(int fileid, struct kld_file_stat* stat); }
+309 MSTD BSD { int kldfirstmod(int fileid); }
+310 MSTD BSD { int getsid(pid_t pid); }
+311 MSTD BSD { int setresuid(uid_t ruid, uid_t euid, uid_t suid); }
+312 MSTD BSD { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); }
+313 OBSOL NOHIDE signanosleep
+314 NOSTD BSD { int aio_return(struct aiocb *aiocbp); }
+315 NOSTD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); }
+316 NOSTD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); }
+317 NOSTD BSD { int aio_error(struct aiocb *aiocbp); }
+318 NOSTD BSD { int aio_read(struct aiocb *aiocbp); }
+319 NOSTD BSD { int aio_write(struct aiocb *aiocbp); }
+320 NOSTD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); }
+321 MSTD BSD { int yield(void); }
+322 OBSOL NOHIDE thr_sleep
+323 OBSOL NOHIDE thr_wakeup
+324 MSTD BSD { int mlockall(int how); }
+325 MSTD BSD { int munlockall(void); }
+326 STD BSD { int __getcwd(u_char *buf, u_int buflen); }
+
+327 MSTD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); }
+328 MSTD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); }
+
+329 MSTD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); }
+330 MSTD POSIX { int sched_getscheduler (pid_t pid); }
+
+331 MSTD POSIX { int sched_yield (void); }
+332 MSTD POSIX { int sched_get_priority_max (int policy); }
+333 MSTD POSIX { int sched_get_priority_min (int policy); }
+334 MSTD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); }
+335 STD BSD { int utrace(const void *addr, size_t len); }
+336 MSTD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \
+ struct sf_hdtr *hdtr, off_t *sbytes, int flags); }
+337 STD BSD { int kldsym(int fileid, int cmd, void *data); }
+338 MSTD BSD { int jail(struct jail *jail); }
+339 UNIMPL BSD pioctl
+340 MSTD POSIX { int sigprocmask(int how, const sigset_t *set, \
+ sigset_t *oset); }
+341 MSTD POSIX { int sigsuspend(const sigset_t *sigmask); }
+342 MSTD POSIX { int sigaction(int sig, const struct sigaction *act, \
+ struct sigaction *oact); }
+343 MSTD POSIX { int sigpending(sigset_t *set); }
+344 MSTD BSD { int sigreturn(const struct __ucontext *sigcntxp); }
+345 UNIMPL NOHIDE sigtimedwait
+346 UNIMPL NOHIDE sigwaitinfo
+347 MSTD BSD { int __acl_get_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+348 MSTD BSD { int __acl_set_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+349 MSTD BSD { int __acl_get_fd(int filedes, acl_type_t type, \
+ struct acl *aclp); }
+350 MSTD BSD { int __acl_set_fd(int filedes, acl_type_t type, \
+ struct acl *aclp); }
+351 MSTD BSD { int __acl_delete_file(const char *path, \
+ acl_type_t type); }
+352 MSTD BSD { int __acl_delete_fd(int filedes, acl_type_t type); }
+353 MSTD BSD { int __acl_aclcheck_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+354 MSTD BSD { int __acl_aclcheck_fd(int filedes, acl_type_t type, \
+ struct acl *aclp); }
+355 STD BSD { int extattrctl(const char *path, int cmd, \
+ const char *filename, int attrnamespace, \
+ const char *attrname); }
+356 STD BSD { int extattr_set_file(const char *path, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+357 STD BSD { ssize_t extattr_get_file(const char *path, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+358 STD BSD { int extattr_delete_file(const char *path, \
+ int attrnamespace, const char *attrname); }
+359 NOSTD BSD { int aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); }
+360 MSTD BSD { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); }
+361 MSTD BSD { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); }
+362 MSTD BSD { int kqueue(void); }
+363 MSTD BSD { int kevent(int fd, \
+ const struct kevent *changelist, int nchanges, \
+ struct kevent *eventlist, int nevents, \
+ const struct timespec *timeout); }
+364 UNIMPL BSD __cap_get_proc
+365 UNIMPL BSD __cap_set_proc
+366 UNIMPL BSD __cap_get_fd
+367 UNIMPL BSD __cap_get_file
+368 UNIMPL BSD __cap_set_fd
+369 UNIMPL BSD __cap_set_file
+370 NODEF NOHIDE lkmressys lkmressys nosys_args int
+371 STD BSD { int extattr_set_fd(int fd, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+372 STD BSD { ssize_t extattr_get_fd(int fd, int attrnamespace, \
+ const char *attrname, void *data, size_t nbytes); }
+373 STD BSD { int extattr_delete_fd(int fd, int attrnamespace, \
+ const char *attrname); }
+374 MSTD BSD { int __setugid(int flag); }
+375 NOIMPL BSD { int nfsclnt(int flag, caddr_t argp); }
+376 STD BSD { int eaccess(char *path, int flags); }
+377 UNIMPL BSD afs_syscall
+378 STD BSD { int nmount(struct iovec *iovp, unsigned int iovcnt, \
+ int flags); }
+379 STD BSD { int kse_exit(void); }
+380 STD BSD { int kse_wakeup(void); }
+381 STD BSD { int kse_new(struct kse_mailbox * mbx, \
+ int new_grp_flag); }
+382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); }
+383 STD BSD { int kse_yield(void); }
+384 UNIMPL BSD __mac_get_proc
+385 UNIMPL BSD __mac_set_proc
+386 UNIMPL BSD __mac_get_fd
+387 UNIMPL BSD __mac_get_file
+388 UNIMPL BSD __mac_set_fd
+389 UNIMPL BSD __mac_set_file
+390 STD BSD { int kenv(int what, const char *name, char *value, \
+ int len); }
+391 STD BSD { int lchflags(const char *path, int flags); }
+392 STD BSD { int uuidgen(struct uuid *store, int count); }
diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c
new file mode 100644
index 0000000..fc5fd8f
--- /dev/null
+++ b/sys/kern/sysv_ipc.c
@@ -0,0 +1,97 @@
+/* $FreeBSD$ */
+/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
+
+/*
+ * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Herb Peyerl.
+ * 4. The name of Herb Peyerl may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <sys/proc.h>
+#include <sys/ucred.h>
+
+void (*shmfork_hook)(struct proc *, struct proc *) = NULL;
+void (*shmexit_hook)(struct proc *) = NULL;
+
+/* called from kern_fork.c */
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+
+ if (shmfork_hook != NULL)
+ shmfork_hook(p1, p2);
+ return;
+}
+
+/* called from kern_exit.c */
+void
+shmexit(p)
+ struct proc *p;
+{
+
+ if (shmexit_hook != NULL)
+ shmexit_hook(p);
+ return;
+}
+
+/*
+ * Check for ipc permission
+ */
+
+int
+ipcperm(td, perm, mode)
+ struct thread *td;
+ struct ipc_perm *perm;
+ int mode;
+{
+ struct ucred *cred = td->td_ucred;
+
+ /* Check for user match. */
+ if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) {
+ if (mode & IPC_M)
+ return (suser(td) == 0 ? 0 : EPERM);
+ /* Check for group match. */
+ mode >>= 3;
+ if (!groupmember(perm->gid, cred) &&
+ !groupmember(perm->cgid, cred))
+ /* Check for `other' match. */
+ mode >>= 3;
+ }
+
+ if (mode & IPC_M)
+ return (0);
+ return ((mode & perm->mode) == mode ||
+ suser(td) == 0 ? 0 : EACCES);
+}
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
new file mode 100644
index 0000000..4dd2249
--- /dev/null
+++ b/sys/kern/sysv_msg.c
@@ -0,0 +1,1240 @@
+/* $FreeBSD$ */
+
+/*
+ * Implementation of SVID messages
+ *
+ * Author: Daniel Boulet
+ *
+ * Copyright 1993 Daniel Boulet and RTMX Inc.
+ *
+ * This system call was implemented by Daniel Boulet under contract from RTMX.
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/msg.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
+
+static void msginit(void);
+static int msgunload(void);
+static int sysvmsg_modload(struct module *, int, void *);
+
+#define MSG_DEBUG
+#undef MSG_DEBUG_OK
+
+static void msg_freehdr(struct msg *msghdr);
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *msgcalls[] = {
+ (sy_call_t *)msgctl, (sy_call_t *)msgget,
+ (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
+};
+
+struct msg {
+ struct msg *msg_next; /* next msg in the chain */
+ long msg_type; /* type of this message */
+ /* >0 -> type of this message */
+ /* 0 -> free header */
+ u_short msg_ts; /* size of this message */
+ short msg_spot; /* location of start of msg in buffer */
+};
+
+
+#ifndef MSGSSZ
+#define MSGSSZ 8 /* Each segment must be 2^N long */
+#endif
+#ifndef MSGSEG
+#define MSGSEG 2048 /* must be less than 32767 */
+#endif
+#define MSGMAX (MSGSSZ*MSGSEG)
+#ifndef MSGMNB
+#define MSGMNB 2048 /* max # of bytes in a queue */
+#endif
+#ifndef MSGMNI
+#define MSGMNI 40
+#endif
+#ifndef MSGTQL
+#define MSGTQL 40
+#endif
+
+/*
+ * Based on the configuration parameters described in an SVR2 (yes, two)
+ * config(1m) man page.
+ *
+ * Each message is broken up and stored in segments that are msgssz bytes
+ * long. For efficiency reasons, this should be a power of two. Also,
+ * it doesn't make sense if it is less than 8 or greater than about 256.
+ * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
+ * two between 8 and 1024 inclusive (and panics if it isn't).
+ */
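+/*
+ * For example, with the compile-time defaults above (MSGSSZ = 8,
+ * MSGSEG = 2048), MSGMAX = 8 * 2048 = 16384 bytes of message pool,
+ * while MSGMNB caps any single queue at 2048 bytes.  The kern.ipc.*
+ * tunables fetched in msginit() below can override these defaults.
+ */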
+struct msginfo msginfo = {
+ MSGMAX, /* max chars in a message */
+ MSGMNI, /* # of message queue identifiers */
+ MSGMNB, /* max chars in a queue */
+ MSGTQL, /* max messages in system */
+ MSGSSZ, /* size of a message segment */
+ /* (must be small power of 2 greater than 4) */
+ MSGSEG /* number of message segments */
+};
+
+/*
+ * macros to convert between msqid_ds's and msqid's.
+ * (specific to this implementation)
+ */
+#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
+#define MSQID_IX(id) ((id) & 0xffff)
+#define MSQID_SEQ(id) (((id) >> 16) & 0xffff)
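+/*
+ * Example (hypothetical values): index 3 in a msqid_ds whose
+ * msg_perm.seq is 5 gives MSQID(3, ds) == 0x00050003; MSQID_IX() and
+ * MSQID_SEQ() recover the 3 and the 5 again.
+ */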
+
+/*
+ * The rest of this file is specific to this particular implementation.
+ */
+
+struct msgmap {
+ short next; /* next segment in buffer */
+ /* -1 -> available */
+ /* 0..(MSGSEG-1) -> index of next segment */
+};
+
+#define MSG_LOCKED 01000 /* Is this msqid_ds locked? */
+
+static int nfree_msgmaps; /* # of free map entries */
+static short free_msgmaps; /* head of linked list of free map entries */
+static struct msg *free_msghdrs;/* list of free msg headers */
+static char *msgpool; /* MSGMAX byte long msg buffer pool */
+static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
+static struct msg *msghdrs; /* MSGTQL msg headers */
+static struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */
+
+static void
+msginit()
+{
+ register int i;
+
+ TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
+ TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
+ msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
+ TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
+
+ msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
+ if (msgpool == NULL)
+ panic("msgpool is NULL");
+ msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
+ if (msgmaps == NULL)
+ panic("msgmaps is NULL");
+ msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
+ if (msghdrs == NULL)
+ panic("msghdrs is NULL");
+ msqids = malloc(sizeof(struct msqid_ds) * msginfo.msgmni, M_MSG, M_WAITOK);
+ if (msqids == NULL)
+ panic("msqids is NULL");
+
+ /*
+ * msginfo.msgssz should be a power of two for efficiency reasons.
+ * It is also pretty silly if msginfo.msgssz is less than 8
+ * or greater than about 256 so ...
+ */
+
+ i = 8;
+ while (i < 1024 && i != msginfo.msgssz)
+ i <<= 1;
+ if (i != msginfo.msgssz) {
+ printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
+ msginfo.msgssz);
+ panic("msginfo.msgssz not a small power of 2");
+ }
+
+ if (msginfo.msgseg > 32767) {
+ printf("msginfo.msgseg=%d\n", msginfo.msgseg);
+ panic("msginfo.msgseg > 32767");
+ }
+
+ if (msgmaps == NULL)
+ panic("msgmaps is NULL");
+
+ for (i = 0; i < msginfo.msgseg; i++) {
+ if (i > 0)
+ msgmaps[i-1].next = i;
+ msgmaps[i].next = -1; /* implies entry is available */
+ }
+ free_msgmaps = 0;
+ nfree_msgmaps = msginfo.msgseg;
+
+ if (msghdrs == NULL)
+ panic("msghdrs is NULL");
+
+ for (i = 0; i < msginfo.msgtql; i++) {
+ msghdrs[i].msg_type = 0;
+ if (i > 0)
+ msghdrs[i-1].msg_next = &msghdrs[i];
+ msghdrs[i].msg_next = NULL;
+ }
+ free_msghdrs = &msghdrs[0];
+
+ if (msqids == NULL)
+ panic("msqids is NULL");
+
+ for (i = 0; i < msginfo.msgmni; i++) {
+ msqids[i].msg_qbytes = 0; /* implies entry is available */
+ msqids[i].msg_perm.seq = 0; /* reset to a known value */
+ msqids[i].msg_perm.mode = 0;
+ }
+}
+
+static int
+msgunload()
+{
+ struct msqid_ds *msqptr;
+ int msqid;
+
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Refuse to unload while any msqid_ds is still allocated or locked.
+ * msqid_ds's can be locked by msgsnd or msgrcv while they are
+ * copying a message in/out; the entry cannot be torn down until
+ * they release it.
+ */
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes != 0 ||
+ (msqptr->msg_perm.mode & MSG_LOCKED) != 0)
+ break;
+ }
+ if (msqid != msginfo.msgmni)
+ return (EBUSY);
+
+ free(msgpool, M_MSG);
+ free(msgmaps, M_MSG);
+ free(msghdrs, M_MSG);
+ free(msqids, M_MSG);
+ return (0);
+}
+
+
+static int
+sysvmsg_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ msginit();
+ break;
+ case MOD_UNLOAD:
+ error = msgunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvmsg_mod = {
+ "sysvmsg",
+ &sysvmsg_modload,
+ NULL
+};
+
+SYSCALL_MODULE_HELPER(msgsys);
+SYSCALL_MODULE_HELPER(msgctl);
+SYSCALL_MODULE_HELPER(msgget);
+SYSCALL_MODULE_HELPER(msgsnd);
+SYSCALL_MODULE_HELPER(msgrcv);
+
+DECLARE_MODULE(sysvmsg, sysvmsg_mod,
+ SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
+MODULE_VERSION(sysvmsg, 1);
+
+/*
+ * Entry point for all MSG calls
+ *
+ * MPSAFE
+ */
+int
+msgsys(td, uap)
+ struct thread *td;
+ /* XXX actually varargs. */
+ struct msgsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ int a6;
+ } */ *uap;
+{
+ int error;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = (*msgcalls[uap->which])(td, &uap->a2);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static void
+msg_freehdr(msghdr)
+ struct msg *msghdr;
+{
+ while (msghdr->msg_ts > 0) {
+ short next;
+ if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
+ panic("msghdr->msg_spot out of range");
+ next = msgmaps[msghdr->msg_spot].next;
+ msgmaps[msghdr->msg_spot].next = free_msgmaps;
+ free_msgmaps = msghdr->msg_spot;
+ nfree_msgmaps++;
+ msghdr->msg_spot = next;
+ if (msghdr->msg_ts >= msginfo.msgssz)
+ msghdr->msg_ts -= msginfo.msgssz;
+ else
+ msghdr->msg_ts = 0;
+ }
+ if (msghdr->msg_spot != -1)
+ panic("msghdr->msg_spot != -1");
+ msghdr->msg_next = free_msghdrs;
+ free_msghdrs = msghdr;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds *buf;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+msgctl(td, uap)
+ struct thread *td;
+ register struct msgctl_args *uap;
+{
+ int msqid = uap->msqid;
+ int cmd = uap->cmd;
+ struct msqid_ds *user_msqptr = uap->buf;
+ int rval, error;
+ struct msqid_ds msqbuf;
+ register struct msqid_ds *msqptr;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr);
+#endif
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ msqptr = &msqids[msqid];
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such msqid\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ error = 0;
+ rval = 0;
+
+ switch (cmd) {
+
+ case IPC_RMID:
+ {
+ struct msg *msghdr;
+ if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M)))
+ goto done2;
+ /* Free the message headers */
+ msghdr = msqptr->msg_first;
+ while (msghdr != NULL) {
+ struct msg *msghdr_tmp;
+
+ /* Free the segments of each message */
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msghdr_tmp = msghdr;
+ msghdr = msghdr->msg_next;
+ msg_freehdr(msghdr_tmp);
+ }
+
+ if (msqptr->msg_cbytes != 0)
+ panic("msg_cbytes is screwed up");
+ if (msqptr->msg_qnum != 0)
+ panic("msg_qnum is screwed up");
+
+ msqptr->msg_qbytes = 0; /* Mark it as free */
+
+ wakeup((caddr_t)msqptr);
+ }
+
+ break;
+
+ case IPC_SET:
+ if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M)))
+ goto done2;
+ if ((error = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0)
+ goto done2;
+ if (msqbuf.msg_qbytes > msqptr->msg_qbytes) {
+ error = suser(td);
+ if (error)
+ goto done2;
+ }
+ if (msqbuf.msg_qbytes > msginfo.msgmnb) {
+#ifdef MSG_DEBUG_OK
+ printf("can't increase msg_qbytes beyond %d (truncating)\n",
+ msginfo.msgmnb);
+#endif
+ msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
+ }
+ if (msqbuf.msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("can't reduce msg_qbytes to 0\n");
+#endif
+ error = EINVAL; /* non-standard errno! */
+ goto done2;
+ }
+ msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */
+ msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */
+ msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) |
+ (msqbuf.msg_perm.mode & 0777);
+ msqptr->msg_qbytes = msqbuf.msg_qbytes;
+ msqptr->msg_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ goto done2;
+ }
+ error = copyout((caddr_t)msqptr, user_msqptr,
+ sizeof(struct msqid_ds));
+ break;
+
+ default:
+#ifdef MSG_DEBUG_OK
+ printf("invalid command %d\n", cmd);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+done2:
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgget_args {
+ key_t key;
+ int msgflg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+msgget(td, uap)
+ struct thread *td;
+ register struct msgget_args *uap;
+{
+ int msqid, error = 0;
+ int key = uap->key;
+ int msgflg = uap->msgflg;
+ struct ucred *cred = td->td_ucred;
+ register struct msqid_ds *msqptr = NULL;
+
+#ifdef MSG_DEBUG_OK
+ printf("msgget(0x%x, 0%o)\n", key, msgflg);
+#endif
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ if (key != IPC_PRIVATE) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes != 0 &&
+ msqptr->msg_perm.key == key)
+ break;
+ }
+ if (msqid < msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("found public key\n");
+#endif
+ if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
+#ifdef MSG_DEBUG_OK
+ printf("not exclusive\n");
+#endif
+ error = EEXIST;
+ goto done2;
+ }
+ if ((error = ipcperm(td, &msqptr->msg_perm, msgflg & 0700 ))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have 0%o access\n",
+ msgflg & 0700);
+#endif
+ goto done2;
+ }
+ goto found;
+ }
+ }
+
+#ifdef MSG_DEBUG_OK
+ printf("need to allocate the msqid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0 &&
+ (msqptr->msg_perm.mode & MSG_LOCKED) == 0)
+ break;
+ }
+ if (msqid == msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msqid_ds's available\n");
+#endif
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef MSG_DEBUG_OK
+ printf("msqid %d is available\n", msqid);
+#endif
+ msqptr->msg_perm.key = key;
+ msqptr->msg_perm.cuid = cred->cr_uid;
+ msqptr->msg_perm.uid = cred->cr_uid;
+ msqptr->msg_perm.cgid = cred->cr_gid;
+ msqptr->msg_perm.gid = cred->cr_gid;
+ msqptr->msg_perm.mode = (msgflg & 0777);
+ /* Make sure that the returned msqid is unique */
+ msqptr->msg_perm.seq++;
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ msqptr->msg_cbytes = 0;
+ msqptr->msg_qnum = 0;
+ msqptr->msg_qbytes = msginfo.msgmnb;
+ msqptr->msg_lspid = 0;
+ msqptr->msg_lrpid = 0;
+ msqptr->msg_stime = 0;
+ msqptr->msg_rtime = 0;
+ msqptr->msg_ctime = time_second;
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ /* Construct the unique msqid */
+ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgsnd_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ int msgflg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+msgsnd(td, uap)
+ struct thread *td;
+ register struct msgsnd_args *uap;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ int msgflg = uap->msgflg;
+ int segs_needed, error = 0;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz,
+ msgflg);
+#endif
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqptr->msg_perm, IPC_W))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have write access\n");
+#endif
+ goto done2;
+ }
+
+ segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
+#ifdef MSG_DEBUG_OK
+ printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz,
+ segs_needed);
+#endif
+ for (;;) {
+ int need_more_resources = 0;
+
+ /*
+ * check msgsz
+ * (inside this loop in case msg_qbytes changes while we sleep)
+ */
+
+ if (msgsz > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz > msqptr->msg_qbytes\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid is locked\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz + msg_cbytes > msg_qbytes\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (segs_needed > nfree_msgmaps) {
+#ifdef MSG_DEBUG_OK
+ printf("segs_needed > nfree_msgmaps\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (free_msghdrs == NULL) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msghdrs\n");
+#endif
+ need_more_resources = 1;
+ }
+
+ if (need_more_resources) {
+ int we_own_it;
+
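+			/*
+			 * Either fail now (IPC_NOWAIT) or sleep.  If the
+			 * queue isn't already locked by another sender, lock
+			 * it ourselves so later arrivals queue up behind us,
+			 * and remember to unlock it when we wake.
+			 */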
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("need more resources but caller doesn't want to wait\n");
+#endif
+ error = EAGAIN;
+ goto done2;
+ }
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("we don't own the msqid_ds\n");
+#endif
+ we_own_it = 0;
+ } else {
+				/*
+				 * Force later arrivals to wait for our
+				 * request.
+				 */
+#ifdef MSG_DEBUG_OK
+ printf("we own the msqid_ds\n");
+#endif
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+ we_own_it = 1;
+ }
+#ifdef MSG_DEBUG_OK
+ printf("goodnight\n");
+#endif
+ error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH,
+ "msgwait", 0);
+#ifdef MSG_DEBUG_OK
+ printf("good morning, error=%d\n", error);
+#endif
+ if (we_own_it)
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ if (error != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsnd: interrupted system call\n");
+#endif
+ error = EINTR;
+ goto done2;
+ }
+
+ /*
+ * Make sure that the msq queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ error = EIDRM;
+ goto done2;
+ }
+
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("got all the resources that we need\n");
+#endif
+ break;
+ }
+ }
+
+ /*
+ * We have the resources that we need.
+ * Make sure!
+ */
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED)
+ panic("msg_perm.mode & MSG_LOCKED");
+ if (segs_needed > nfree_msgmaps)
+ panic("segs_needed > nfree_msgmaps");
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes)
+ panic("msgsz + msg_cbytes > msg_qbytes");
+ if (free_msghdrs == NULL)
+ panic("no more msghdrs");
+
+ /*
+ * Re-lock the msqid_ds in case we page-fault when copying in the
+ * message
+ */
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0)
+ panic("msqid_ds is already locked");
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+
+ /*
+ * Allocate a message header
+ */
+
+ msghdr = free_msghdrs;
+ free_msghdrs = msghdr->msg_next;
+ msghdr->msg_spot = -1;
+ msghdr->msg_ts = msgsz;
+
+ /*
+ * Allocate space for the message
+ */
+
+ while (segs_needed > 0) {
+ if (nfree_msgmaps <= 0)
+ panic("not enough msgmaps");
+ if (free_msgmaps == -1)
+ panic("nil free_msgmaps");
+ next = free_msgmaps;
+ if (next <= -1)
+ panic("next too low #1");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #1");
+#ifdef MSG_DEBUG_OK
+ printf("allocating segment %d to message\n", next);
+#endif
+ free_msgmaps = msgmaps[next].next;
+ nfree_msgmaps--;
+ msgmaps[next].next = msghdr->msg_spot;
+ msghdr->msg_spot = next;
+ segs_needed--;
+ }
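+	/*
+	 * Note that segments are prepended, so msg_spot chains them in
+	 * reverse allocation order; the copyin loop below and the copyout
+	 * loop in msgrcv() walk the same chain, so the data stays consistent.
+	 */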
+
+ /*
+ * Copy in the message type
+ */
+
+ if ((error = copyin(user_msgp, &msghdr->msg_type,
+ sizeof(msghdr->msg_type))) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying the message type\n", error);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ goto done2;
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Validate the message type
+ */
+
+ if (msghdr->msg_type < 1) {
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+#ifdef MSG_DEBUG_OK
+ printf("mtype (%d) < 1\n", msghdr->msg_type);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ /*
+ * Copy in the message body
+ */
+
+ next = msghdr->msg_spot;
+ while (msgsz > 0) {
+ size_t tlen;
+ if (msgsz > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz;
+ if (next <= -1)
+ panic("next too low #2");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #2");
+ if ((error = copyin(user_msgp, &msgpool[next * msginfo.msgssz],
+ tlen)) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying in message segment\n", error);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ goto done2;
+ }
+ msgsz -= tlen;
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+ if (next != -1)
+ panic("didn't use all the msg segments");
+
+ /*
+ * We've got the message. Unlock the msqid_ds.
+ */
+
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+
+ /*
+ * Make sure that the msqid_ds is still allocated.
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ error = EIDRM;
+ goto done2;
+ }
+
+ /*
+ * Put the message into the queue
+ */
+
+ if (msqptr->msg_first == NULL) {
+ msqptr->msg_first = msghdr;
+ msqptr->msg_last = msghdr;
+ } else {
+ msqptr->msg_last->msg_next = msghdr;
+ msqptr->msg_last = msghdr;
+ }
+ msqptr->msg_last->msg_next = NULL;
+
+ msqptr->msg_cbytes += msghdr->msg_ts;
+ msqptr->msg_qnum++;
+ msqptr->msg_lspid = td->td_proc->p_pid;
+ msqptr->msg_stime = time_second;
+
+ wakeup((caddr_t)msqptr);
+ td->td_retval[0] = 0;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgrcv_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+msgrcv(td, uap)
+ struct thread *td;
+ register struct msgrcv_args *uap;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ long msgtyp = uap->msgtyp;
+ int msgflg = uap->msgflg;
+ size_t len;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ int error = 0;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp,
+ msgsz, msgtyp, msgflg);
+#endif
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ goto done2;
+ }
+
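+	/*
+	 * Select a message: msgtyp == 0 takes the first message on the
+	 * queue; otherwise take the first message whose type matches msgtyp
+	 * exactly or, for a negative msgtyp, whose type is less than or
+	 * equal to its absolute value.
+	 */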
+ msghdr = NULL;
+ while (msghdr == NULL) {
+ if (msgtyp == 0) {
+ msghdr = msqptr->msg_first;
+ if (msghdr != NULL) {
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("first message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ error = E2BIG;
+ goto done2;
+ }
+ if (msqptr->msg_first == msqptr->msg_last) {
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ } else {
+ msqptr->msg_first = msghdr->msg_next;
+ if (msqptr->msg_first == NULL)
+ panic("msg_first/last screwed up #1");
+ }
+ }
+ } else {
+ struct msg *previous;
+ struct msg **prev;
+
+ previous = NULL;
+ prev = &(msqptr->msg_first);
+ while ((msghdr = *prev) != NULL) {
+ /*
+ * Is this message's type an exact match or is
+ * this message's type less than or equal to
+ * the absolute value of a negative msgtyp?
+ * Note that the second half of this test can
+ * NEVER be true if msgtyp is positive since
+ * msg_type is always positive!
+ */
+
+ if (msgtyp == msghdr->msg_type ||
+ msghdr->msg_type <= -msgtyp) {
+#ifdef MSG_DEBUG_OK
+ printf("found message type %d, requested %d\n",
+ msghdr->msg_type, msgtyp);
+#endif
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("requested message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ error = E2BIG;
+ goto done2;
+ }
+ *prev = msghdr->msg_next;
+ if (msghdr == msqptr->msg_last) {
+ if (previous == NULL) {
+ if (prev !=
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #2");
+ msqptr->msg_first =
+ NULL;
+ msqptr->msg_last =
+ NULL;
+ } else {
+ if (prev ==
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #3");
+ msqptr->msg_last =
+ previous;
+ }
+ }
+ break;
+ }
+ previous = msghdr;
+ prev = &(msghdr->msg_next);
+ }
+ }
+
+ /*
+ * We've either extracted the msghdr for the appropriate
+ * message or there isn't one.
+ * If there is one then bail out of this loop.
+ */
+
+ if (msghdr != NULL)
+ break;
+
+ /*
+ * Hmph! No message found. Does the user want to wait?
+ */
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no appropriate message found (msgtyp=%d)\n",
+ msgtyp);
+#endif
+ /* The SVID says to return ENOMSG. */
+ error = ENOMSG;
+ goto done2;
+ }
+
+ /*
+ * Wait for something to happen
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: goodnight\n");
+#endif
+ error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait",
+ 0);
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: good morning (error=%d)\n", error);
+#endif
+
+ if (error != 0) {
+#ifdef MSG_DEBUG_OK
+			printf("msgrcv: interrupted system call\n");
+#endif
+ error = EINTR;
+ goto done2;
+ }
+
+ /*
+ * Make sure that the msq queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0 ||
+ msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ error = EIDRM;
+ goto done2;
+ }
+ }
+
+ /*
+ * Return the message to the user.
+ *
+ * First, do the bookkeeping (before we risk being interrupted).
+ */
+
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msqptr->msg_lrpid = td->td_proc->p_pid;
+ msqptr->msg_rtime = time_second;
+
+ /*
+ * Make msgsz the actual amount that we'll be returning.
+ * Note that this effectively truncates the message if it is too long
+ * (since msgsz is never increased).
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz,
+ msghdr->msg_ts);
+#endif
+ if (msgsz > msghdr->msg_ts)
+ msgsz = msghdr->msg_ts;
+
+ /*
+ * Return the type to the user.
+ */
+
+ error = copyout((caddr_t)&(msghdr->msg_type), user_msgp,
+ sizeof(msghdr->msg_type));
+ if (error != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message type\n", error);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ goto done2;
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Return the segments to the user
+ */
+
+ next = msghdr->msg_spot;
+ for (len = 0; len < msgsz; len += msginfo.msgssz) {
+ size_t tlen;
+
+ if (msgsz - len > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz - len;
+ if (next <= -1)
+ panic("next too low #3");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #3");
+ error = copyout((caddr_t)&msgpool[next * msginfo.msgssz],
+ user_msgp, tlen);
+ if (error != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message segment\n",
+ error);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ goto done2;
+ }
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+
+ /*
+ * Done, return the actual number of bytes copied out.
+ */
+
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ td->td_retval[0] = msgsz;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+sysctl_msqids(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, msqids,
+ sizeof(struct msqid_ds) * msginfo.msgmni));
+}
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RD, &msginfo.msgmni, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RD, &msginfo.msgmnb, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RD, &msginfo.msgtql, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RD, &msginfo.msgssz, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RD, &msginfo.msgseg, 0, "");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD,
+ NULL, 0, sysctl_msqids, "", "Message queue IDs");
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
new file mode 100644
index 0000000..af784b8
--- /dev/null
+++ b/sys/kern/sysv_sem.c
@@ -0,0 +1,1193 @@
+/* $FreeBSD$ */
+
+/*
+ * Implementation of SVID semaphores
+ *
+ * Author: Daniel Boulet
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sem.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
+
+static void seminit(void);
+static int sysvsem_modload(struct module *, int, void *);
+static int semunload(void);
+static void semexit_myhook(struct proc *p);
+static int sysctl_sema(SYSCTL_HANDLER_ARGS);
+
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args;
+int __semctl(struct thread *td, struct __semctl_args *uap);
+struct semget_args;
+int semget(struct thread *td, struct semget_args *uap);
+struct semop_args;
+int semop(struct thread *td, struct semop_args *uap);
+#endif
+
+static struct sem_undo *semu_alloc(struct thread *td);
+static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
+ int semid, int semnum, int adjval);
+static void semundo_clear(int semid, int semnum);
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *semcalls[] = {
+ (sy_call_t *)__semctl, (sy_call_t *)semget,
+ (sy_call_t *)semop
+};
+
+static int semtot = 0;
+static struct semid_ds *sema; /* semaphore id pool */
+static struct sem *sem; /* semaphore pool */
+static struct sem_undo *semu_list; /* list of active undo structures */
+static int *semu; /* undo structure pool */
+
+struct sem {
+ u_short semval; /* semaphore value */
+ pid_t sempid; /* pid of last operation */
+ u_short semncnt; /* # awaiting semval > cval */
+ u_short semzcnt; /* # awaiting semval = 0 */
+};
+
+/*
+ * Undo structure (one per process)
+ */
+struct sem_undo {
+ struct sem_undo *un_next; /* ptr to next active undo structure */
+ struct proc *un_proc; /* owner of this structure */
+ short un_cnt; /* # of active entries */
+ struct undo {
+ short un_adjval; /* adjust on exit values */
+ short un_num; /* semaphore # */
+ int un_id; /* semid */
+ } un_ent[1]; /* undo entries */
+};
+
+/*
+ * Configuration parameters
+ */
+#ifndef SEMMNI
+#define SEMMNI 10 /* # of semaphore identifiers */
+#endif
+#ifndef SEMMNS
+#define SEMMNS 60 /* # of semaphores in system */
+#endif
+#ifndef SEMUME
+#define SEMUME 10 /* max # of undo entries per process */
+#endif
+#ifndef SEMMNU
+#define SEMMNU 30 /* # of undo structures in system */
+#endif
+
+/* shouldn't need tuning */
+#ifndef SEMMAP
+#define SEMMAP 30 /* # of entries in semaphore map */
+#endif
+#ifndef SEMMSL
+#define SEMMSL SEMMNS /* max # of semaphores per id */
+#endif
+#ifndef SEMOPM
+#define SEMOPM 100 /* max # of operations per semop call */
+#endif
+
+#define SEMVMX 32767 /* semaphore maximum value */
+#define SEMAEM 16384 /* adjust on exit max value */
+
+/*
+ * Due to the way semaphore memory is allocated, we have to ensure that
+ * SEMUSZ is properly aligned.
+ */
+
+#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
+
+/* actual size of an undo structure */
+#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
+
+/*
+ * Macro to find a particular sem_undo vector
+ */
+#define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
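+/*
+ * (Each sem_undo occupies seminfo.semusz bytes -- by default the header plus
+ * SEMUME undo entries, aligned -- so SEMU(ix) indexes the pool in semusz-byte
+ * strides.)
+ */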
+
+/*
+ * semaphore info struct
+ */
+struct seminfo seminfo = {
+ SEMMAP, /* # of entries in semaphore map */
+ SEMMNI, /* # of semaphore identifiers */
+ SEMMNS, /* # of semaphores in system */
+ SEMMNU, /* # of undo structures in system */
+ SEMMSL, /* max # of semaphores per id */
+ SEMOPM, /* max # of operations per semop call */
+ SEMUME, /* max # of undo entries per process */
+ SEMUSZ, /* size in bytes of undo structure */
+ SEMVMX, /* semaphore maximum value */
+ SEMAEM /* adjust on exit max value */
+};
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RD, &seminfo.semmni, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RD, &seminfo.semmns, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RD, &seminfo.semmnu, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RD, &seminfo.semopm, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RD, &seminfo.semume, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RD, &seminfo.semusz, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, "");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD,
+ NULL, 0, sysctl_sema, "", "");
+
+static void
+seminit(void)
+{
+ register int i;
+
+ TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap);
+ TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
+ TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
+ TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
+ TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
+ TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
+ TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
+ TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
+ TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
+ TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
+
+ sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
+ if (sem == NULL)
+ panic("sem is NULL");
+ sema = malloc(sizeof(struct semid_ds) * seminfo.semmni, M_SEM, M_WAITOK);
+ if (sema == NULL)
+ panic("sema is NULL");
+ semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
+ if (semu == NULL)
+ panic("semu is NULL");
+
+ for (i = 0; i < seminfo.semmni; i++) {
+ sema[i].sem_base = 0;
+ sema[i].sem_perm.mode = 0;
+ }
+ for (i = 0; i < seminfo.semmnu; i++) {
+ register struct sem_undo *suptr = SEMU(i);
+ suptr->un_proc = NULL;
+ }
+ semu_list = NULL;
+ at_exit(semexit_myhook);
+}
+
+static int
+semunload(void)
+{
+
+ if (semtot != 0)
+ return (EBUSY);
+
+ free(sem, M_SEM);
+ free(sema, M_SEM);
+ free(semu, M_SEM);
+ rm_at_exit(semexit_myhook);
+ return (0);
+}
+
+static int
+sysvsem_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ seminit();
+ break;
+ case MOD_UNLOAD:
+ error = semunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvsem_mod = {
+ "sysvsem",
+ &sysvsem_modload,
+ NULL
+};
+
+SYSCALL_MODULE_HELPER(semsys);
+SYSCALL_MODULE_HELPER(__semctl);
+SYSCALL_MODULE_HELPER(semget);
+SYSCALL_MODULE_HELPER(semop);
+
+DECLARE_MODULE(sysvsem, sysvsem_mod,
+ SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvsem, 1);
+
+/*
+ * Entry point for all SEM calls
+ *
+ * MPSAFE
+ */
+int
+semsys(td, uap)
+ struct thread *td;
+ /* XXX actually varargs. */
+ struct semsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ } */ *uap;
+{
+ int error;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
+ return (EINVAL);
+ mtx_lock(&Giant);
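+	/*
+	 * Dispatch to the requested call; the remaining syscall arguments
+	 * (a2..a5) are reinterpreted as that call's own argument structure.
+	 */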
+ error = (*semcalls[uap->which])(td, &uap->a2);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Allocate a new sem_undo structure for a process
+ * (returns ptr to structure or NULL if no more room)
+ */
+
+static struct sem_undo *
+semu_alloc(td)
+ struct thread *td;
+{
+ register int i;
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+ int attempt;
+
+ /*
+ * Try twice to allocate something.
+ * (we'll purge any empty structures after the first pass so
+ * two passes are always enough)
+ */
+
+ for (attempt = 0; attempt < 2; attempt++) {
+ /*
+ * Look for a free structure.
+ * Fill it in and return it if we find one.
+ */
+
+ for (i = 0; i < seminfo.semmnu; i++) {
+ suptr = SEMU(i);
+ if (suptr->un_proc == NULL) {
+ suptr->un_next = semu_list;
+ semu_list = suptr;
+ suptr->un_cnt = 0;
+ suptr->un_proc = td->td_proc;
+ return(suptr);
+ }
+ }
+
+ /*
+		 * We didn't find a free one; if this is the first attempt,
+		 * then try to free some structures.
+ */
+
+ if (attempt == 0) {
+ /* All the structures are in use - try to free some */
+ int did_something = 0;
+
+ supptr = &semu_list;
+ while ((suptr = *supptr) != NULL) {
+ if (suptr->un_cnt == 0) {
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+ did_something = 1;
+ } else
+ supptr = &(suptr->un_next);
+ }
+
+			/* If we didn't free anything then just give up */
+ if (!did_something)
+ return(NULL);
+ } else {
+ /*
+ * The second pass failed even though we freed
+ * something after the first pass!
+ * This is IMPOSSIBLE!
+ */
+ panic("semu_alloc - second attempt failed");
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Adjust a particular entry for a particular proc
+ */
+
+static int
+semundo_adjust(td, supptr, semid, semnum, adjval)
+ register struct thread *td;
+ struct sem_undo **supptr;
+ int semid, semnum;
+ int adjval;
+{
+ struct proc *p = td->td_proc;
+ register struct sem_undo *suptr;
+ register struct undo *sunptr;
+ int i;
+
+	/*
+	 * Look for and remember the sem_undo if the caller doesn't
+	 * provide it.
+	 */
+
+ suptr = *supptr;
+ if (suptr == NULL) {
+ for (suptr = semu_list; suptr != NULL;
+ suptr = suptr->un_next) {
+ if (suptr->un_proc == p) {
+ *supptr = suptr;
+ break;
+ }
+ }
+ if (suptr == NULL) {
+ if (adjval == 0)
+ return(0);
+ suptr = semu_alloc(td);
+ if (suptr == NULL)
+ return(ENOSPC);
+ *supptr = suptr;
+ }
+ }
+
+ /*
+ * Look for the requested entry and adjust it (delete if adjval becomes
+ * 0).
+ */
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid || sunptr->un_num != semnum)
+ continue;
+ if (adjval != 0) {
+ adjval += sunptr->un_adjval;
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ }
+ sunptr->un_adjval = adjval;
+ if (sunptr->un_adjval == 0) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt)
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ }
+ return(0);
+ }
+
+ /* Didn't find the right entry - create it */
+ if (adjval == 0)
+ return(0);
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ if (suptr->un_cnt != seminfo.semume) {
+ sunptr = &suptr->un_ent[suptr->un_cnt];
+ suptr->un_cnt++;
+ sunptr->un_adjval = adjval;
+		sunptr->un_id = semid;
+		sunptr->un_num = semnum;
+ } else
+ return(EINVAL);
+ return(0);
+}
+
+static void
+semundo_clear(semid, semnum)
+ int semid, semnum;
+{
+ register struct sem_undo *suptr;
+
+ for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) {
+ register struct undo *sunptr = &suptr->un_ent[0];
+ register int i = 0;
+
+ while (i < suptr->un_cnt) {
+ if (sunptr->un_id == semid) {
+ if (semnum == -1 || sunptr->un_num == semnum) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt) {
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ continue;
+ }
+ }
+ if (semnum != -1)
+ break;
+ }
+ i++, sunptr++;
+ }
+ }
+}
+
+/*
+ * Note that the user-mode half of this passes a union, not a pointer
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun *arg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+__semctl(td, uap)
+ struct thread *td;
+ register struct __semctl_args *uap;
+{
+ int semid = uap->semid;
+ int semnum = uap->semnum;
+ int cmd = uap->cmd;
+ union semun *arg = uap->arg;
+ union semun real_arg;
+ struct ucred *cred = td->td_ucred;
+ int i, rval, error;
+ struct semid_ds sbuf;
+ register struct semid_ds *semaptr;
+ u_short usval;
+
+#ifdef SEM_DEBUG
+ printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg);
+#endif
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ switch(cmd) {
+ case SEM_STAT:
+		if (semid < 0 || semid >= seminfo.semmni) {
+			error = EINVAL;
+			goto done2;
+		}
+		semaptr = &sema[semid];
+		if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) {
+			error = EINVAL;
+			goto done2;
+		}
+		if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+			goto done2;
+		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+			goto done2;
+ error = copyout((caddr_t)semaptr, real_arg.buf,
+ sizeof(struct semid_ds));
+ rval = IXSEQ_TO_IPCID(semid,semaptr->sem_perm);
+ if (error == 0)
+ td->td_retval[0] = rval;
+ goto done2;
+ }
+
+ semid = IPCID_TO_IX(semid);
+ if (semid < 0 || semid >= seminfo.semmni) {
+ error = EINVAL;
+ goto done2;
+ }
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EINVAL;
+ goto done2;
+ }
+
+ error = 0;
+ rval = 0;
+
+ switch (cmd) {
+ case IPC_RMID:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M)))
+ goto done2;
+ semaptr->sem_perm.cuid = cred->cr_uid;
+ semaptr->sem_perm.uid = cred->cr_uid;
+ semtot -= semaptr->sem_nsems;
+ for (i = semaptr->sem_base - sem; i < semtot; i++)
+ sem[i] = sem[i + semaptr->sem_nsems];
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].sem_perm.mode & SEM_ALLOC) &&
+ sema[i].sem_base > semaptr->sem_base)
+ sema[i].sem_base -= semaptr->sem_nsems;
+ }
+ semaptr->sem_perm.mode = 0;
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case IPC_SET:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M)))
+ goto done2;
+ if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ goto done2;
+ if ((error = copyin(real_arg.buf, (caddr_t)&sbuf,
+ sizeof(sbuf))) != 0) {
+ goto done2;
+ }
+ semaptr->sem_perm.uid = sbuf.sem_perm.uid;
+ semaptr->sem_perm.gid = sbuf.sem_perm.gid;
+ semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) |
+ (sbuf.sem_perm.mode & 0777);
+ semaptr->sem_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ goto done2;
+ error = copyout((caddr_t)semaptr, real_arg.buf,
+ sizeof(struct semid_ds));
+ break;
+
+ case GETNCNT:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ rval = semaptr->sem_base[semnum].semncnt;
+ break;
+
+ case GETPID:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ rval = semaptr->sem_base[semnum].sempid;
+ break;
+
+ case GETVAL:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ rval = semaptr->sem_base[semnum].semval;
+ break;
+
+ case GETALL:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ goto done2;
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ error = copyout((caddr_t)&semaptr->sem_base[i].semval,
+ &real_arg.array[i], sizeof(real_arg.array[0]));
+ if (error != 0)
+ break;
+ }
+ break;
+
+ case GETZCNT:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ rval = semaptr->sem_base[semnum].semzcnt;
+ break;
+
+ case SETVAL:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W)))
+ goto done2;
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ goto done2;
+ if (real_arg.val < 0 || real_arg.val > seminfo.semvmx) {
+ error = ERANGE;
+ goto done2;
+ }
+ semaptr->sem_base[semnum].semval = real_arg.val;
+ semundo_clear(semid, semnum);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case SETALL:
+ if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W)))
+ goto done2;
+ if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ goto done2;
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ error = copyin(&real_arg.array[i],
+ (caddr_t)&usval, sizeof(real_arg.array[0]));
+ if (error != 0)
+ break;
+ if (usval > seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ }
+ semaptr->sem_base[i].semval = usval;
+ }
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+done2:
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semget_args {
+ key_t key;
+ int nsems;
+ int semflg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+semget(td, uap)
+ struct thread *td;
+ register struct semget_args *uap;
+{
+ int semid, error = 0;
+ int key = uap->key;
+ int nsems = uap->nsems;
+ int semflg = uap->semflg;
+ struct ucred *cred = td->td_ucred;
+
+#ifdef SEM_DEBUG
+ printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg);
+#endif
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ if (key != IPC_PRIVATE) {
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) &&
+ sema[semid].sem_perm.key == key)
+ break;
+ }
+ if (semid < seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("found public key\n");
+#endif
+ if ((error = ipcperm(td, &sema[semid].sem_perm,
+ semflg & 0700))) {
+ goto done2;
+ }
+ if (nsems > 0 && sema[semid].sem_nsems < nsems) {
+#ifdef SEM_DEBUG
+ printf("too small\n");
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+ if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
+#ifdef SEM_DEBUG
+ printf("not exclusive\n");
+#endif
+ error = EEXIST;
+ goto done2;
+ }
+ goto found;
+ }
+ }
+
+#ifdef SEM_DEBUG
+ printf("need to allocate the semid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
+ if (nsems <= 0 || nsems > seminfo.semmsl) {
+#ifdef SEM_DEBUG
+ printf("nsems out of range (0<%d<=%d)\n", nsems,
+ seminfo.semmsl);
+#endif
+ error = EINVAL;
+ goto done2;
+ }
+ if (nsems > seminfo.semmns - semtot) {
+#ifdef SEM_DEBUG
+ printf("not enough semaphores left (need %d, got %d)\n",
+ nsems, seminfo.semmns - semtot);
+#endif
+ error = ENOSPC;
+ goto done2;
+ }
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0)
+ break;
+ }
+ if (semid == seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("no more semid_ds's available\n");
+#endif
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef SEM_DEBUG
+ printf("semid %d is available\n", semid);
+#endif
+ sema[semid].sem_perm.key = key;
+ sema[semid].sem_perm.cuid = cred->cr_uid;
+ sema[semid].sem_perm.uid = cred->cr_uid;
+ sema[semid].sem_perm.cgid = cred->cr_gid;
+ sema[semid].sem_perm.gid = cred->cr_gid;
+ sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
+ sema[semid].sem_perm.seq =
+ (sema[semid].sem_perm.seq + 1) & 0x7fff;
+ sema[semid].sem_nsems = nsems;
+ sema[semid].sem_otime = 0;
+ sema[semid].sem_ctime = time_second;
+ sema[semid].sem_base = &sem[semtot];
+ semtot += nsems;
+ bzero(sema[semid].sem_base,
+ sizeof(sema[semid].sem_base[0])*nsems);
+#ifdef SEM_DEBUG
+ printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base,
+ &sem[semtot]);
+#endif
+ } else {
+#ifdef SEM_DEBUG
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semop_args {
+ int semid;
+ struct sembuf *sops;
+ u_int nsops;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+semop(td, uap)
+ struct thread *td;
+ register struct semop_args *uap;
+{
+ int semid = uap->semid;
+ u_int nsops = uap->nsops;
+ struct sembuf *sops = NULL;
+ register struct semid_ds *semaptr;
+ register struct sembuf *sopptr = 0;
+ register struct sem *semptr = 0;
+ struct sem_undo *suptr;
+ int i, j, error;
+ int do_wakeup, do_undos;
+
+#ifdef SEM_DEBUG
+ printf("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops);
+#endif
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
+
+ if (semid < 0 || semid >= seminfo.semmni) {
+ error = EINVAL;
+ goto done2;
+ }
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (nsops > seminfo.semopm) {
+#ifdef SEM_DEBUG
+ printf("too many sops (max=%d, nsops=%d)\n", seminfo.semopm,
+ nsops);
+#endif
+ error = E2BIG;
+ goto done2;
+ }
+
+ /* Allocate memory for sem_ops */
+ sops = malloc(nsops * sizeof(sops[0]), M_SEM, M_WAITOK);
+ if (!sops)
+ panic("Failed to allocate %d sem_ops", nsops);
+
+ if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
+#ifdef SEM_DEBUG
+ printf("error = %d from copyin(%08x, %08x, %d)\n", error,
+ uap->sops, sops, nsops * sizeof(sops[0]));
+#endif
+ goto done2;
+ }
+
+ /*
+ * Initial pass thru sops to see what permissions are needed.
+ * Also perform any checks that don't need repeating on each
+ * attempt to satisfy the request vector.
+ */
+ j = 0; /* permission needed */
+ do_undos = 0;
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ if (sopptr->sem_num >= semaptr->sem_nsems) {
+ error = EFBIG;
+ goto done2;
+ }
+ if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
+ do_undos = 1;
+ j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
+ }
+
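+	/*
+	 * A zero sem_op ("wait for zero") only needs read permission; any
+	 * increment or decrement needs alter (SEM_A) permission.
+	 */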
+ if ((error = ipcperm(td, &semaptr->sem_perm, j))) {
+#ifdef SEM_DEBUG
+ printf("error = %d from ipaccess\n", error);
+#endif
+ goto done2;
+ }
+
+ /*
+ * Loop trying to satisfy the vector of requests.
+ * If we reach a point where we must wait, any requests already
+ * performed are rolled back and we go to sleep until some other
+ * process wakes us up. At this point, we start all over again.
+ *
+ * This ensures that from the perspective of other tasks, a set
+ * of requests is atomic (never partially satisfied).
+ */
+ for (;;) {
+ do_wakeup = 0;
+ error = 0; /* error return if necessary */
+
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+
+#ifdef SEM_DEBUG
+ printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n",
+ semaptr, semaptr->sem_base, semptr,
+ sopptr->sem_num, semptr->semval, sopptr->sem_op,
+ (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait");
+#endif
+
+ if (sopptr->sem_op < 0) {
+ if (semptr->semval + sopptr->sem_op < 0) {
+#ifdef SEM_DEBUG
+ printf("semop: can't do it now\n");
+#endif
+ break;
+ } else {
+ semptr->semval += sopptr->sem_op;
+ if (semptr->semval == 0 &&
+ semptr->semzcnt > 0)
+ do_wakeup = 1;
+ }
+ } else if (sopptr->sem_op == 0) {
+ if (semptr->semval != 0) {
+#ifdef SEM_DEBUG
+ printf("semop: not zero now\n");
+#endif
+ break;
+ }
+ } else if (semptr->semval + sopptr->sem_op >
+ seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ } else {
+ if (semptr->semncnt > 0)
+ do_wakeup = 1;
+ semptr->semval += sopptr->sem_op;
+ }
+ }
+
+ /*
+ * Did we get through the entire vector?
+ */
+ if (i >= nsops)
+ goto done;
+
+ /*
+		 * No ... roll back anything that we've already done
+ */
+#ifdef SEM_DEBUG
+ printf("semop: rollback 0 through %d\n", i-1);
+#endif
+ for (j = 0; j < i; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ /* If we detected an error, return it */
+ if (error != 0)
+ goto done2;
+
+ /*
+ * If the request that we couldn't satisfy has the
+ * NOWAIT flag set then return with EAGAIN.
+ */
+ if (sopptr->sem_flg & IPC_NOWAIT) {
+ error = EAGAIN;
+ goto done2;
+ }
+
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt++;
+ else
+ semptr->semncnt++;
+
+#ifdef SEM_DEBUG
+ printf("semop: good night!\n");
+#endif
+ error = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH,
+ "semwait", 0);
+#ifdef SEM_DEBUG
+ printf("semop: good morning (error=%d)!\n", error);
+#endif
+
+ if (error != 0) {
+ error = EINTR;
+ goto done2;
+ }
+#ifdef SEM_DEBUG
+ printf("semop: good morning!\n");
+#endif
+
+ /*
+ * Make sure that the semaphore still exists
+ */
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EIDRM;
+ goto done2;
+ }
+
+ /*
+ * The semaphore is still alive. Readjust the count of
+ * waiting processes.
+ */
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt--;
+ else
+ semptr->semncnt--;
+ }
+
+done:
+ /*
+ * Process any SEM_UNDO requests.
+ */
+ if (do_undos) {
+ suptr = NULL;
+ for (i = 0; i < nsops; i++) {
+ /*
+ * We only need to deal with SEM_UNDO's for non-zero
+ * op's.
+ */
+ int adjval;
+
+ if ((sops[i].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[i].sem_op;
+ if (adjval == 0)
+ continue;
+ error = semundo_adjust(td, &suptr, semid,
+ sops[i].sem_num, -adjval);
+ if (error == 0)
+ continue;
+
+ /*
+			 * Oh-Oh! We ran out of either sem_undo's or undo's.
+			 * Roll back the adjustments to this point and then
+			 * roll back the semaphore ups and downs so we can
+			 * return with an error with all structures restored.
+			 * We roll back the undo's in the exact reverse order
+			 * that we applied them.  This guarantees that we
+			 * won't run out of space as we roll things back out.
+ */
+ for (j = i - 1; j >= 0; j--) {
+ if ((sops[j].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[j].sem_op;
+ if (adjval == 0)
+ continue;
+ if (semundo_adjust(td, &suptr, semid,
+ sops[j].sem_num, adjval) != 0)
+ panic("semop - can't undo undos");
+ }
+
+ for (j = 0; j < nsops; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+#ifdef SEM_DEBUG
+ printf("error = %d from semundo_adjust\n", error);
+#endif
+ goto done2;
+ } /* loop through the sops */
+ } /* if (do_undos) */
+
+ /* We're definitely done - set the sempid's and time */
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+ semptr->sempid = td->td_proc->p_pid;
+ }
+ semaptr->sem_otime = time_second;
+
+ /*
+ * Do a wakeup if any semaphore was up'd whilst something was
+ * sleeping on it.
+ */
+ if (do_wakeup) {
+#ifdef SEM_DEBUG
+ printf("semop: doing wakeup\n");
+#endif
+ wakeup((caddr_t)semaptr);
+#ifdef SEM_DEBUG
+ printf("semop: back from wakeup\n");
+#endif
+ }
+#ifdef SEM_DEBUG
+ printf("semop: done\n");
+#endif
+ td->td_retval[0] = 0;
+done2:
+ if (sops)
+ free(sops, M_SEM);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Go through the undo structures for this process and apply the adjustments to
+ * semaphores.
+ */
+static void
+semexit_myhook(p)
+ struct proc *p;
+{
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+
+ /*
+ * Go through the chain of undo vectors looking for one
+ * associated with this process.
+ */
+
+ for (supptr = &semu_list; (suptr = *supptr) != NULL;
+ supptr = &suptr->un_next) {
+ if (suptr->un_proc == p)
+ break;
+ }
+
+ if (suptr == NULL)
+ return;
+
+#ifdef SEM_DEBUG
+ printf("proc @%08x has undo structure with %d entries\n", p,
+ suptr->un_cnt);
+#endif
+
+ /*
+ * If there are any active undo elements then process them.
+ */
+ if (suptr->un_cnt > 0) {
+ int ix;
+
+ for (ix = 0; ix < suptr->un_cnt; ix++) {
+ int semid = suptr->un_ent[ix].un_id;
+ int semnum = suptr->un_ent[ix].un_num;
+ int adjval = suptr->un_ent[ix].un_adjval;
+ struct semid_ds *semaptr;
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0)
+ panic("semexit - semid not allocated");
+ if (semnum >= semaptr->sem_nsems)
+ panic("semexit - semnum out of range");
+
+#ifdef SEM_DEBUG
+ printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n",
+ suptr->un_proc, suptr->un_ent[ix].un_id,
+ suptr->un_ent[ix].un_num,
+ suptr->un_ent[ix].un_adjval,
+ semaptr->sem_base[semnum].semval);
+#endif
+
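+			/*
+			 * Apply the adjust-on-exit value, clamping at zero so
+			 * an exiting process can never drive a semaphore
+			 * negative.
+			 */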
+ if (adjval < 0) {
+ if (semaptr->sem_base[semnum].semval < -adjval)
+ semaptr->sem_base[semnum].semval = 0;
+ else
+ semaptr->sem_base[semnum].semval +=
+ adjval;
+ } else
+ semaptr->sem_base[semnum].semval += adjval;
+
+ wakeup((caddr_t)semaptr);
+#ifdef SEM_DEBUG
+ printf("semexit: back from wakeup\n");
+#endif
+ }
+ }
+
+ /*
+ * Deallocate the undo vector.
+ */
+#ifdef SEM_DEBUG
+ printf("removing vector\n");
+#endif
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+}
+
+static int
+sysctl_sema(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, sema,
+ sizeof(struct semid_ds) * seminfo.semmni));
+}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
new file mode 100644
index 0000000..85356a0
--- /dev/null
+++ b/sys/kern/sysv_shm.c
@@ -0,0 +1,890 @@
+/* $FreeBSD$ */
+/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
+
+/*
+ * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Adam Glass and Charles
+ * Hannum.
+ * 4. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/shm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
+
+struct oshmctl_args;
+static int oshmctl(struct thread *td, struct oshmctl_args *uap);
+
+static int shmget_allocate_segment(struct thread *td,
+ struct shmget_args *uap, int mode);
+static int shmget_existing(struct thread *td, struct shmget_args *uap,
+ int mode, int segnum);
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *shmcalls[] = {
+ (sy_call_t *)shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)shmdt, (sy_call_t *)shmget,
+ (sy_call_t *)shmctl
+};
+
+#define SHMSEG_FREE 0x0200
+#define SHMSEG_REMOVED 0x0400
+#define SHMSEG_ALLOCATED 0x0800
+#define SHMSEG_WANTED 0x1000
+
+static int shm_last_free, shm_nused, shm_committed, shmalloced;
+static struct shmid_ds *shmsegs;
+
+struct shm_handle {
+ /* vm_offset_t kva; */
+ vm_object_t shm_object;
+};
+
+struct shmmap_state {
+ vm_offset_t va;
+ int shmid;
+};
+
+static void shm_deallocate_segment(struct shmid_ds *);
+static int shm_find_segment_by_key(key_t);
+static struct shmid_ds *shm_find_segment_by_shmid(int);
+static struct shmid_ds *shm_find_segment_by_shmidx(int);
+static int shm_delete_mapping(struct proc *p, struct shmmap_state *);
+static void shmrealloc(void);
+static void shminit(void);
+static int sysvshm_modload(struct module *, int, void *);
+static int shmunload(void);
+static void shmexit_myhook(struct proc *p);
+static void shmfork_myhook(struct proc *p1, struct proc *p2);
+static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
+
+/*
+ * Tuneable values.
+ */
+#ifndef SHMMAXPGS
+#define SHMMAXPGS 8192 /* Note: sysv shared memory is swap backed. */
+#endif
+#ifndef SHMMAX
+#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
+#endif
+#ifndef SHMMIN
+#define SHMMIN 1
+#endif
+#ifndef SHMMNI
+#define SHMMNI 192
+#endif
+#ifndef SHMSEG
+#define SHMSEG 128
+#endif
+#ifndef SHMALL
+#define SHMALL (SHMMAXPGS)
+#endif
+
+struct shminfo shminfo = {
+ SHMMAX,
+ SHMMIN,
+ SHMMNI,
+ SHMSEG,
+ SHMALL
+};
+
+static int shm_use_phys;
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RD, &shminfo.shmmni, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RD, &shminfo.shmseg, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
+ &shm_use_phys, 0, "");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD,
+ NULL, 0, sysctl_shmsegs, "", "");
+
+static int
+shm_find_segment_by_key(key)
+ key_t key;
+{
+ int i;
+
+ for (i = 0; i < shmalloced; i++)
+ if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) &&
+ shmsegs[i].shm_perm.key == key)
+ return i;
+ return -1;
+}
+
+static struct shmid_ds *
+shm_find_segment_by_shmid(shmid)
+ int shmid;
+{
+ int segnum;
+ struct shmid_ds *shmseg;
+
+ segnum = IPCID_TO_IX(shmid);
+ if (segnum < 0 || segnum >= shmalloced)
+ return NULL;
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED))
+ != SHMSEG_ALLOCATED ||
+ shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid))
+ return NULL;
+ return shmseg;
+}
+
+static struct shmid_ds *
+shm_find_segment_by_shmidx(int segnum)
+{
+ struct shmid_ds *shmseg;
+
+ if (segnum < 0 || segnum >= shmalloced)
+ return NULL;
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED))
+ != SHMSEG_ALLOCATED )
+ return NULL;
+ return shmseg;
+}
+
+static void
+shm_deallocate_segment(shmseg)
+ struct shmid_ds *shmseg;
+{
+ struct shm_handle *shm_handle;
+ size_t size;
+
+ GIANT_REQUIRED;
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_deallocate(shm_handle->shm_object);
+ free((caddr_t)shm_handle, M_SHM);
+ shmseg->shm_internal = NULL;
+ size = round_page(shmseg->shm_segsz);
+ shm_committed -= btoc(size);
+ shm_nused--;
+ shmseg->shm_perm.mode = SHMSEG_FREE;
+}
+
+static int
+shm_delete_mapping(p, shmmap_s)
+ struct proc *p;
+ struct shmmap_state *shmmap_s;
+{
+ struct shmid_ds *shmseg;
+ int segnum, result;
+ size_t size;
+
+ GIANT_REQUIRED;
+
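+	/*
+	 * Unmap the segment from the process and, if this was the last
+	 * attach of a segment already marked SHMSEG_REMOVED, free it.
+	 */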
+ segnum = IPCID_TO_IX(shmmap_s->shmid);
+ shmseg = &shmsegs[segnum];
+ size = round_page(shmseg->shm_segsz);
+ result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va,
+ shmmap_s->va + size);
+ if (result != KERN_SUCCESS)
+ return EINVAL;
+ shmmap_s->shmid = -1;
+ shmseg->shm_dtime = time_second;
+ if ((--shmseg->shm_nattch <= 0) &&
+ (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = segnum;
+ }
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmdt_args {
+ void *shmaddr;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+shmdt(td, uap)
+ struct thread *td;
+ struct shmdt_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct shmmap_state *shmmap_s;
+ int i;
+ int error = 0;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
+ if (shmmap_s->shmid != -1 &&
+ shmmap_s->va == (vm_offset_t)uap->shmaddr) {
+ break;
+ }
+ }
+ if (i == shminfo.shmseg) {
+ error = EINVAL;
+ goto done2;
+ }
+ error = shm_delete_mapping(p, shmmap_s);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args {
+ int shmid;
+ void *shmaddr;
+ int shmflg;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+shmat(td, uap)
+ struct thread *td;
+ struct shmat_args *uap;
+{
+ struct proc *p = td->td_proc;
+ int i, flags;
+ struct shmid_ds *shmseg;
+ struct shmmap_state *shmmap_s = NULL;
+ struct shm_handle *shm_handle;
+ vm_offset_t attach_va;
+ vm_prot_t prot;
+ vm_size_t size;
+ int rv;
+ int error = 0;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ for (i = 0; i < shminfo.shmseg; i++)
+ shmmap_s[i].shmid = -1;
+ p->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+ }
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ error = ipcperm(td, &shmseg->shm_perm,
+ (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
+ if (error)
+ goto done2;
+ for (i = 0; i < shminfo.shmseg; i++) {
+ if (shmmap_s->shmid == -1)
+ break;
+ shmmap_s++;
+ }
+ if (i >= shminfo.shmseg) {
+ error = EMFILE;
+ goto done2;
+ }
+ size = round_page(shmseg->shm_segsz);
+#ifdef VM_PROT_READ_IS_EXEC
+ prot = VM_PROT_READ | VM_PROT_EXECUTE;
+#else
+ prot = VM_PROT_READ;
+#endif
+ if ((uap->shmflg & SHM_RDONLY) == 0)
+ prot |= VM_PROT_WRITE;
+ flags = MAP_ANON | MAP_SHARED;
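+	/*
+	 * If the caller supplied an address it must be SHMLBA-aligned;
+	 * with SHM_RND we round it down to the nearest SHMLBA boundary,
+	 * otherwise an unaligned address is rejected.
+	 */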
+ if (uap->shmaddr) {
+ flags |= MAP_FIXED;
+ if (uap->shmflg & SHM_RND) {
+ attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1);
+ } else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) {
+ attach_va = (vm_offset_t)uap->shmaddr;
+ } else {
+ error = EINVAL;
+ goto done2;
+ }
+ } else {
+ /*
+ * This is just a hint to vm_map_find() about where to
+ * put it.
+ */
+ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr
+ + maxtsiz + maxdsiz);
+ }
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_reference(shm_handle->shm_object);
+ rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object,
+ 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0);
+ if (rv != KERN_SUCCESS) {
+ error = ENOMEM;
+ goto done2;
+ }
+ vm_map_inherit(&p->p_vmspace->vm_map,
+ attach_va, attach_va + size, VM_INHERIT_SHARE);
+
+ shmmap_s->va = attach_va;
+ shmmap_s->shmid = uap->shmid;
+ shmseg->shm_lpid = p->p_pid;
+ shmseg->shm_atime = time_second;
+ shmseg->shm_nattch++;
+ td->td_retval[0] = attach_va;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+struct oshmid_ds {
+ struct ipc_perm shm_perm; /* operation perms */
+ int shm_segsz; /* size of segment (bytes) */
+ ushort shm_cpid; /* pid, creator */
+ ushort shm_lpid; /* pid, last operation */
+ short shm_nattch; /* no. of current attaches */
+ time_t shm_atime; /* last attach time */
+ time_t shm_dtime; /* last detach time */
+ time_t shm_ctime; /* last change time */
+ void *shm_handle; /* internal handle for shm segment */
+};
+
+struct oshmctl_args {
+ int shmid;
+ int cmd;
+ struct oshmid_ds *ubuf;
+};
+
+/*
+ * MPSAFE
+ */
+static int
+oshmctl(td, uap)
+ struct thread *td;
+ struct oshmctl_args *uap;
+{
+#ifdef COMPAT_43
+ int error = 0;
+ struct shmid_ds *shmseg;
+ struct oshmid_ds outbuf;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->shm_perm, IPC_R);
+ if (error)
+ goto done2;
+ outbuf.shm_perm = shmseg->shm_perm;
+ outbuf.shm_segsz = shmseg->shm_segsz;
+ outbuf.shm_cpid = shmseg->shm_cpid;
+ outbuf.shm_lpid = shmseg->shm_lpid;
+ outbuf.shm_nattch = shmseg->shm_nattch;
+ outbuf.shm_atime = shmseg->shm_atime;
+ outbuf.shm_dtime = shmseg->shm_dtime;
+ outbuf.shm_ctime = shmseg->shm_ctime;
+ outbuf.shm_handle = shmseg->shm_internal;
+ error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf));
+ if (error)
+ goto done2;
+ break;
+ default:
+ /* XXX casting to (sy_call_t *) is bogus, as usual. */
+ error = ((sy_call_t *)shmctl)(td, uap);
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+#else
+ return EINVAL;
+#endif
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds *buf;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+shmctl(td, uap)
+ struct thread *td;
+ struct shmctl_args *uap;
+{
+ int error = 0;
+ struct shmid_ds inbuf;
+ struct shmid_ds *shmseg;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ switch (uap->cmd) {
+ case IPC_INFO:
+		error = copyout((caddr_t)&shminfo, uap->buf, sizeof(shminfo));
+ if (error)
+ goto done2;
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ case SHM_INFO: {
+ struct shm_info shm_info;
+ shm_info.used_ids = shm_nused;
+ shm_info.shm_rss = 0; /*XXX where to get from ? */
+ shm_info.shm_tot = 0; /*XXX where to get from ? */
+ shm_info.shm_swp = 0; /*XXX where to get from ? */
+ shm_info.swap_attempts = 0; /*XXX where to get from ? */
+ shm_info.swap_successes = 0; /*XXX where to get from ? */
+		error = copyout((caddr_t)&shm_info, uap->buf, sizeof(shm_info));
+ if (error)
+ goto done2;
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ }
+ }
+	if (uap->cmd == SHM_STAT)
+ shmseg = shm_find_segment_by_shmidx(uap->shmid);
+ else
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ switch (uap->cmd) {
+ case SHM_STAT:
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->shm_perm, IPC_R);
+ if (error)
+ goto done2;
+ error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf));
+ if (error)
+ goto done2;
+		else if (uap->cmd == SHM_STAT)
+			td->td_retval[0] = IXSEQ_TO_IPCID(uap->shmid,
+			    shmseg->shm_perm);
+ break;
+ case IPC_SET:
+ error = ipcperm(td, &shmseg->shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf));
+ if (error)
+ goto done2;
+ shmseg->shm_perm.uid = inbuf.shm_perm.uid;
+ shmseg->shm_perm.gid = inbuf.shm_perm.gid;
+ shmseg->shm_perm.mode =
+ (shmseg->shm_perm.mode & ~ACCESSPERMS) |
+ (inbuf.shm_perm.mode & ACCESSPERMS);
+ shmseg->shm_ctime = time_second;
+ break;
+ case IPC_RMID:
+ error = ipcperm(td, &shmseg->shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ shmseg->shm_perm.key = IPC_PRIVATE;
+ shmseg->shm_perm.mode |= SHMSEG_REMOVED;
+ if (shmseg->shm_nattch <= 0) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = IPCID_TO_IX(uap->shmid);
+ }
+ break;
+#if 0
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+#endif
+ default:
+ error = EINVAL;
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmget_args {
+ key_t key;
+ size_t size;
+ int shmflg;
+};
+#endif
+
+static int
+shmget_existing(td, uap, mode, segnum)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+ int segnum;
+{
+ struct shmid_ds *shmseg;
+ int error;
+
+ shmseg = &shmsegs[segnum];
+ if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
+ /*
+ * This segment is in the process of being allocated. Wait
+ * until it's done, and look the key up again (in case the
+ * allocation failed or it was freed).
+ */
+ shmseg->shm_perm.mode |= SHMSEG_WANTED;
+ error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0);
+ if (error)
+ return error;
+ return EAGAIN;
+ }
+ if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
+ return EEXIST;
+ error = ipcperm(td, &shmseg->shm_perm, mode);
+ if (error)
+ return error;
+ if (uap->size && uap->size > shmseg->shm_segsz)
+ return EINVAL;
+ td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+ return 0;
+}
+
+static int
+shmget_allocate_segment(td, uap, mode)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+{
+ int i, segnum, shmid, size;
+ struct ucred *cred = td->td_ucred;
+ struct shmid_ds *shmseg;
+ struct shm_handle *shm_handle;
+
+ GIANT_REQUIRED;
+
+ if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
+ return EINVAL;
+ if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
+ return ENOSPC;
+ size = round_page(uap->size);
+ if (shm_committed + btoc(size) > shminfo.shmall)
+ return ENOMEM;
+ if (shm_last_free < 0) {
+ shmrealloc(); /* Maybe expand the shmsegs[] array. */
+ for (i = 0; i < shmalloced; i++)
+ if (shmsegs[i].shm_perm.mode & SHMSEG_FREE)
+ break;
+ if (i == shmalloced)
+ return ENOSPC;
+ segnum = i;
+ } else {
+ segnum = shm_last_free;
+ shm_last_free = -1;
+ }
+ shmseg = &shmsegs[segnum];
+ /*
+ * In case we sleep in malloc(), mark the segment present but deleted
+ * so that no one else tries to create the same key.
+ */
+ shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
+ shmseg->shm_perm.key = uap->key;
+ shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff;
+ shm_handle = (struct shm_handle *)
+ malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK);
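+ /*
+ * The id handed back to userland encodes both the slot index and
+ * the per-slot sequence number bumped above.
+ */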
+ shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+
+ /*
+ * Make sure the backing pager is allocated before we first
+ * need it.
+ */
+ if (shm_use_phys) {
+ shm_handle->shm_object =
+ vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0);
+ } else {
+ shm_handle->shm_object =
+ vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0);
+ }
+ vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT);
+
+ shmseg->shm_internal = shm_handle;
+ shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid;
+ shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid;
+ shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
+ (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
+ shmseg->shm_segsz = uap->size;
+ shmseg->shm_cpid = td->td_proc->p_pid;
+ shmseg->shm_lpid = shmseg->shm_nattch = 0;
+ shmseg->shm_atime = shmseg->shm_dtime = 0;
+ shmseg->shm_ctime = time_second;
+ shm_committed += btoc(size);
+ shm_nused++;
+ if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
+ /*
+ * Somebody else wanted this key while we were asleep. Wake
+ * them up now.
+ */
+ shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
+ wakeup((caddr_t)shmseg);
+ }
+ td->td_retval[0] = shmid;
+ return 0;
+}
+
+/*
+ * MPSAFE
+ */
+int
+shmget(td, uap)
+ struct thread *td;
+ struct shmget_args *uap;
+{
+ int segnum, mode;
+ int error;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ mode = uap->shmflg & ACCESSPERMS;
+ if (uap->key != IPC_PRIVATE) {
+ again:
+ segnum = shm_find_segment_by_key(uap->key);
+ if (segnum >= 0) {
+ error = shmget_existing(td, uap, mode, segnum);
+ if (error == EAGAIN)
+ goto again;
+ goto done2;
+ }
+ if ((uap->shmflg & IPC_CREAT) == 0) {
+ error = ENOENT;
+ goto done2;
+ }
+ }
+ error = shmget_allocate_segment(td, uap, mode);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+shmsys(td, uap)
+ struct thread *td;
+ /* XXX actually varargs. */
+ struct shmsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ } */ *uap;
+{
+ int error;
+
+ if (!jail_sysvipc_allowed && jailed(td->td_ucred))
+ return (ENOSYS);
+ if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
+ return (EINVAL);
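+ /*
+ * Dispatch to the selected old-style shm call; that call's own
+ * arguments begin at uap->a2.
+ */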
+ mtx_lock(&Giant);
+ error = (*shmcalls[uap->which])(td, &uap->a2);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static void
+shmfork_myhook(p1, p2)
+ struct proc *p1, *p2;
+{
+ struct shmmap_state *shmmap_s;
+ size_t size;
+ int i;
+
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size);
+ p2->p_vmspace->vm_shm = (caddr_t)shmmap_s;
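+ /*
+ * Each segment attached in the parent is now also mapped by the
+ * child, so bump its attach count.
+ */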
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++;
+}
+
+static void
+shmexit_myhook(p)
+ struct proc *p;
+{
+ struct shmmap_state *shmmap_s;
+ int i;
+
+ GIANT_REQUIRED;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shm_delete_mapping(p, shmmap_s);
+ free((caddr_t)p->p_vmspace->vm_shm, M_SHM);
+ p->p_vmspace->vm_shm = NULL;
+}
+
+static void
+shmrealloc(void)
+{
+ int i;
+ struct shmid_ds *newsegs;
+
+ if (shmalloced >= shminfo.shmmni)
+ return;
+
+ newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
+ if (newsegs == NULL)
+ return;
+ for (i = 0; i < shmalloced; i++)
+ bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
+ for (; i < shminfo.shmmni; i++) {
+ newsegs[i].shm_perm.mode = SHMSEG_FREE;
+ newsegs[i].shm_perm.seq = 0;
+ }
+ free(shmsegs, M_SHM);
+ shmsegs = newsegs;
+ shmalloced = shminfo.shmmni;
+}
+
+static void
+shminit()
+{
+ int i;
+
+ TUNABLE_INT_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
+ shminfo.shmmax = shminfo.shmall * PAGE_SIZE;
+ TUNABLE_INT_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
+ TUNABLE_INT_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
+ TUNABLE_INT_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
+ TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
+
+ shmalloced = shminfo.shmmni;
+ shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
+ if (shmsegs == NULL)
+ panic("cannot allocate initial memory for sysvshm");
+ for (i = 0; i < shmalloced; i++) {
+ shmsegs[i].shm_perm.mode = SHMSEG_FREE;
+ shmsegs[i].shm_perm.seq = 0;
+ }
+ shm_last_free = 0;
+ shm_nused = 0;
+ shm_committed = 0;
+ shmexit_hook = &shmexit_myhook;
+ shmfork_hook = &shmfork_myhook;
+}
+
+static int
+shmunload()
+{
+
+ if (shm_nused > 0)
+ return (EBUSY);
+
+ free(shmsegs, M_SHM);
+ shmexit_hook = NULL;
+ shmfork_hook = NULL;
+ return (0);
+}
+
+static int
+sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
+}
+
+static int
+sysvshm_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ shminit();
+ break;
+ case MOD_UNLOAD:
+ error = shmunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvshm_mod = {
+ "sysvshm",
+ &sysvshm_modload,
+ NULL
+};
+
+SYSCALL_MODULE_HELPER(shmsys);
+SYSCALL_MODULE_HELPER(shmat);
+SYSCALL_MODULE_HELPER(shmctl);
+SYSCALL_MODULE_HELPER(shmdt);
+SYSCALL_MODULE_HELPER(shmget);
+
+DECLARE_MODULE(sysvshm, sysvshm_mod,
+ SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvshm, 1);
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
new file mode 100644
index 0000000..b9c5743
--- /dev/null
+++ b/sys/kern/tty.c
@@ -0,0 +1,2660 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2002 Networks Associates Technologies, Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed for the FreeBSD Project by
+ * ThinkSec AS and NAI Labs, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035
+ * ("CBOSS"), as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty.c 8.8 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+/*-
+ * TODO:
+ * o Fix races for sending the start char in ttyflush().
+ * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect().
+ * With luck, there will be MIN chars before select() returns.
+ * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it.
+ * o Don't allow input in TS_ZOMBIE case. It would be visible through
+ * FIONREAD.
+ * o Do the new sio locking stuff here and use it to avoid special
+ * case for EXTPROC?
+ * o Lock PENDIN too?
+ * o Move EXTPROC and/or PENDIN to t_state?
+ * o Wrap most of ttioctl in spltty/splx.
+ * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>.
+ * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set.
+ * o Don't allow certain termios flags to affect disciplines other
+ * than TTYDISC. Cancel their effects before switching disciplines
+ * and ignore them if they are set while we are in another
+ * discipline.
+ * o Now that historical speed conversions are handled here, don't
+ * do them in drivers.
+ * o Check for TS_CARR_ON being set while everything is closed and not
+ * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open,
+ * so it would live until the next open even if carrier drops.
+ * o Restore TS_WOPEN since it is useful in pstat. It must be cleared
+ * only when _all_ openers leave open().
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filio.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sx.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/proc.h>
+#define TTYDEFCHARS
+#include <sys/tty.h>
+#undef TTYDEFCHARS
+#include <sys/fcntl.h>
+#include <sys/conf.h>
+#include <sys/dkstat.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures");
+
+static int proc_compare(struct proc *p1, struct proc *p2);
+static int ttnread(struct tty *tp);
+static void ttyecho(int c, struct tty *tp);
+static int ttyoutput(int c, struct tty *tp);
+static void ttypend(struct tty *tp);
+static void ttyretype(struct tty *tp);
+static void ttyrub(int c, struct tty *tp);
+static void ttyrubo(struct tty *tp, int cnt);
+static void ttyunblock(struct tty *tp);
+static int ttywflush(struct tty *tp);
+static int filt_ttyread(struct knote *kn, long hint);
+static void filt_ttyrdetach(struct knote *kn);
+static int filt_ttywrite(struct knote *kn, long hint);
+static void filt_ttywdetach(struct knote *kn);
+
+/*
+ * Table with character classes and parity. The 8th bit indicates parity,
+ * the 7th bit indicates the character is an alphameric or underscore (for
+ * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits
+ * are 0 then the character needs no special processing on output; classes
+ * other than 0 might be translated or (not currently) require delays.
+ */
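+/*
+ * For example, char_type['A'] below is E|NA: even parity (0x41 has an
+ * even number of one bits), alphameric, ordinary output class.
+ */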
+#define E 0x00 /* Even parity. */
+#define O 0x80 /* Odd parity. */
+#define PARITY(c) (char_type[c] & O)
+
+#define ALPHA 0x40 /* Alpha or underscore. */
+#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA)
+
+#define CCLASSMASK 0x3f
+#define CCLASS(c) (char_type[c] & CCLASSMASK)
+
+#define BS BACKSPACE
+#define CC CONTROL
+#define CR RETURN
+#define NA ORDINARY | ALPHA
+#define NL NEWLINE
+#define NO ORDINARY
+#define TB TAB
+#define VT VTAB
+
+static u_char const char_type[] = {
+ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
+ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
+ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
+ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */
+ O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */
+ E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */
+ O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */
+ O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */
+ O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */
+ E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */
+ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */
+ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */
+ E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */
+ /*
+ * Meta chars; should be settable per character set;
+ * for now, treat them all as normal characters.
+ */
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+};
+#undef BS
+#undef CC
+#undef CR
+#undef NA
+#undef NL
+#undef NO
+#undef TB
+#undef VT
+
+/* Macros to clear/set/test flags. */
+#define SET(t, f) (t) |= (f)
+#define CLR(t, f) (t) &= ~(f)
+#define ISSET(t, f) ((t) & (f))
+
+#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */
+#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */
+
+/*
+ * List of struct tty that pstat(8) can pick up via sysctl.
+ */
+static SLIST_HEAD(, tty) tty_list;
+
+static int drainwait = 5*60;
+SYSCTL_INT(_kern, OID_AUTO, drainwait, CTLFLAG_RW, &drainwait,
+ 0, "Output drain timeout in seconds");
+
+/*
+ * Initial open of tty, or (re)entry to standard tty line discipline.
+ */
+int
+ttyopen(dev_t device, struct tty *tp)
+{
+ int s;
+
+ s = spltty();
+ tp->t_dev = device;
+ if (!ISSET(tp->t_state, TS_ISOPEN)) {
+ SET(tp->t_state, TS_ISOPEN);
+ if (ISSET(tp->t_cflag, CLOCAL))
+ SET(tp->t_state, TS_CONNECTED);
+ bzero(&tp->t_winsize, sizeof(tp->t_winsize));
+ }
+ /* XXX don't hang forever on output */
+ if (tp->t_timeout < 0)
+ tp->t_timeout = drainwait*hz;
+ ttsetwater(tp);
+ splx(s);
+ return (0);
+}
+
+/*
+ * Handle close() on a tty line: flush and set to initial state,
+ * bumping generation number so that pending read/write calls
+ * can detect recycling of the tty.
+ * XXX our caller should have done `spltty(); l_close(); ttyclose();'
+ * and l_close() should have flushed, but we repeat the spltty() and
+ * the flush in case there are buggy callers.
+ */
+int
+ttyclose(struct tty *tp)
+{
+ int s;
+
+ funsetown(&tp->t_sigio);
+ s = spltty();
+ if (constty == tp)
+ constty = NULL;
+
+ ttyflush(tp, FREAD | FWRITE);
+ clist_free_cblocks(&tp->t_canq);
+ clist_free_cblocks(&tp->t_outq);
+ clist_free_cblocks(&tp->t_rawq);
+
+ tp->t_gen++;
+ tp->t_line = TTYDISC;
+ tp->t_pgrp = NULL;
+ tp->t_session = NULL;
+ tp->t_state = 0;
+ splx(s);
+ return (0);
+}
+
+#define FLUSHQ(q) { \
+ if ((q)->c_cc) \
+ ndflush(q, (q)->c_cc); \
+}
+
+/* Is 'c' a line delimiter ("break" character)? */
+#define TTBREAKC(c, lflag) \
+ ((c) == '\n' || (((c) == cc[VEOF] || \
+ (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \
+ (c) != _POSIX_VDISABLE))
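+/*
+ * For example, '\n' always delimits a line; VEOF and VEOL do so only when
+ * not set to _POSIX_VDISABLE, and VEOL2 additionally requires IEXTEN.
+ */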
+
+/*
+ * Process input of a single character received on a tty.
+ */
+int
+ttyinput(int c, struct tty *tp)
+{
+ tcflag_t iflag, lflag;
+ cc_t *cc;
+ int i, err;
+
+ /*
+ * If input is pending take it first.
+ */
+ lflag = tp->t_lflag;
+ if (ISSET(lflag, PENDIN))
+ ttypend(tp);
+ /*
+ * Gather stats.
+ */
+ if (ISSET(lflag, ICANON)) {
+ ++tk_cancc;
+ ++tp->t_cancc;
+ } else {
+ ++tk_rawcc;
+ ++tp->t_rawcc;
+ }
+ ++tk_nin;
+
+ /*
+ * Block further input iff:
+ * current input > threshold AND input is available to user program
+ * AND input flow control is enabled and not yet invoked.
+ * The 3 is slop for PARMRK.
+ */
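+ /*
+ * (A PARMRK-marked byte can expand to the three characters
+ * 0377, 0, c; see the parmrk case below.)
+ */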
+ iflag = tp->t_iflag;
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 &&
+ (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) &&
+ (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) &&
+ !ISSET(tp->t_state, TS_TBLOCK))
+ ttyblock(tp);
+
+ /* Handle exceptional conditions (break, parity, framing). */
+ cc = tp->t_cc;
+ err = (ISSET(c, TTY_ERRORMASK));
+ if (err) {
+ CLR(c, TTY_ERRORMASK);
+ if (ISSET(err, TTY_BI)) {
+ if (ISSET(iflag, IGNBRK))
+ return (0);
+ if (ISSET(iflag, BRKINT)) {
+ ttyflush(tp, FREAD | FWRITE);
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, SIGINT, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ goto endcase;
+ }
+ if (ISSET(iflag, PARMRK))
+ goto parmrk;
+ } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK))
+ || ISSET(err, TTY_FE)) {
+ if (ISSET(iflag, IGNPAR))
+ return (0);
+ else if (ISSET(iflag, PARMRK)) {
+parmrk:
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >
+ MAX_INPUT - 3)
+ goto input_overflow;
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+ (void)putc(0 | TTY_QUOTE, &tp->t_rawq);
+ (void)putc(c | TTY_QUOTE, &tp->t_rawq);
+ goto endcase;
+ } else
+ c = 0;
+ }
+ }
+
+ if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
+ CLR(c, 0x80);
+ if (!ISSET(lflag, EXTPROC)) {
+ /*
+ * Check for literal nexting very first
+ */
+ if (ISSET(tp->t_state, TS_LNCH)) {
+ SET(c, TTY_QUOTE);
+ CLR(tp->t_state, TS_LNCH);
+ }
+ /*
+ * Scan for special characters. This code
+ * is really just a big case statement with
+ * non-constant cases. The bottom of the
+ * case statement is labeled ``endcase'', so goto
+ * it after a case match, or similar.
+ */
+
+ /*
+ * Control chars which aren't controlled
+ * by ICANON, ISIG, or IXON.
+ */
+ if (ISSET(lflag, IEXTEN)) {
+ if (CCEQ(cc[VLNEXT], c)) {
+ if (ISSET(lflag, ECHO)) {
+ if (ISSET(lflag, ECHOE)) {
+ (void)ttyoutput('^', tp);
+ (void)ttyoutput('\b', tp);
+ } else
+ ttyecho(c, tp);
+ }
+ SET(tp->t_state, TS_LNCH);
+ goto endcase;
+ }
+ if (CCEQ(cc[VDISCARD], c)) {
+ if (ISSET(lflag, FLUSHO))
+ CLR(tp->t_lflag, FLUSHO);
+ else {
+ ttyflush(tp, FWRITE);
+ ttyecho(c, tp);
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc)
+ ttyretype(tp);
+ SET(tp->t_lflag, FLUSHO);
+ }
+ goto startoutput;
+ }
+ }
+ /*
+ * Signals.
+ */
+ if (ISSET(lflag, ISIG)) {
+ if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) {
+ if (!ISSET(lflag, NOFLSH))
+ ttyflush(tp, FREAD | FWRITE);
+ ttyecho(c, tp);
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp,
+ CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ goto endcase;
+ }
+ if (CCEQ(cc[VSUSP], c)) {
+ if (!ISSET(lflag, NOFLSH))
+ ttyflush(tp, FREAD);
+ ttyecho(c, tp);
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, SIGTSTP, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ goto endcase;
+ }
+ }
+ /*
+ * Handle start/stop characters.
+ */
+ if (ISSET(iflag, IXON)) {
+ if (CCEQ(cc[VSTOP], c)) {
+ if (!ISSET(tp->t_state, TS_TTSTOP)) {
+ SET(tp->t_state, TS_TTSTOP);
+ (*tp->t_stop)(tp, 0);
+ return (0);
+ }
+ if (!CCEQ(cc[VSTART], c))
+ return (0);
+ /*
+ * if VSTART == VSTOP then toggle
+ */
+ goto endcase;
+ }
+ if (CCEQ(cc[VSTART], c))
+ goto restartoutput;
+ }
+ /*
+ * IGNCR, ICRNL, & INLCR
+ */
+ if (c == '\r') {
+ if (ISSET(iflag, IGNCR))
+ return (0);
+ else if (ISSET(iflag, ICRNL))
+ c = '\n';
+ } else if (c == '\n' && ISSET(iflag, INLCR))
+ c = '\r';
+ }
+ if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) {
+ /*
+ * From here on down canonical mode character
+ * processing takes place.
+ */
+ /*
+ * erase or erase2 (^H / ^?)
+ */
+ if (CCEQ(cc[VERASE], c) || CCEQ(cc[VERASE2], c) ) {
+ if (tp->t_rawq.c_cc)
+ ttyrub(unputc(&tp->t_rawq), tp);
+ goto endcase;
+ }
+ /*
+ * kill (^U)
+ */
+ if (CCEQ(cc[VKILL], c)) {
+ if (ISSET(lflag, ECHOKE) &&
+ tp->t_rawq.c_cc == tp->t_rocount &&
+ !ISSET(lflag, ECHOPRT))
+ while (tp->t_rawq.c_cc)
+ ttyrub(unputc(&tp->t_rawq), tp);
+ else {
+ ttyecho(c, tp);
+ if (ISSET(lflag, ECHOK) ||
+ ISSET(lflag, ECHOKE))
+ ttyecho('\n', tp);
+ FLUSHQ(&tp->t_rawq);
+ tp->t_rocount = 0;
+ }
+ CLR(tp->t_state, TS_LOCAL);
+ goto endcase;
+ }
+ /*
+ * word erase (^W)
+ */
+ if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) {
+ int ctype;
+
+ /*
+ * erase whitespace
+ */
+ while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t')
+ ttyrub(c, tp);
+ if (c == -1)
+ goto endcase;
+ /*
+ * erase last char of word and remember the
+ * next chars type (for ALTWERASE)
+ */
+ ttyrub(c, tp);
+ c = unputc(&tp->t_rawq);
+ if (c == -1)
+ goto endcase;
+ if (c == ' ' || c == '\t') {
+ (void)putc(c, &tp->t_rawq);
+ goto endcase;
+ }
+ ctype = ISALPHA(c);
+ /*
+ * erase rest of word
+ */
+ do {
+ ttyrub(c, tp);
+ c = unputc(&tp->t_rawq);
+ if (c == -1)
+ goto endcase;
+ } while (c != ' ' && c != '\t' &&
+ (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype));
+ (void)putc(c, &tp->t_rawq);
+ goto endcase;
+ }
+ /*
+ * reprint line (^R)
+ */
+ if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) {
+ ttyretype(tp);
+ goto endcase;
+ }
+ /*
+ * ^T - kernel info and generate SIGINFO
+ */
+ if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) {
+ if (ISSET(lflag, ISIG) && tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, SIGINFO, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ if (!ISSET(lflag, NOKERNINFO))
+ ttyinfo(tp);
+ goto endcase;
+ }
+ }
+ /*
+ * Check for input buffer overflow
+ */
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) {
+input_overflow:
+ if (ISSET(iflag, IMAXBEL)) {
+ if (tp->t_outq.c_cc < tp->t_ohiwat)
+ (void)ttyoutput(CTRL('g'), tp);
+ }
+ goto endcase;
+ }
+
+ if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP)
+ && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR))
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+
+ /*
+ * Put data char in q for user and
+ * wakeup on seeing a line delimiter.
+ */
+ if (putc(c, &tp->t_rawq) >= 0) {
+ if (!ISSET(lflag, ICANON)) {
+ ttwakeup(tp);
+ ttyecho(c, tp);
+ goto endcase;
+ }
+ if (TTBREAKC(c, lflag)) {
+ tp->t_rocount = 0;
+ catq(&tp->t_rawq, &tp->t_canq);
+ ttwakeup(tp);
+ } else if (tp->t_rocount++ == 0)
+ tp->t_rocol = tp->t_column;
+ if (ISSET(tp->t_state, TS_ERASE)) {
+ /*
+ * end of prterase \.../
+ */
+ CLR(tp->t_state, TS_ERASE);
+ (void)ttyoutput('/', tp);
+ }
+ i = tp->t_column;
+ ttyecho(c, tp);
+ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
+ /*
+ * Place the cursor over the '^' of the ^D.
+ */
+ i = imin(2, tp->t_column - i);
+ while (i > 0) {
+ (void)ttyoutput('\b', tp);
+ i--;
+ }
+ }
+ }
+endcase:
+ /*
+ * IXANY means allow any character to restart output.
+ */
+ if (ISSET(tp->t_state, TS_TTSTOP) &&
+ !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP])
+ return (0);
+restartoutput:
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_TTSTOP);
+startoutput:
+ return (ttstart(tp));
+}
+
+/*
+ * Output a single character on a tty, doing output processing
+ * as needed (expanding tabs, newline processing, etc.).
+ * Returns < 0 if it succeeds, otherwise returns the char to resend.
+ * Must be recursive.
+ */
+static int
+ttyoutput(int c, struct tty *tp)
+{
+ tcflag_t oflag;
+ int col, s;
+
+ oflag = tp->t_oflag;
+ if (!ISSET(oflag, OPOST)) {
+ if (ISSET(tp->t_lflag, FLUSHO))
+ return (-1);
+ if (putc(c, &tp->t_outq))
+ return (c);
+ tk_nout++;
+ tp->t_outcc++;
+ return (-1);
+ }
+ /*
+ * Do tab expansion if OXTABS is set. Special case: if we are doing
+ * external processing, we don't do the tab expansion because we'll
+ * probably get it wrong. If tab expansion needs to be done, let it
+ * happen externally.
+ */
+ CLR(c, ~TTY_CHARMASK);
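+ /*
+ * For example, a tab arriving in column 3 expands to 8 - (3 & 7) = 5
+ * spaces, leaving the cursor in column 8.
+ */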
+ if (c == '\t' &&
+ ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
+ c = 8 - (tp->t_column & 7);
+ if (!ISSET(tp->t_lflag, FLUSHO)) {
+ s = spltty(); /* Don't interrupt tabs. */
+ c -= b_to_q(" ", c, &tp->t_outq);
+ tk_nout += c;
+ tp->t_outcc += c;
+ splx(s);
+ }
+ tp->t_column += c;
+ return (c ? -1 : '\t');
+ }
+ if (c == CEOT && ISSET(oflag, ONOEOT))
+ return (-1);
+
+ /*
+ * Newline translation: if ONLCR is set,
+ * translate newline into "\r\n".
+ */
+ if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
+ tk_nout++;
+ tp->t_outcc++;
+ if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq))
+ return (c);
+ }
+ /* If OCRNL is set, translate "\r" into "\n". */
+ else if (c == '\r' && ISSET(tp->t_oflag, OCRNL))
+ c = '\n';
+ /* If ONOCR is set, don't transmit CRs when on column 0. */
+ else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0)
+ return (-1);
+
+ tk_nout++;
+ tp->t_outcc++;
+ if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
+ return (c);
+
+ col = tp->t_column;
+ switch (CCLASS(c)) {
+ case BACKSPACE:
+ if (col > 0)
+ --col;
+ break;
+ case CONTROL:
+ break;
+ case NEWLINE:
+ if (ISSET(tp->t_oflag, ONLCR | ONLRET))
+ col = 0;
+ break;
+ case RETURN:
+ col = 0;
+ break;
+ case ORDINARY:
+ ++col;
+ break;
+ case TAB:
+ col = (col + 8) & ~7;
+ break;
+ }
+ tp->t_column = col;
+ return (-1);
+}
+
+/*
+ * Ioctls for all tty devices. Called after line-discipline specific ioctl
+ * has been called to do discipline-specific functions and/or reject any
+ * of these ioctl commands.
+ */
+/* ARGSUSED */
+int
+ttioctl(struct tty *tp, u_long cmd, void *data, int flag)
+{
+ struct proc *p;
+ struct thread *td;
+ struct pgrp *pgrp;
+ int s, error;
+
+ td = curthread; /* XXX */
+ p = td->td_proc;
+
+ /* If the ioctl involves modification, hang if in the background. */
+ switch (cmd) {
+ case TIOCCBRK:
+ case TIOCCONS:
+ case TIOCDRAIN:
+ case TIOCEXCL:
+ case TIOCFLUSH:
+#ifdef TIOCHPCL
+ case TIOCHPCL:
+#endif
+ case TIOCNXCL:
+ case TIOCSBRK:
+ case TIOCSCTTY:
+ case TIOCSDRAINWAIT:
+ case TIOCSETA:
+ case TIOCSETAF:
+ case TIOCSETAW:
+ case TIOCSETD:
+ case TIOCSPGRP:
+ case TIOCSTART:
+ case TIOCSTAT:
+ case TIOCSTI:
+ case TIOCSTOP:
+ case TIOCSWINSZ:
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ case TIOCLBIC:
+ case TIOCLBIS:
+ case TIOCLSET:
+ case TIOCSETC:
+ case OTIOCSETD:
+ case TIOCSETN:
+ case TIOCSETP:
+ case TIOCSLTC:
+#endif
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+ while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) &&
+ !SIGISMEMBER(p->p_sigignore, SIGTTOU) &&
+ !SIGISMEMBER(p->p_sigmask, SIGTTOU)) {
+ pgrp = p->p_pgrp;
+ PROC_UNLOCK(p);
+ if (pgrp->pg_jobc == 0) {
+ sx_sunlock(&proctree_lock);
+ return (EIO);
+ }
+ PGRP_LOCK(pgrp);
+ sx_sunlock(&proctree_lock);
+ pgsignal(pgrp, SIGTTOU, 1);
+ PGRP_UNLOCK(pgrp);
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1",
+ 0);
+ if (error)
+ return (error);
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+ }
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+
+ switch (cmd) { /* Process the ioctl. */
+ case FIOASYNC: /* set/clear async i/o */
+ s = spltty();
+ if (*(int *)data)
+ SET(tp->t_state, TS_ASYNC);
+ else
+ CLR(tp->t_state, TS_ASYNC);
+ splx(s);
+ break;
+ case FIONBIO: /* set/clear non-blocking i/o */
+ break; /* XXX: delete. */
+ case FIONREAD: /* get # bytes to read */
+ s = spltty();
+ *(int *)data = ttnread(tp);
+ splx(s);
+ break;
+
+ case FIOSETOWN:
+ /*
+ * Policy -- Don't allow FIOSETOWN on someone else's
+ * controlling tty
+ */
+ if (tp->t_session != NULL && !isctty(p, tp))
+ return (ENOTTY);
+
+ error = fsetown(*(int *)data, &tp->t_sigio);
+ if (error)
+ return (error);
+ break;
+ case FIOGETOWN:
+ if (tp->t_session != NULL && !isctty(p, tp))
+ return (ENOTTY);
+ *(int *)data = fgetown(tp->t_sigio);
+ break;
+
+ case TIOCEXCL: /* set exclusive use of tty */
+ s = spltty();
+ SET(tp->t_state, TS_XCLUDE);
+ splx(s);
+ break;
+ case TIOCFLUSH: { /* flush buffers */
+ int flags = *(int *)data;
+
+ if (flags == 0)
+ flags = FREAD | FWRITE;
+ else
+ flags &= FREAD | FWRITE;
+ ttyflush(tp, flags);
+ break;
+ }
+ case TIOCCONS: /* become virtual console */
+ if (*(int *)data) {
+ struct nameidata nid;
+
+ if (constty && constty != tp &&
+ ISSET(constty->t_state, TS_CONNECTED))
+ return (EBUSY);
+
+ /* Ensure user can open the real console. */
+ NDINIT(&nid, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE,
+ "/dev/console", td);
+ if ((error = namei(&nid)) != 0)
+ return (error);
+ NDFREE(&nid, NDF_ONLY_PNBUF);
+ error = VOP_ACCESS(nid.ni_vp, VREAD, td->td_ucred, td);
+ vput(nid.ni_vp);
+ if (error)
+ return (error);
+
+ constty = tp;
+ } else if (tp == constty)
+ constty = NULL;
+ break;
+ case TIOCDRAIN: /* wait till output drained */
+ error = ttywait(tp);
+ if (error)
+ return (error);
+ break;
+ case TIOCGETA: { /* get termios struct */
+ struct termios *t = (struct termios *)data;
+
+ bcopy(&tp->t_termios, t, sizeof(struct termios));
+ break;
+ }
+ case TIOCGETD: /* get line discipline */
+ *(int *)data = tp->t_line;
+ break;
+ case TIOCGWINSZ: /* get window size */
+ *(struct winsize *)data = tp->t_winsize;
+ break;
+ case TIOCGPGRP: /* get pgrp of tty */
+ if (!isctty(p, tp))
+ return (ENOTTY);
+ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ break;
+#ifdef TIOCHPCL
+ case TIOCHPCL: /* hang up on last close */
+ s = spltty();
+ SET(tp->t_cflag, HUPCL);
+ splx(s);
+ break;
+#endif
+ case TIOCNXCL: /* reset exclusive use of tty */
+ s = spltty();
+ CLR(tp->t_state, TS_XCLUDE);
+ splx(s);
+ break;
+ case TIOCOUTQ: /* output queue size */
+ *(int *)data = tp->t_outq.c_cc;
+ break;
+ case TIOCSETA: /* set termios struct */
+ case TIOCSETAW: /* drain output, set */
+ case TIOCSETAF: { /* drn out, fls in, set */
+ struct termios *t = (struct termios *)data;
+
+ if (t->c_ispeed == 0)
+ t->c_ispeed = t->c_ospeed;
+ if (t->c_ispeed == 0)
+ t->c_ispeed = tp->t_ospeed;
+ if (t->c_ispeed == 0)
+ return (EINVAL);
+ s = spltty();
+ if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ error = ttywait(tp);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ if (cmd == TIOCSETAF)
+ ttyflush(tp, FREAD);
+ }
+ if (!ISSET(t->c_cflag, CIGNORE)) {
+ /*
+ * Set device hardware.
+ */
+ if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
+ splx(s);
+ return (error);
+ }
+ if (ISSET(t->c_cflag, CLOCAL) &&
+ !ISSET(tp->t_cflag, CLOCAL)) {
+ /*
+ * XXX disconnections would be too hard to
+ * get rid of without this kludge. The only
+ * way to get rid of controlling terminals
+ * is to exit from the session leader.
+ */
+ CLR(tp->t_state, TS_ZOMBIE);
+
+ wakeup(TSA_CARR_ON(tp));
+ ttwakeup(tp);
+ ttwwakeup(tp);
+ }
+ if ((ISSET(tp->t_state, TS_CARR_ON) ||
+ ISSET(t->c_cflag, CLOCAL)) &&
+ !ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ else
+ CLR(tp->t_state, TS_CONNECTED);
+ tp->t_cflag = t->c_cflag;
+ tp->t_ispeed = t->c_ispeed;
+ if (t->c_ospeed != 0)
+ tp->t_ospeed = t->c_ospeed;
+ ttsetwater(tp);
+ }
+ if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) &&
+ cmd != TIOCSETAF) {
+ if (ISSET(t->c_lflag, ICANON))
+ SET(tp->t_lflag, PENDIN);
+ else {
+ /*
+ * XXX we really shouldn't allow toggling
+ * ICANON while we're in a non-termios line
+ * discipline. Now we have to worry about
+ * panicking for a null queue.
+ */
+ if (tp->t_canq.c_cbreserved > 0 &&
+ tp->t_rawq.c_cbreserved > 0) {
+ catq(&tp->t_rawq, &tp->t_canq);
+ /*
+ * XXX the queue limits may be
+ * different, so the old queue
+ * swapping method no longer works.
+ */
+ catq(&tp->t_canq, &tp->t_rawq);
+ }
+ CLR(tp->t_lflag, PENDIN);
+ }
+ ttwakeup(tp);
+ }
+ tp->t_iflag = t->c_iflag;
+ tp->t_oflag = t->c_oflag;
+ /*
+ * Make the EXTPROC bit read only.
+ */
+ if (ISSET(tp->t_lflag, EXTPROC))
+ SET(t->c_lflag, EXTPROC);
+ else
+ CLR(t->c_lflag, EXTPROC);
+ tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
+ if (t->c_cc[VMIN] != tp->t_cc[VMIN] ||
+ t->c_cc[VTIME] != tp->t_cc[VTIME])
+ ttwakeup(tp);
+ bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc));
+ splx(s);
+ break;
+ }
+ case TIOCSETD: { /* set line discipline */
+ int t = *(int *)data;
+ dev_t device = tp->t_dev;
+
+ if ((u_int)t >= nlinesw)
+ return (ENXIO);
+ if (t != tp->t_line) {
+ s = spltty();
+ (*linesw[tp->t_line].l_close)(tp, flag);
+ error = (*linesw[t].l_open)(device, tp);
+ if (error) {
+ (void)(*linesw[tp->t_line].l_open)(device, tp);
+ splx(s);
+ return (error);
+ }
+ tp->t_line = t;
+ splx(s);
+ }
+ break;
+ }
+ case TIOCSTART: /* start output, like ^Q */
+ s = spltty();
+ if (ISSET(tp->t_state, TS_TTSTOP) ||
+ ISSET(tp->t_lflag, FLUSHO)) {
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_TTSTOP);
+ ttstart(tp);
+ }
+ splx(s);
+ break;
+ case TIOCSTI: /* simulate terminal input */
+ if ((flag & FREAD) == 0 && suser(td))
+ return (EPERM);
+ if (!isctty(p, tp) && suser(td))
+ return (EACCES);
+ s = spltty();
+ (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp);
+ splx(s);
+ break;
+ case TIOCSTOP: /* stop output, like ^S */
+ s = spltty();
+ if (!ISSET(tp->t_state, TS_TTSTOP)) {
+ SET(tp->t_state, TS_TTSTOP);
+ (*tp->t_stop)(tp, 0);
+ }
+ splx(s);
+ break;
+ case TIOCSCTTY: /* become controlling tty */
+ /* Session ctty vnode pointer set in vnode layer. */
+ sx_slock(&proctree_lock);
+ if (!SESS_LEADER(p) ||
+ ((p->p_session->s_ttyvp || tp->t_session) &&
+ (tp->t_session != p->p_session))) {
+ sx_sunlock(&proctree_lock);
+ return (EPERM);
+ }
+ tp->t_session = p->p_session;
+ tp->t_pgrp = p->p_pgrp;
+ SESS_LOCK(p->p_session);
+ p->p_session->s_ttyp = tp;
+ SESS_UNLOCK(p->p_session);
+ PROC_LOCK(p);
+ p->p_flag |= P_CONTROLT;
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ break;
+ case TIOCSPGRP: { /* set pgrp of tty */
+ sx_slock(&proctree_lock);
+ pgrp = pgfind(*(int *)data);
+ if (!isctty(p, tp)) {
+ if (pgrp != NULL)
+ PGRP_UNLOCK(pgrp);
+ sx_sunlock(&proctree_lock);
+ return (ENOTTY);
+ }
+ if (pgrp == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (EPERM);
+ }
+ PGRP_UNLOCK(pgrp);
+ if (pgrp->pg_session != p->p_session) {
+ sx_sunlock(&proctree_lock);
+ return (EPERM);
+ }
+ sx_sunlock(&proctree_lock);
+ tp->t_pgrp = pgrp;
+ break;
+ }
+ case TIOCSTAT: /* simulate control-T */
+ s = spltty();
+ ttyinfo(tp);
+ splx(s);
+ break;
+ case TIOCSWINSZ: /* set window size */
+ if (bcmp((caddr_t)&tp->t_winsize, data,
+ sizeof (struct winsize))) {
+ tp->t_winsize = *(struct winsize *)data;
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, SIGWINCH, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ }
+ break;
+ case TIOCSDRAINWAIT:
+ error = suser(td);
+ if (error)
+ return (error);
+ tp->t_timeout = *(int *)data * hz;
+ wakeup(TSA_OCOMPLETE(tp));
+ wakeup(TSA_OLOWAT(tp));
+ break;
+ case TIOCGDRAINWAIT:
+ *(int *)data = tp->t_timeout / hz;
+ break;
+ default:
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ return (ttcompat(tp, cmd, data, flag));
+#else
+ return (ENOIOCTL);
+#endif
+ }
+ return (0);
+}
+
+int
+ttypoll(dev_t dev, int events, struct thread *td)
+{
+ int s;
+ int revents = 0;
+ struct tty *tp;
+
+ tp = dev->si_tty;
+ if (tp == NULL) /* XXX used to return ENXIO, but that means true! */
+ return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM))
+ | POLLHUP);
+
+ s = spltty();
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE))
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &tp->t_rsel);
+ }
+ if (events & (POLLOUT | POLLWRNORM)) {
+ if ((tp->t_outq.c_cc <= tp->t_olowat &&
+ ISSET(tp->t_state, TS_CONNECTED))
+ || ISSET(tp->t_state, TS_ZOMBIE))
+ revents |= events & (POLLOUT | POLLWRNORM);
+ else
+ selrecord(td, &tp->t_wsel);
+ }
+ splx(s);
+ return (revents);
+}
+
+static struct filterops ttyread_filtops =
+ { 1, NULL, filt_ttyrdetach, filt_ttyread };
+static struct filterops ttywrite_filtops =
+ { 1, NULL, filt_ttywdetach, filt_ttywrite };
+
+int
+ttykqfilter(dev_t dev, struct knote *kn)
+{
+ struct tty *tp = dev->si_tty;
+ struct klist *klist;
+ int s;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ klist = &tp->t_rsel.si_note;
+ kn->kn_fop = &ttyread_filtops;
+ break;
+ case EVFILT_WRITE:
+ klist = &tp->t_wsel.si_note;
+ kn->kn_fop = &ttywrite_filtops;
+ break;
+ default:
+ return (1);
+ }
+
+ kn->kn_hook = (caddr_t)dev;
+
+ s = spltty();
+ SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+ splx(s);
+
+ return (0);
+}
+
+static void
+filt_ttyrdetach(struct knote *kn)
+{
+ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
+ int s = spltty();
+
+ SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext);
+ splx(s);
+}
+
+static int
+filt_ttyread(struct knote *kn, long hint)
+{
+ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
+
+ kn->kn_data = ttnread(tp);
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ }
+ return (kn->kn_data > 0);
+}
+
+static void
+filt_ttywdetach(struct knote *kn)
+{
+ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
+ int s = spltty();
+
+ SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext);
+ splx(s);
+}
+
+static int
+filt_ttywrite(struct knote *kn, long hint)
+{
+ struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
+
+ kn->kn_data = tp->t_outq.c_cc;
+ if (ISSET(tp->t_state, TS_ZOMBIE))
+ return (1);
+ return (kn->kn_data <= tp->t_olowat &&
+ ISSET(tp->t_state, TS_CONNECTED));
+}
+
+/*
+ * Must be called at spltty().
+ */
+static int
+ttnread(struct tty *tp)
+{
+ int nread;
+
+ if (ISSET(tp->t_lflag, PENDIN))
+ ttypend(tp);
+ nread = tp->t_canq.c_cc;
+ if (!ISSET(tp->t_lflag, ICANON)) {
+ nread += tp->t_rawq.c_cc;
+ if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0)
+ nread = 0;
+ }
+ return (nread);
+}
+
+/*
+ * Wait for output to drain.
+ */
+int
+ttywait(struct tty *tp)
+{
+ int error, s;
+
+ error = 0;
+ s = spltty();
+ while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+ ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) {
+ (*tp->t_oproc)(tp);
+ if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+ ISSET(tp->t_state, TS_CONNECTED)) {
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ error = ttysleep(tp, TSA_OCOMPLETE(tp),
+ TTOPRI | PCATCH, "ttywai",
+ tp->t_timeout);
+ if (error) {
+ if (error == EWOULDBLOCK)
+ error = EIO;
+ break;
+ }
+ } else
+ break;
+ }
+ if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)))
+ error = EIO;
+ splx(s);
+ return (error);
+}
+
+/*
+ * Wait for output to drain; if that succeeds, flush the input queues.
+ */
+static int
+ttywflush(struct tty *tp)
+{
+ int error;
+
+ if ((error = ttywait(tp)) == 0)
+ ttyflush(tp, FREAD);
+ return (error);
+}
+
+/*
+ * Flush tty read and/or write queues, notifying anyone waiting.
+ */
+void
+ttyflush(struct tty *tp, int rw)
+{
+ int s;
+
+ s = spltty();
+#if 0
+again:
+#endif
+ if (rw & FWRITE) {
+ FLUSHQ(&tp->t_outq);
+ CLR(tp->t_state, TS_TTSTOP);
+ }
+ (*tp->t_stop)(tp, rw);
+ if (rw & FREAD) {
+ FLUSHQ(&tp->t_canq);
+ FLUSHQ(&tp->t_rawq);
+ CLR(tp->t_lflag, PENDIN);
+ tp->t_rocount = 0;
+ tp->t_rocol = 0;
+ CLR(tp->t_state, TS_LOCAL);
+ ttwakeup(tp);
+ if (ISSET(tp->t_state, TS_TBLOCK)) {
+ if (rw & FWRITE)
+ FLUSHQ(&tp->t_outq);
+ ttyunblock(tp);
+
+ /*
+ * Don't leave any state that might clobber the
+ * next line discipline (although we should do more
+ * to send the START char). Not clearing the state
+ * may have caused the "putc to a clist with no
+ * reserved cblocks" panic/printf.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+
+#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */
+ if (ISSET(tp->t_iflag, IXOFF)) {
+ /*
+ * XXX wait a bit in the hope that the stop
+ * character (if any) will go out. Waiting
+ * isn't good since it allows races. This
+ * will be fixed when the stop character is
+ * put in a special queue. Don't bother with
+ * the checks in ttywait() since the timeout
+ * will save us.
+ */
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI,
+ "ttyfls", hz / 10);
+ /*
+ * Don't try sending the stop character again.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+ goto again;
+ }
+#endif
+ }
+ }
+ if (rw & FWRITE) {
+ FLUSHQ(&tp->t_outq);
+ ttwwakeup(tp);
+ }
+ splx(s);
+}
+
+/*
+ * Copy in the default termios characters.
+ */
+void
+termioschars(struct termios *t)
+{
+
+ bcopy(ttydefchars, t->c_cc, sizeof t->c_cc);
+}
+
+/*
+ * Old interface.
+ */
+void
+ttychars(struct tty *tp)
+{
+
+ termioschars(&tp->t_termios);
+}
+
+/*
+ * Handle input high water. Send stop character for the IXOFF case. Turn
+ * on our input flow control bit and propagate the changes to the driver.
+ * XXX the stop character should be put in a special high priority queue.
+ */
+void
+ttyblock(struct tty *tp)
+{
+
+ SET(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTOP], &tp->t_outq) != 0)
+ CLR(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
+}
+
+/*
+ * Handle input low water. Send start character for the IXOFF case. Turn
+ * off our input flow control bit and propagate the changes to the driver.
+ * XXX the start character should be put in a special high priority queue.
+ */
+static void
+ttyunblock(struct tty *tp)
+{
+
+ CLR(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTART], &tp->t_outq) != 0)
+ SET(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
+}
+
+#ifdef notyet
+/* Not used by any current (i386) drivers. */
+/*
+ * Restart after an inter-char delay.
+ */
+void
+ttrstrt(void *tp_arg)
+{
+ struct tty *tp;
+ int s;
+
+ KASSERT(tp_arg != NULL, ("ttrstrt"));
+
+ tp = tp_arg;
+ s = spltty();
+
+ CLR(tp->t_state, TS_TIMEOUT);
+ ttstart(tp);
+
+ splx(s);
+}
+#endif
+
+int
+ttstart(struct tty *tp)
+{
+
+ if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */
+ (*tp->t_oproc)(tp);
+ return (0);
+}
+
+/*
+ * "close" a line discipline
+ */
+int
+ttylclose(struct tty *tp, int flag)
+{
+
+ if (flag & FNONBLOCK || ttywflush(tp))
+ ttyflush(tp, FREAD | FWRITE);
+ return (0);
+}
+
+/*
+ * Handle modem control transition on a tty.
+ * Flag indicates new state of carrier.
+ * Returns 0 if the line should be turned off, otherwise 1.
+ */
+int
+ttymodem(struct tty *tp, int flag)
+{
+
+ if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) {
+ /*
+ * MDMBUF: do flow control according to carrier flag
+ * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP
+ * works if IXON and IXANY are clear.
+ */
+ if (flag) {
+ CLR(tp->t_state, TS_CAR_OFLOW);
+ CLR(tp->t_state, TS_TTSTOP);
+ ttstart(tp);
+ } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) {
+ SET(tp->t_state, TS_CAR_OFLOW);
+ SET(tp->t_state, TS_TTSTOP);
+ (*tp->t_stop)(tp, 0);
+ }
+ } else if (flag == 0) {
+ /*
+ * Lost carrier.
+ */
+ CLR(tp->t_state, TS_CARR_ON);
+ if (ISSET(tp->t_state, TS_ISOPEN) &&
+ !ISSET(tp->t_cflag, CLOCAL)) {
+ SET(tp->t_state, TS_ZOMBIE);
+ CLR(tp->t_state, TS_CONNECTED);
+ if (tp->t_session) {
+ sx_slock(&proctree_lock);
+ if (tp->t_session->s_leader) {
+ struct proc *p;
+
+ p = tp->t_session->s_leader;
+ PROC_LOCK(p);
+ psignal(p, SIGHUP);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&proctree_lock);
+ }
+ ttyflush(tp, FREAD | FWRITE);
+ return (0);
+ }
+ } else {
+ /*
+ * Carrier now on.
+ */
+ SET(tp->t_state, TS_CARR_ON);
+ if (!ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ wakeup(TSA_CARR_ON(tp));
+ ttwakeup(tp);
+ ttwwakeup(tp);
+ }
+ return (1);
+}
+
+/*
+ * Reinput pending characters after a state switch.
+ * Call at spltty().
+ */
+static void
+ttypend(struct tty *tp)
+{
+ struct clist tq;
+ int c;
+
+ CLR(tp->t_lflag, PENDIN);
+ SET(tp->t_state, TS_TYPEN);
+ /*
+ * XXX this assumes too much about clist internals. It may even
+ * fail if the cblock slush pool is empty. We can't allocate more
+ * cblocks here because we are called from an interrupt handler
+ * and clist_alloc_cblocks() can wait.
+ */
+ tq = tp->t_rawq;
+ bzero(&tp->t_rawq, sizeof tp->t_rawq);
+ tp->t_rawq.c_cbmax = tq.c_cbmax;
+ tp->t_rawq.c_cbreserved = tq.c_cbreserved;
+ while ((c = getc(&tq)) >= 0)
+ ttyinput(c, tp);
+ CLR(tp->t_state, TS_TYPEN);
+}
+
+/*
+ * Process a read call on a tty device.
+ */
+int
+ttread(struct tty *tp, struct uio *uio, int flag)
+{
+ struct clist *qp;
+ int c;
+ tcflag_t lflag;
+ cc_t *cc = tp->t_cc;
+ struct proc *p = curproc;
+ int s, first, error = 0;
+ int has_stime = 0, last_cc = 0;
+ long slp = 0; /* XXX this should be renamed `timo'. */
+ struct timeval stime;
+ struct pgrp *pg;
+
+loop:
+ s = spltty();
+ lflag = tp->t_lflag;
+ /*
+ * take pending input first
+ */
+ if (ISSET(lflag, PENDIN)) {
+ ttypend(tp);
+ splx(s); /* reduce latency */
+ s = spltty();
+ lflag = tp->t_lflag; /* XXX ttypend() clobbers it */
+ }
+
+ /*
+ * Hang process if it's in the background.
+ */
+ if (isbackground(p, tp)) {
+ splx(s);
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+ if (SIGISMEMBER(p->p_sigignore, SIGTTIN) ||
+ SIGISMEMBER(p->p_sigmask, SIGTTIN) ||
+ (p->p_flag & P_PPWAIT) || p->p_pgrp->pg_jobc == 0) {
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ return (EIO);
+ }
+ pg = p->p_pgrp;
+ PROC_UNLOCK(p);
+ PGRP_LOCK(pg);
+ sx_sunlock(&proctree_lock);
+ pgsignal(pg, SIGTTIN, 1);
+ PGRP_UNLOCK(pg);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0);
+ if (error)
+ return (error);
+ goto loop;
+ }
+
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ return (0); /* EOF */
+ }
+
+ /*
+ * If canonical, use the canonical queue,
+ * else use the raw queue.
+ *
+ * (should get rid of clists...)
+ */
+ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq;
+
+ if (flag & IO_NDELAY) {
+ if (qp->c_cc > 0)
+ goto read;
+ if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) {
+ splx(s);
+ return (0);
+ }
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ if (!ISSET(lflag, ICANON)) {
+ int m = cc[VMIN];
+ long t = cc[VTIME];
+ struct timeval timecopy;
+
+ /*
+ * Check each of the four combinations.
+ * (m > 0 && t == 0) is the normal read case.
+ * It should be fairly efficient, so we check that and its
+ * companion case (m == 0 && t == 0) first.
+ * For the other two cases, we compute the target sleep time
+ * into slp.
+ */
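+ /*
+ * In the timed cases below, TIME acts as an inter-byte timer when
+ * MIN > 0 (restarted whenever another character arrives) and as an
+ * overall read timer when MIN == 0.
+ */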
+ if (t == 0) {
+ if (qp->c_cc < m)
+ goto sleep;
+ if (qp->c_cc > 0)
+ goto read;
+
+ /* m, t and qp->c_cc are all 0. 0 is enough input. */
+ splx(s);
+ return (0);
+ }
+ t *= 100000; /* time in us */
+#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \
+ ((t1).tv_usec - (t2).tv_usec))
+ if (m > 0) {
+ if (qp->c_cc <= 0)
+ goto sleep;
+ if (qp->c_cc >= m)
+ goto read;
+ getmicrotime(&timecopy);
+ if (!has_stime) {
+ /* first character, start timer */
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else if (qp->c_cc > last_cc) {
+ /* got a character, restart timer */
+ stime = timecopy;
+ slp = t;
+ } else {
+ /* nothing, check expiration */
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0)
+ goto read;
+ }
+ last_cc = qp->c_cc;
+ } else { /* m == 0 */
+ if (qp->c_cc > 0)
+ goto read;
+ getmicrotime(&timecopy);
+ if (!has_stime) {
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else {
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0) {
+ /* Timed out, but 0 is enough input. */
+ splx(s);
+ return (0);
+ }
+ }
+ }
+#undef diff
+ /*
+ * Rounding down may make us wake up just short
+ * of the target, so we round up.
+ * The formula is ceiling(slp * hz/1000000).
+ * 32-bit arithmetic is enough for hz < 169.
+ * XXX see tvtohz() for how to avoid overflow if hz
+ * is large (divide by `tick' and/or arrange to
+ * use tvtohz() if hz is large).
+ */
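+ /*
+ * For example, slp = 100000 us (VTIME = 1) with hz = 100 becomes
+ * (100000 * 100 + 999999) / 1000000 = 10 ticks.
+ */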
+ slp = (long) (((u_long)slp * hz) + 999999) / 1000000;
+ goto sleep;
+ }
+ if (qp->c_cc <= 0) {
+sleep:
+ /*
+ * There is no input, or not enough input and we can block.
+ */
+ error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH,
+ ISSET(tp->t_state, TS_CONNECTED) ?
+ "ttyin" : "ttyhup", (int)slp);
+ splx(s);
+ if (error == EWOULDBLOCK)
+ error = 0;
+ else if (error)
+ return (error);
+ /*
+ * XXX what happens if another process eats some input
+ * while we are asleep (not just here)? It would be
+ * safest to detect changes and reset our state variables
+ * (has_stime and last_cc).
+ */
+ slp = 0;
+ goto loop;
+ }
+read:
+ splx(s);
+ /*
+ * Input present, check for input mapping and processing.
+ */
+ first = 1;
+ if (ISSET(lflag, ICANON | ISIG))
+ goto slowcase;
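+ /*
+ * Fast path: neither canonical nor signal processing is enabled, so
+ * move raw bytes to the user in bulk with q_to_b().
+ */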
+ for (;;) {
+ char ibuf[IBUFSIZ];
+ int icc;
+
+ icc = imin(uio->uio_resid, IBUFSIZ);
+ icc = q_to_b(qp, ibuf, icc);
+ if (icc <= 0) {
+ if (first)
+ goto loop;
+ break;
+ }
+ error = uiomove(ibuf, icc, uio);
+ /*
+ * XXX if there was an error then we should ungetc() the
+ * unmoved chars and reduce icc here.
+ */
+ if (error)
+ break;
+ if (uio->uio_resid == 0)
+ break;
+ first = 0;
+ }
+ goto out;
+slowcase:
+ for (;;) {
+ c = getc(qp);
+ if (c < 0) {
+ if (first)
+ goto loop;
+ break;
+ }
+ /*
+ * delayed suspend (^Y)
+ */
+ if (CCEQ(cc[VDSUSP], c) &&
+ ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) {
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, SIGTSTP, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ if (first) {
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH,
+ "ttybg3", 0);
+ if (error)
+ break;
+ goto loop;
+ }
+ break;
+ }
+ /*
+ * Interpret EOF only in canonical mode.
+ */
+ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON))
+ break;
+ /*
+ * Give user character.
+ */
+ error = ureadc(c, uio);
+ if (error)
+ /* XXX should ungetc(c, qp). */
+ break;
+ if (uio->uio_resid == 0)
+ break;
+ /*
+ * In canonical mode check for a "break character"
+ * marking the end of a "line of input".
+ */
+ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
+ break;
+ first = 0;
+ }
+
+out:
+ /*
+ * Look to unblock input now that (presumably)
+ * the input queue has gone down.
+ */
+ s = spltty();
+ if (ISSET(tp->t_state, TS_TBLOCK) &&
+ tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat)
+ ttyunblock(tp);
+ splx(s);
+
+ return (error);
+}
+
+/*
+ * Check the output queue on tp for space for a kernel message (from uprintf
+ * or tprintf). Allow some space over the normal hiwater mark so we don't
+ * lose messages due to normal flow control, but don't let the tty run amok.
+ * Sleeps here are not interruptible, but we return prematurely if new signals
+ * arrive.
+ */
+int
+ttycheckoutq(struct tty *tp, int wait)
+{
+ int hiwat, s;
+ sigset_t oldmask;
+
+ hiwat = tp->t_ohiwat;
+ SIGEMPTYSET(oldmask);
+ s = spltty();
+ if (wait)
+ oldmask = curproc->p_siglist;
+ if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100)
+ while (tp->t_outq.c_cc > hiwat) {
+ ttstart(tp);
+ if (tp->t_outq.c_cc <= hiwat)
+ break;
+ if (!(wait && SIGSETEQ(curproc->p_siglist, oldmask))) {
+ splx(s);
+ return (0);
+ }
+ SET(tp->t_state, TS_SO_OLOWAT);
+ tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz);
+ }
+ splx(s);
+ return (1);
+}
+
+/*
+ * Process a write call on a tty device.
+ */
+int
+ttwrite(struct tty *tp, struct uio *uio, int flag)
+{
+ char *cp = NULL;
+ int cc, ce;
+ struct proc *p;
+ int i, hiwat, cnt, error, s;
+ char obuf[OBUFSIZ];
+
+ hiwat = tp->t_ohiwat;
+ cnt = uio->uio_resid;
+ error = 0;
+ cc = 0;
+loop:
+ s = spltty();
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ if (uio->uio_resid == cnt)
+ error = EIO;
+ goto out;
+ }
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
+ if (flag & IO_NDELAY) {
+ splx(s);
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ttydcd", 0);
+ splx(s);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ splx(s);
+ /*
+ * Hang the process if it's in the background.
+ */
+ p = curproc;
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+ if (isbackground(p, tp) &&
+ ISSET(tp->t_lflag, TOSTOP) && !(p->p_flag & P_PPWAIT) &&
+ !SIGISMEMBER(p->p_sigignore, SIGTTOU) &&
+ !SIGISMEMBER(p->p_sigmask, SIGTTOU)) {
+ if (p->p_pgrp->pg_jobc == 0) {
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ error = EIO;
+ goto out;
+ }
+ PROC_UNLOCK(p);
+ PGRP_LOCK(p->p_pgrp);
+ sx_sunlock(&proctree_lock);
+ pgsignal(p->p_pgrp, SIGTTOU, 1);
+ PGRP_UNLOCK(p->p_pgrp);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0);
+ if (error)
+ goto out;
+ goto loop;
+ } else {
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ }
+ /*
+ * Process the user's data in at most OBUFSIZ chunks. Perform any
+ * output translation. Keep track of high water mark, sleep on
+ * overflow awaiting device aid in acquiring new space.
+ */
+ while (uio->uio_resid > 0 || cc > 0) {
+ if (ISSET(tp->t_lflag, FLUSHO)) {
+ uio->uio_resid = 0;
+ return (0);
+ }
+ if (tp->t_outq.c_cc > hiwat)
+ goto ovhiwat;
+ /*
+ * Grab a hunk of data from the user, unless we have some
+ * leftover from last time.
+ */
+ if (cc == 0) {
+ cc = imin(uio->uio_resid, OBUFSIZ);
+ cp = obuf;
+ error = uiomove(cp, cc, uio);
+ if (error) {
+ cc = 0;
+ break;
+ }
+ }
+ /*
+ * If nothing fancy need be done, grab those characters we
+ * can handle without any of ttyoutput's processing and
+ * just transfer them to the output q. For those chars
+ * which require special processing (as indicated by the
+ * bits in char_type), call ttyoutput. After processing
+ * a hunk of data, look for FLUSHO so ^O's will take effect
+ * immediately.
+ */
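+ /*
+ * ce counts the leading characters that need no output processing;
+ * when ce is 0 the next character must go through ttyoutput().
+ */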
+ while (cc > 0) {
+ if (!ISSET(tp->t_oflag, OPOST))
+ ce = cc;
+ else {
+ ce = cc - scanc((u_int)cc, (u_char *)cp,
+ char_type, CCLASSMASK);
+ /*
+ * If ce is zero, then we're processing
+ * a special character through ttyoutput.
+ */
+ if (ce == 0) {
+ tp->t_rocount = 0;
+ if (ttyoutput(*cp, tp) >= 0) {
+ /* No Clists, wait a bit. */
+ ttstart(tp);
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt,
+ TTOPRI|PCATCH,
+ "ttybf1", 0);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ cp++;
+ cc--;
+ if (ISSET(tp->t_lflag, FLUSHO) ||
+ tp->t_outq.c_cc > hiwat)
+ goto ovhiwat;
+ continue;
+ }
+ }
+ /*
+ * A bunch of normal characters have been found.
+ * Transfer them en masse to the output queue and
+ * continue processing at the top of the loop.
+ * If there are any further characters in this
+ * <= OBUFSIZ chunk, the first should be a character
+ * requiring special handling by ttyoutput.
+ */
+ tp->t_rocount = 0;
+ i = b_to_q(cp, ce, &tp->t_outq);
+ ce -= i;
+ tp->t_column += ce;
+ cp += ce, cc -= ce, tk_nout += ce;
+ tp->t_outcc += ce;
+ if (i > 0) {
+ /* No Clists, wait a bit. */
+ ttstart(tp);
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH,
+ "ttybf2", 0);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ if (ISSET(tp->t_lflag, FLUSHO) ||
+ tp->t_outq.c_cc > hiwat)
+ break;
+ }
+ ttstart(tp);
+ }
+out:
+ /*
+ * If cc is nonzero, we leave the uio structure inconsistent, as the
+ * offset and iov pointers have moved forward, but it doesn't matter
+ * (the call will either return short or restart with a new uio).
+ */
+ uio->uio_resid += cc;
+ return (error);
+
+ovhiwat:
+ ttstart(tp);
+ s = spltty();
+ /*
+ * This can only occur if FLUSHO is set in t_lflag,
+ * or if ttstart/oproc is synchronous (or very fast).
+ */
+ if (tp->t_outq.c_cc <= hiwat) {
+ splx(s);
+ goto loop;
+ }
+ if (flag & IO_NDELAY) {
+ splx(s);
+ uio->uio_resid += cc;
+ return (uio->uio_resid == cnt ? EWOULDBLOCK : 0);
+ }
+ SET(tp->t_state, TS_SO_OLOWAT);
+ error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri",
+ tp->t_timeout);
+ splx(s);
+ if (error == EWOULDBLOCK)
+ error = EIO;
+ if (error)
+ goto out;
+ goto loop;
+}
+
+/*
+ * Rubout one character from the rawq of tp
+ * as cleanly as possible.
+ */
+static void
+ttyrub(int c, struct tty *tp)
+{
+ char *cp;
+ int savecol;
+ int tabc, s;
+
+ if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
+ return;
+ CLR(tp->t_lflag, FLUSHO);
+ if (ISSET(tp->t_lflag, ECHOE)) {
+ if (tp->t_rocount == 0) {
+ /*
+ * Screwed by ttwrite; retype
+ */
+ ttyretype(tp);
+ return;
+ }
+ if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
+ ttyrubo(tp, 2);
+ else {
+ CLR(c, ~TTY_CHARMASK);
+ switch (CCLASS(c)) {
+ case ORDINARY:
+ ttyrubo(tp, 1);
+ break;
+ case BACKSPACE:
+ case CONTROL:
+ case NEWLINE:
+ case RETURN:
+ case VTAB:
+ if (ISSET(tp->t_lflag, ECHOCTL))
+ ttyrubo(tp, 2);
+ break;
+ case TAB:
+ if (tp->t_rocount < tp->t_rawq.c_cc) {
+ ttyretype(tp);
+ return;
+ }
+ s = spltty();
+ savecol = tp->t_column;
+ SET(tp->t_state, TS_CNTTB);
+ SET(tp->t_lflag, FLUSHO);
+ tp->t_column = tp->t_rocol;
+ cp = tp->t_rawq.c_cf;
+ if (cp)
+ tabc = *cp; /* XXX FIX NEXTC */
+ for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc))
+ ttyecho(tabc, tp);
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_CNTTB);
+ splx(s);
+
+ /* savecol will now be length of the tab. */
+ savecol -= tp->t_column;
+ tp->t_column += savecol;
+ if (savecol > 8)
+ savecol = 8; /* overflow screw */
+ while (--savecol >= 0)
+ (void)ttyoutput('\b', tp);
+ break;
+ default: /* XXX */
+#define PANICSTR "ttyrub: would panic c = %d, val = %d\n"
+ (void)printf(PANICSTR, c, CCLASS(c));
+#ifdef notdef
+ panic(PANICSTR, c, CCLASS(c));
+#endif
+ }
+ }
+ } else if (ISSET(tp->t_lflag, ECHOPRT)) {
+ if (!ISSET(tp->t_state, TS_ERASE)) {
+ SET(tp->t_state, TS_ERASE);
+ (void)ttyoutput('\\', tp);
+ }
+ ttyecho(c, tp);
+ } else {
+ ttyecho(tp->t_cc[VERASE], tp);
+ /*
+ * This code may be executed not only when an ERASE key
+ * is pressed, but also when ^U (KILL) or ^W (WERASE) are.
+ * So, I didn't think it was worthwhile to pass the extra
+ * information (which would need an extra parameter,
+ * changing every call) needed to distinguish the ERASE2
+ * case from the ERASE.
+ */
+ }
+ --tp->t_rocount;
+}
+
+/*
+ * Back over cnt characters, erasing them.
+ */
+static void
+ttyrubo(struct tty *tp, int cnt)
+{
+
+ while (cnt-- > 0) {
+ (void)ttyoutput('\b', tp);
+ (void)ttyoutput(' ', tp);
+ (void)ttyoutput('\b', tp);
+ }
+}
+
+/*
+ * ttyretype --
+ * Reprint the rawq line. Note, it is assumed that c_cc has already
+ * been checked.
+ */
+static void
+ttyretype(struct tty *tp)
+{
+ char *cp;
+ int s, c;
+
+ /* Echo the reprint character. */
+ if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
+ ttyecho(tp->t_cc[VREPRINT], tp);
+
+ (void)ttyoutput('\n', tp);
+
+ /*
+ * XXX
+ * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE
+ * BIT OF FIRST CHAR.
+ */
+ s = spltty();
+ for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0);
+ cp != NULL; cp = nextc(&tp->t_canq, cp, &c))
+ ttyecho(c, tp);
+ for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0);
+ cp != NULL; cp = nextc(&tp->t_rawq, cp, &c))
+ ttyecho(c, tp);
+ CLR(tp->t_state, TS_ERASE);
+ splx(s);
+
+ tp->t_rocount = tp->t_rawq.c_cc;
+ tp->t_rocol = 0;
+}
+
+/*
+ * Echo a typed character to the terminal.
+ */
+static void
+ttyecho(int c, struct tty *tp)
+{
+
+ if (!ISSET(tp->t_state, TS_CNTTB))
+ CLR(tp->t_lflag, FLUSHO);
+ if ((!ISSET(tp->t_lflag, ECHO) &&
+ (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) ||
+ ISSET(tp->t_lflag, EXTPROC))
+ return;
+ if (ISSET(tp->t_lflag, ECHOCTL) &&
+ ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') ||
+ ISSET(c, TTY_CHARMASK) == 0177)) {
+ (void)ttyoutput('^', tp);
+ CLR(c, ~TTY_CHARMASK);
+ if (c == 0177)
+ c = '?';
+ else
+ c += 'A' - 1;
+ }
+ (void)ttyoutput(c, tp);
+}
+
+/*
+ * Wake up any readers on a tty.
+ */
+void
+ttwakeup(struct tty *tp)
+{
+
+ if (SEL_WAITING(&tp->t_rsel))
+ selwakeup(&tp->t_rsel);
+ if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
+ pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
+ wakeup(TSA_HUP_OR_INPUT(tp));
+ KNOTE(&tp->t_rsel.si_note, 0);
+}
+
+/*
+ * Wake up any writers on a tty.
+ */
+void
+ttwwakeup(struct tty *tp)
+{
+
+ if (SEL_WAITING(&tp->t_wsel) && tp->t_outq.c_cc <= tp->t_olowat)
+ selwakeup(&tp->t_wsel);
+ if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
+ pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
+ if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) ==
+ TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) {
+ CLR(tp->t_state, TS_SO_OCOMPLETE);
+ wakeup(TSA_OCOMPLETE(tp));
+ }
+ if (ISSET(tp->t_state, TS_SO_OLOWAT) &&
+ tp->t_outq.c_cc <= tp->t_olowat) {
+ CLR(tp->t_state, TS_SO_OLOWAT);
+ wakeup(TSA_OLOWAT(tp));
+ }
+ KNOTE(&tp->t_wsel.si_note, 0);
+}
+
+/*
+ * Look up a code for a specified speed in a conversion table;
+ * used by drivers to map software speed values to hardware parameters.
+ */
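+/*
+ * A typical driver declares a table terminated by a -1 entry and looks
+ * up the termios speed in it, e.g. (a sketch only; the table name and
+ * code values here are hypothetical):
+ *
+ *	static struct speedtab foospeedtab[] = {
+ *		{ 9600, 12 },
+ *		{ 19200, 6 },
+ *		{ -1, -1 }
+ *	};
+ *
+ *	code = ttspeedtab(t->c_ospeed, foospeedtab);
+ */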
+int
+ttspeedtab(int speed, struct speedtab *table)
+{
+
+ for ( ; table->sp_speed != -1; table++)
+ if (table->sp_speed == speed)
+ return (table->sp_code);
+ return (-1);
+}
+
+/*
+ * Set input and output watermarks and buffer sizes. For input, the
+ * high watermark is about one second's worth of input above empty, the
+ * low watermark is slightly below high water, and the buffer size is a
+ * driver-dependent amount above high water. For output, the watermarks
+ * are near the ends of the buffer, with about 1 second's worth of output
+ * between them. All this only applies to the standard line discipline.
+ */
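+/*
+ * Worked example of the input arithmetic below: at 9600 bps, with
+ * t_ispeedwat left at -1, cps is 960, so the input high watermark is
+ * 960 characters, the low watermark is 7/8 of that (840), and the raw
+ * queue is sized to cps plus the driver's t_ififosize.
+ */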
+void
+ttsetwater(struct tty *tp)
+{
+ int cps, ttmaxhiwat, x;
+
+ /* Input. */
+ clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ switch (tp->t_ispeedwat) {
+ case (speed_t)-1:
+ cps = tp->t_ispeed / 10;
+ break;
+ case 0:
+ /*
+ * This case is for old drivers that don't know about
+ * t_ispeedwat. Arrange for them to get the old buffer
+ * sizes and watermarks.
+ */
+ cps = TTYHOG - 2 * 256;
+ tp->t_ififosize = 2 * 256;
+ break;
+ default:
+ cps = tp->t_ispeedwat / 10;
+ break;
+ }
+ tp->t_ihiwat = cps;
+ tp->t_ilowat = 7 * cps / 8;
+ x = cps + tp->t_ififosize;
+ clist_alloc_cblocks(&tp->t_rawq, x, x);
+
+ /* Output. */
+ switch (tp->t_ospeedwat) {
+ case (speed_t)-1:
+ cps = tp->t_ospeed / 10;
+ ttmaxhiwat = 2 * TTMAXHIWAT;
+ break;
+ case 0:
+ cps = tp->t_ospeed / 10;
+ ttmaxhiwat = TTMAXHIWAT;
+ break;
+ default:
+ cps = tp->t_ospeedwat / 10;
+ ttmaxhiwat = 8 * TTMAXHIWAT;
+ break;
+ }
+#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x))
+ tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
+ x += cps;
+ x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */
+ tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */
+ x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */
+ x += OBUFSIZ + 100;
+ clist_alloc_cblocks(&tp->t_outq, x, x);
+#undef CLAMP
+}
+
+/*
+ * Report on state of foreground process group.
+ */
+void
+ttyinfo(struct tty *tp)
+{
+ struct proc *p, *pick;
+ struct timeval utime, stime;
+ const char *stmp;
+ long ltmp;
+ int tmp;
+ struct thread *td;
+
+ if (ttycheckoutq(tp,0) == 0)
+ return;
+
+ /* Print load average. */
+ tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
+ ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100);
+
+ if (tp->t_session == NULL)
+ ttyprintf(tp, "not a controlling terminal\n");
+ else if (tp->t_pgrp == NULL)
+ ttyprintf(tp, "no foreground process group\n");
+ else {
+ PGRP_LOCK(tp->t_pgrp);
+ if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == 0) {
+ PGRP_UNLOCK(tp->t_pgrp);
+ ttyprintf(tp, "empty foreground process group\n");
+ } else {
+ mtx_lock_spin(&sched_lock);
+
+ /* Pick interesting process. */
+ for (pick = NULL; p != 0; p = LIST_NEXT(p, p_pglist))
+ if (proc_compare(pick, p))
+ pick = p;
+ PGRP_UNLOCK(tp->t_pgrp);
+
+ td = FIRST_THREAD_IN_PROC(pick);
+ stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */
+ pick->p_stat == SMTX ? td->td_mtxname :
+ td->td_wmesg ? td->td_wmesg : "iowait";
+ calcru(pick, &utime, &stime, NULL);
+ ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT ||
+ pick->p_stat == SZOMB ? 0 :
+ pgtok(vmspace_resident_count(pick->p_vmspace));
+ mtx_unlock_spin(&sched_lock);
+
+ ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm,
+ pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp);
+
+ /* Print user time. */
+ ttyprintf(tp, "%ld.%02ldu ",
+ utime.tv_sec, utime.tv_usec / 10000);
+
+ /* Print system time. */
+ ttyprintf(tp, "%ld.%02lds ",
+ (long)stime.tv_sec, stime.tv_usec / 10000);
+
+ /* Print percentage cpu, resident set size. */
+ ttyprintf(tp, "%d%% %ldk\n", tmp / 100, ltmp);
+
+ }
+ }
+ tp->t_rocount = 0; /* so pending input will be retyped if BS */
+}
+
+/*
+ * Returns 1 if p2 is "better" than p1
+ *
+ * The algorithm for picking the "interesting" process is thus:
+ *
+ * 1) Only foreground processes are eligible - implied.
+ * 2) Runnable processes are favored over anything else. The runner
+ * with the highest cpu utilization is picked (p_estcpu). Ties are
+ * broken by picking the highest pid.
+ * 3) The sleeper with the shortest sleep time is next. With ties,
+ * we pick out just "short-term" sleepers (P_SINTR == 0).
+ * 4) Further ties are broken by picking the highest pid.
+ */
+#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL))
+#define TESTAB(a, b) ((a)<<1 | (b))
+#define ONLYA 2
+#define ONLYB 1
+#define BOTH 3
+
+static int
+proc_compare(struct proc *p1, struct proc *p2)
+{
+
+ int esta, estb;
+ struct ksegrp *kg;
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (p1 == NULL)
+ return (1);
+
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(ISRUN(p1), ISRUN(p2))) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ /*
+ * tie - favor one with highest recent cpu utilization
+ */
+ esta = estb = 0;
+ FOREACH_KSEGRP_IN_PROC(p1,kg) {
+ esta += kg->kg_estcpu;
+ }
+ FOREACH_KSEGRP_IN_PROC(p2,kg) {
+ estb += kg->kg_estcpu;
+ }
+ if (estb > esta)
+ return (1);
+ if (esta > estb)
+ return (0);
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ }
+ /*
+ * weed out zombies
+ */
+ switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) {
+ case ONLYA:
+ return (1);
+ case ONLYB:
+ return (0);
+ case BOTH:
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ }
+
+#if 0 /* XXXKSE */
+ /*
+ * pick the one with the smallest sleep time
+ */
+ if (p2->p_slptime > p1->p_slptime)
+ return (0);
+ if (p1->p_slptime > p2->p_slptime)
+ return (1);
+ /*
+ * favor one sleeping in a non-interruptible sleep
+ */
+ if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0)
+ return (1);
+ if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0)
+ return (0);
+#endif
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+}
+
+/*
+ * Output char to tty; console putchar style.
+ */
+int
+tputchar(int c, struct tty *tp)
+{
+ int s;
+
+ s = spltty();
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
+ splx(s);
+ return (-1);
+ }
+ if (c == '\n')
+ (void)ttyoutput('\r', tp);
+ (void)ttyoutput(c, tp);
+ ttstart(tp);
+ splx(s);
+ return (0);
+}
+
+/*
+ * Sleep on chan, returning ERESTART if tty changed while we napped and
+ * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If
+ * the tty is revoked, restarting a pending call will redo validation done
+ * at the start of the call.
+ */
+int
+ttysleep(struct tty *tp, void *chan, int pri, char *wmesg, int timo)
+{
+ int error;
+ int gen;
+
+ gen = tp->t_gen;
+ error = tsleep(chan, pri, wmesg, timo);
+ if (error)
+ return (error);
+ return (tp->t_gen == gen ? 0 : ERESTART);
+}
+
+/*
+ * Allocate a tty struct. Clists in the struct will be allocated by
+ * ttyopen().
+ */
+struct tty *
+ttymalloc(struct tty *tp)
+{
+
+ if (tp)
+ return(tp);
+ tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO);
+ ttyregister(tp);
+ return (tp);
+}
+
+#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */
+/*
+ * Free a tty struct. Clists in the struct should have been freed by
+ * ttyclose().
+ */
+void
+ttyfree(struct tty *tp)
+{
+ free(tp, M_TTYS);
+}
+#endif /* 0 */
+
+void
+ttyregister(struct tty *tp)
+{
+ tp->t_timeout = -1;
+ SLIST_INSERT_HEAD(&tty_list, tp, t_list);
+}
+
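+/*
+ * Export a snapshot of every registered tty as a struct xtty record
+ * (the kern.ttys sysctl).
+ */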
+static int
+sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
+{
+ struct tty *tp;
+ struct xtty xt;
+ int error;
+
+ SLIST_FOREACH(tp, &tty_list, t_list) {
+ bzero(&xt, sizeof xt);
+ xt.xt_size = sizeof xt;
+#define XT_COPY(field) xt.xt_##field = tp->t_##field
+ xt.xt_rawcc = tp->t_rawq.c_cc;
+ xt.xt_cancc = tp->t_canq.c_cc;
+ xt.xt_outcc = tp->t_outq.c_cc;
+ XT_COPY(line);
+ if (tp->t_dev)
+ xt.xt_dev = dev2udev(tp->t_dev);
+ XT_COPY(state);
+ XT_COPY(flags);
+ XT_COPY(timeout);
+ if (tp->t_pgrp)
+ xt.xt_pgid = tp->t_pgrp->pg_id;
+ if (tp->t_session)
+ xt.xt_sid = tp->t_session->s_sid;
+ XT_COPY(termios);
+ XT_COPY(winsize);
+ XT_COPY(column);
+ XT_COPY(rocount);
+ XT_COPY(rocol);
+ XT_COPY(ififosize);
+ XT_COPY(ihiwat);
+ XT_COPY(ilowat);
+ XT_COPY(ispeedwat);
+ XT_COPY(ohiwat);
+ XT_COPY(olowat);
+ XT_COPY(ospeedwat);
+#undef XT_COPY
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_ttys, "S,xtty", "All ttys");
+SYSCTL_LONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD,
+ &tk_nin, 0, "Total TTY in characters");
+SYSCTL_LONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
+ &tk_nout, 0, "Total TTY out characters");
+
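+/*
+ * Stub t_stop routine for drivers that have no output to stop.
+ */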
+void
+nottystop(struct tty *tp, int rw)
+{
+
+ return;
+}
+
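+/*
+ * Generic read entry point: hand the request to the line discipline
+ * attached to the tty behind this device.
+ */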
+int
+ttyread(dev_t dev, struct uio *uio, int flag)
+{
+ struct tty *tp;
+
+ tp = dev->si_tty;
+ if (tp == NULL)
+ return (ENODEV);
+ return ((*linesw[tp->t_line].l_read)(tp, uio, flag));
+}
+
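+/*
+ * Generic write entry point: hand the request to the line discipline
+ * attached to the tty behind this device.
+ */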
+int
+ttywrite(dev_t dev, struct uio *uio, int flag)
+{
+ struct tty *tp;
+
+ tp = dev->si_tty;
+ if (tp == NULL)
+ return (ENODEV);
+ return ((*linesw[tp->t_line].l_write)(tp, uio, flag));
+}
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
new file mode 100644
index 0000000..01628ff
--- /dev/null
+++ b/sys/kern/tty_compat.c
@@ -0,0 +1,490 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+/*
+ * mapping routines for old line discipline (yuck)
+ */
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl_compat.h>
+#include <sys/tty.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+static int ttcompatgetflags(struct tty *tp);
+static void ttcompatsetflags(struct tty *tp, struct termios *t);
+static void ttcompatsetlflags(struct tty *tp, struct termios *t);
+static int ttcompatspeedtab(int speed, struct speedtab *table);
+
+static int ttydebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, "");
+
+static struct speedtab compatspeeds[] = {
+#define MAX_SPEED 17
+ { 115200, 17 },
+ { 57600, 16 },
+ { 38400, 15 },
+ { 19200, 14 },
+ { 9600, 13 },
+ { 4800, 12 },
+ { 2400, 11 },
+ { 1800, 10 },
+ { 1200, 9 },
+ { 600, 8 },
+ { 300, 7 },
+ { 200, 6 },
+ { 150, 5 },
+ { 134, 4 },
+ { 110, 3 },
+ { 75, 2 },
+ { 50, 1 },
+ { 0, 0 },
+ { -1, -1 },
+};
+static int compatspcodes[] = {
+ 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
+ 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200,
+};
+
+static int
+ttcompatspeedtab(speed, table)
+ int speed;
+ register struct speedtab *table;
+{
+ if (speed == 0)
+ return (0); /* hangup */
+ for ( ; table->sp_speed > 0; table++)
+ if (table->sp_speed <= speed) /* nearest one, rounded down */
+ return (table->sp_code);
+ return (1); /* 50, min and not hangup */
+}
+
+int
+ttsetcompat(tp, com, data, term)
+ register struct tty *tp;
+ u_long *com;
+ caddr_t data;
+ struct termios *term;
+{
+ switch (*com) {
+ case TIOCSETP:
+ case TIOCSETN: {
+ register struct sgttyb *sg = (struct sgttyb *)data;
+ int speed;
+
+ if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds))
+ term->c_ispeed = compatspcodes[speed];
+ else
+ term->c_ispeed = tp->t_ispeed;
+ if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds))
+ term->c_ospeed = compatspcodes[speed];
+ else
+ term->c_ospeed = tp->t_ospeed;
+ term->c_cc[VERASE] = sg->sg_erase;
+ term->c_cc[VKILL] = sg->sg_kill;
+ tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff);
+ ttcompatsetflags(tp, term);
+ *com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
+ break;
+ }
+ case TIOCSETC: {
+ struct tchars *tc = (struct tchars *)data;
+ register cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VINTR] = tc->t_intrc;
+ cc[VQUIT] = tc->t_quitc;
+ cc[VSTART] = tc->t_startc;
+ cc[VSTOP] = tc->t_stopc;
+ cc[VEOF] = tc->t_eofc;
+ cc[VEOL] = tc->t_brkc;
+ if (tc->t_brkc == -1)
+ cc[VEOL2] = _POSIX_VDISABLE;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCSLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ register cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VSUSP] = ltc->t_suspc;
+ cc[VDSUSP] = ltc->t_dsuspc;
+ cc[VREPRINT] = ltc->t_rprntc;
+ cc[VDISCARD] = ltc->t_flushc;
+ cc[VWERASE] = ltc->t_werasc;
+ cc[VLNEXT] = ltc->t_lnextc;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+ if (*com == TIOCLSET)
+ tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
+ else {
+ tp->t_flags =
+ (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff);
+ if (*com == TIOCLBIS)
+ tp->t_flags |= *(int *)data<<16;
+ else
+ tp->t_flags &= ~(*(int *)data<<16);
+ }
+ ttcompatsetlflags(tp, term);
+ *com = TIOCSETA;
+ break;
+ }
+ return 0;
+}
+
+/*ARGSUSED*/
+int
+ttcompat(tp, com, data, flag)
+ register struct tty *tp;
+ u_long com;
+ caddr_t data;
+ int flag;
+{
+ switch (com) {
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET: {
+ struct termios term;
+ int error;
+
+ term = tp->t_termios;
+ if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
+ return error;
+ return ttioctl(tp, com, &term, flag);
+ }
+ case TIOCGETP: {
+ register struct sgttyb *sg = (struct sgttyb *)data;
+ register cc_t *cc = tp->t_cc;
+
+ sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds);
+ if (tp->t_ispeed == 0)
+ sg->sg_ispeed = sg->sg_ospeed;
+ else
+ sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds);
+ sg->sg_erase = cc[VERASE];
+ sg->sg_kill = cc[VKILL];
+ sg->sg_flags = tp->t_flags = ttcompatgetflags(tp);
+ break;
+ }
+ case TIOCGETC: {
+ struct tchars *tc = (struct tchars *)data;
+ register cc_t *cc = tp->t_cc;
+
+ tc->t_intrc = cc[VINTR];
+ tc->t_quitc = cc[VQUIT];
+ tc->t_startc = cc[VSTART];
+ tc->t_stopc = cc[VSTOP];
+ tc->t_eofc = cc[VEOF];
+ tc->t_brkc = cc[VEOL];
+ break;
+ }
+ case TIOCGLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ register cc_t *cc = tp->t_cc;
+
+ ltc->t_suspc = cc[VSUSP];
+ ltc->t_dsuspc = cc[VDSUSP];
+ ltc->t_rprntc = cc[VREPRINT];
+ ltc->t_flushc = cc[VDISCARD];
+ ltc->t_werasc = cc[VWERASE];
+ ltc->t_lnextc = cc[VLNEXT];
+ break;
+ }
+ case TIOCLGET:
+ tp->t_flags =
+ (ttcompatgetflags(tp) & 0xffff0000UL)
+ | (tp->t_flags & 0xffff);
+ *(int *)data = tp->t_flags>>16;
+ if (ttydebug)
+ printf("CLGET: returning %x\n", *(int *)data);
+ break;
+
+ case OTIOCGETD:
+ *(int *)data = tp->t_line ? tp->t_line : 2;
+ break;
+
+ case OTIOCSETD: {
+ int ldisczero = 0;
+
+ return (ttioctl(tp, TIOCSETD,
+ *(int *)data == 2 ? (caddr_t)&ldisczero : data, flag));
+ }
+
+ case OTIOCCONS:
+ *(int *)data = 1;
+ return (ttioctl(tp, TIOCCONS, data, flag));
+
+ default:
+ return (ENOIOCTL);
+ }
+ return (0);
+}
+
+static int
+ttcompatgetflags(tp)
+ register struct tty *tp;
+{
+ register tcflag_t iflag = tp->t_iflag;
+ register tcflag_t lflag = tp->t_lflag;
+ register tcflag_t oflag = tp->t_oflag;
+ register tcflag_t cflag = tp->t_cflag;
+ register int flags = 0;
+
+ if (iflag&IXOFF)
+ flags |= TANDEM;
+ if (iflag&ICRNL || oflag&ONLCR)
+ flags |= CRMOD;
+ if ((cflag&CSIZE) == CS8) {
+ flags |= PASS8;
+ if (iflag&ISTRIP)
+ flags |= ANYP;
+ }
+ else if (cflag&PARENB) {
+ if (iflag&INPCK) {
+ if (cflag&PARODD)
+ flags |= ODDP;
+ else
+ flags |= EVENP;
+ } else
+ flags |= EVENP | ODDP;
+ }
+
+ if ((lflag&ICANON) == 0) {
+ /* fudge */
+ if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
+ || (cflag&(CSIZE|PARENB)) != CS8)
+ flags |= CBREAK;
+ else
+ flags |= RAW;
+ }
+ if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8)
+ flags |= LITOUT;
+ if (cflag&MDMBUF)
+ flags |= MDMBUF;
+ if ((cflag&HUPCL) == 0)
+ flags |= NOHANG;
+ if (oflag&OXTABS)
+ flags |= XTABS;
+ if (lflag&ECHOE)
+ flags |= CRTERA|CRTBS;
+ if (lflag&ECHOKE)
+ flags |= CRTKIL|CRTBS;
+ if (lflag&ECHOPRT)
+ flags |= PRTERA;
+ if (lflag&ECHOCTL)
+ flags |= CTLECH;
+ if ((iflag&IXANY) == 0)
+ flags |= DECCTQ;
+ flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ if (ttydebug)
+ printf("getflags: %x\n", flags);
+ return (flags);
+}
+
+static void
+ttcompatsetflags(tp, t)
+ register struct tty *tp;
+ register struct termios *t;
+{
+ register int flags = tp->t_flags;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
+
+ if (flags & RAW) {
+ iflag = IGNBRK;
+ lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
+ } else {
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ iflag |= BRKINT|IXON|IMAXBEL;
+ lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */
+ if (flags & XTABS)
+ oflag |= OXTABS;
+ else
+ oflag &= ~OXTABS;
+ if (flags & CBREAK)
+ lflag &= ~ICANON;
+ else
+ lflag |= ICANON;
+ if (flags&CRMOD) {
+ iflag |= ICRNL;
+ oflag |= ONLCR;
+ } else {
+ iflag &= ~ICRNL;
+ oflag &= ~ONLCR;
+ }
+ }
+ if (flags&ECHO)
+ lflag |= ECHO;
+ else
+ lflag &= ~ECHO;
+
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ /* XXX don't set INPCK if RAW or PASS8? */
+ if ((flags&(EVENP|ODDP)) == EVENP) {
+ iflag |= INPCK;
+ cflag &= ~PARODD;
+ } else if ((flags&(EVENP|ODDP)) == ODDP) {
+ iflag |= INPCK;
+ cflag |= PARODD;
+ } else
+ iflag &= ~INPCK;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+
+static void
+ttcompatsetlflags(tp, t)
+ register struct tty *tp;
+ register struct termios *t;
+{
+ register int flags = tp->t_flags;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
+
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ if (flags&CRTERA)
+ lflag |= ECHOE;
+ else
+ lflag &= ~ECHOE;
+ if (flags&CRTKIL)
+ lflag |= ECHOKE;
+ else
+ lflag &= ~ECHOKE;
+ if (flags&PRTERA)
+ lflag |= ECHOPRT;
+ else
+ lflag &= ~ECHOPRT;
+ if (flags&CTLECH)
+ lflag |= ECHOCTL;
+ else
+ lflag &= ~ECHOCTL;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ if (flags & MDMBUF)
+ cflag |= MDMBUF;
+ else
+ cflag &= ~MDMBUF;
+ if (flags&NOHANG)
+ cflag &= ~HUPCL;
+ else
+ cflag |= HUPCL;
+ lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+
+ /*
+	 * The next if-else statement is copied from above, so don't bother
+	 * checking it separately.  We could avoid fiddling with the
+	 * character size if the mode is already RAW or if neither the
+	 * LITOUT bit nor the PASS8 bit is being changed, but the delta of
+ * the change is not available here and skipping the RAW case would
+ * make the code different from above.
+ */
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c
new file mode 100644
index 0000000..0609dc9
--- /dev/null
+++ b/sys/kern/tty_conf.c
@@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+
+#ifndef MAXLDISC
+#define MAXLDISC 9
+#endif
+
+static l_open_t l_noopen;
+static l_close_t l_noclose;
+static l_rint_t l_norint;
+static l_start_t l_nostart;
+
+/*
+ * XXX it probably doesn't matter what the entries other than the l_open
+ * entry are here. The l_nullioctl and ttymodem entries still look fishy.
+ * Reconsider the removal of nullmodem anyway. It was too much like
+ * ttymodem, but a completely null version might be useful.
+ */
+#define NODISC(n) \
+ { l_noopen, l_noclose, l_noread, l_nowrite, \
+ l_nullioctl, l_norint, l_nostart, ttymodem }
+
+struct linesw linesw[MAXLDISC] =
+{
+ /* 0- termios */
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+ NODISC(1), /* 1- defunct */
+ /* 2- NTTYDISC */
+#ifdef COMPAT_43
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+#else
+ NODISC(2),
+#endif
+ NODISC(3), /* loadable */
+ NODISC(4), /* SLIPDISC */
+ NODISC(5), /* PPPDISC */
+ NODISC(6), /* NETGRAPHDISC */
+ NODISC(7), /* loadable */
+ NODISC(8), /* loadable */
+};
+
+int nlinesw = sizeof (linesw) / sizeof (linesw[0]);
+
+static struct linesw nodisc = NODISC(0);
+
+#define LOADABLE_LDISC 7
+/*
+ * ldisc_register: Register a line discipline.
+ *
+ * discipline: Index for discipline to load, or LDISC_LOAD for us to choose.
+ * linesw_p: Pointer to the linesw entry to install for that discipline.
+ *
+ * Returns: Index used or -1 on failure.
+ */
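+/*
+ * A loadable discipline typically registers itself as in this sketch
+ * ("foodisc" and its handlers are hypothetical; the member order
+ * matches the linesw initializer above):
+ *
+ *	static struct linesw foodisc = { fooopen, fooclose, fooread,
+ *		foowrite, l_nullioctl, foorint, foostart, ttymodem };
+ *
+ *	slot = ldisc_register(LDISC_LOAD, &foodisc);
+ */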
+int
+ldisc_register(discipline, linesw_p)
+ int discipline;
+ struct linesw *linesw_p;
+{
+ int slot = -1;
+
+ if (discipline == LDISC_LOAD) {
+ int i;
+ for (i = LOADABLE_LDISC; i < MAXLDISC; i++)
+ if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) {
+ slot = i;
+ }
+ }
+ else if (discipline >= 0 && discipline < MAXLDISC) {
+ slot = discipline;
+ }
+
+ if (slot != -1 && linesw_p)
+ linesw[slot] = *linesw_p;
+
+ return slot;
+}
+
+/*
+ * ldisc_deregister: Deregister a line discipline obtained with
+ * ldisc_register.
+ *
+ * discipline: Index for discipline to unload.
+ */
+void
+ldisc_deregister(discipline)
+ int discipline;
+{
+ if (discipline < MAXLDISC) {
+ linesw[discipline] = nodisc;
+ }
+}
+
+static int
+l_noopen(dev, tp)
+ dev_t dev;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_noclose(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_noread(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_nowrite(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_norint(c, tp)
+ int c;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_nostart(tp)
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+/*
+ * Do-nothing version of a line-discipline-specific ioctl command.
+ */
+int
+l_nullioctl(tp, cmd, data, flags, td)
+ struct tty *tp;
+ u_long cmd;
+ char *data;
+ int flags;
+ struct thread *td;
+{
+
+ return (ENOIOCTL);
+}
diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c
new file mode 100644
index 0000000..91713c1
--- /dev/null
+++ b/sys/kern/tty_cons.c
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)cons.c 7.2 (Berkeley) 5/9/91
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/reboot.h>
+#include <sys/sysctl.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#include <ddb/ddb.h>
+
+#include <machine/cpu.h>
+
+static d_open_t cnopen;
+static d_close_t cnclose;
+static d_read_t cnread;
+static d_write_t cnwrite;
+static d_ioctl_t cnioctl;
+static d_poll_t cnpoll;
+static d_kqfilter_t cnkqfilter;
+
+#define CDEV_MAJOR 0
+static struct cdevsw cn_cdevsw = {
+ /* open */ cnopen,
+ /* close */ cnclose,
+ /* read */ cnread,
+ /* write */ cnwrite,
+ /* ioctl */ cnioctl,
+ /* poll */ cnpoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "console",
+ /* maj */ CDEV_MAJOR,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ D_TTY | D_KQFILTER,
+ /* kqfilter */ cnkqfilter,
+};
+
+struct cn_device {
+ STAILQ_ENTRY(cn_device) cnd_next;
+ char cnd_name[16];
+ struct vnode *cnd_vp;
+ struct consdev *cnd_cn;
+};
+
+#define CNDEVPATHMAX 32
+#define CNDEVTAB_SIZE 4
+static struct cn_device cn_devtab[CNDEVTAB_SIZE];
+static STAILQ_HEAD(, cn_device) cn_devlist =
+ STAILQ_HEAD_INITIALIZER(cn_devlist);
+
+#define CND_INVALID(cnd, td) \
+ (cnd == NULL || cnd->cnd_vp == NULL || \
+ (cnd->cnd_vp->v_type == VBAD && !cn_devopen(cnd, td, 1)))
+
+static udev_t cn_udev_t;
+SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD,
+ &cn_udev_t, sizeof cn_udev_t, "T,dev_t", "");
+
+int cons_unavail = 0; /* XXX:
+ * physical console not available for
+ * input (i.e., it is in graphics mode)
+ */
+static int cn_mute;
+static int openflag; /* how /dev/console was opened */
+static int cn_is_open;
+static dev_t cn_devfsdev; /* represents the device private info */
+static u_char console_pausing; /* pause after each line during probe */
+static char *console_pausestr=
+"<pause; press any key to proceed to next line or '.' to end pause mode>";
+
+void cndebug(char *);
+
+CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+SET_DECLARE(cons_set, struct consdev);
+
+void
+cninit(void)
+{
+ struct consdev *best_cn, *cn, **list;
+
+ /*
+	 * Check if we should mute the console (for security reasons,
+	 * perhaps).  It can be changed dynamically using the sysctl
+	 * kern.consmute once we are up and running.
+ */
+ cn_mute = ((boothowto & (RB_MUTE
+ |RB_SINGLE
+ |RB_VERBOSE
+ |RB_ASKNAME
+ |RB_CONFIG)) == RB_MUTE);
+
+ /*
+ * Find the first console with the highest priority.
+ */
+ best_cn = NULL;
+ SET_FOREACH(list, cons_set) {
+ cn = *list;
+ if (cn->cn_probe == NULL)
+ continue;
+ cn->cn_probe(cn);
+ if (cn->cn_pri == CN_DEAD)
+ continue;
+ if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri)
+ best_cn = cn;
+ if (boothowto & RB_MULTIPLE) {
+ /*
+ * Initialize console, and attach to it.
+ */
+ cnadd(cn);
+ cn->cn_init(cn);
+ }
+ }
+ if (best_cn == NULL)
+ return;
+ if ((boothowto & RB_MULTIPLE) == 0) {
+ cnadd(best_cn);
+ best_cn->cn_init(best_cn);
+ }
+ if (boothowto & RB_PAUSE)
+ console_pausing = 1;
+ /*
+ * Make the best console the preferred console.
+ */
+ cnselect(best_cn);
+}
+
+void
+cninit_finish()
+{
+ console_pausing = 0;
+}
+
+/* add a new physical console to back the virtual console */
+int
+cnadd(struct consdev *cn)
+{
+ struct cn_device *cnd;
+ int i;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ if (cnd->cnd_cn == cn)
+ return (0);
+ for (i = 0; i < CNDEVTAB_SIZE; i++) {
+ cnd = &cn_devtab[i];
+ if (cnd->cnd_cn == NULL)
+ break;
+ }
+ if (cnd->cnd_cn != NULL)
+ return (ENOMEM);
+ cnd->cnd_cn = cn;
+ STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next);
+ return (0);
+}
+
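+/*
+ * Detach a physical console from the virtual console, closing any
+ * vnode we had open on it.
+ */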
+void
+cnremove(struct consdev *cn)
+{
+ struct cn_device *cnd;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ if (cnd->cnd_vp != NULL)
+ vn_close(cnd->cnd_vp, openflag, NOCRED, NULL);
+ cnd->cnd_vp = NULL;
+ cnd->cnd_cn = NULL;
+ cnd->cnd_name[0] = '\0';
+#if 0
+ /*
+ * XXX
+ * syscons gets really confused if console resources are
+ * freed after the system has initialized.
+ */
+ if (cn->cn_term != NULL)
+ cn->cn_term(cn);
+#endif
+ return;
+ }
+}
+
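+/*
+ * Make the given console the preferred one by moving it to the head
+ * of the console list.
+ */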
+void
+cnselect(struct consdev *cn)
+{
+ struct cn_device *cnd;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ if (cnd == STAILQ_FIRST(&cn_devlist))
+ return;
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next);
+ return;
+ }
+}
+
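+/*
+ * Write a debugging message directly to the console, prefixed with ">>> ".
+ */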
+void
+cndebug(char *str)
+{
+ int i, len;
+
+ len = strlen(str);
+ cnputc('>'); cnputc('>'); cnputc('>'); cnputc(' ');
+ for (i = 0; i < len; i++)
+ cnputc(str[i]);
+ cnputc('\n');
+}
+
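+/*
+ * The kern.console sysctl: report the active and available consoles as
+ * "active,.../available,..." and accept a console name to add (and
+ * select) or, with a leading '-', to remove.
+ */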
+static int
+sysctl_kern_console(SYSCTL_HANDLER_ARGS)
+{
+ struct cn_device *cnd;
+ struct consdev *cp, **list;
+ char *name, *p;
+ int delete, len, error;
+
+ len = 2;
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (cp->cn_dev != NULL)
+ len += strlen(devtoname(cp->cn_dev)) + 1;
+ }
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ len += strlen(devtoname(cnd->cnd_cn->cn_dev)) + 1;
+ len = len > CNDEVPATHMAX ? len : CNDEVPATHMAX;
+ MALLOC(name, char *, len, M_TEMP, M_WAITOK | M_ZERO);
+ p = name;
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ p += sprintf(p, "%s,", devtoname(cnd->cnd_cn->cn_dev));
+ *p++ = '/';
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (cp->cn_dev != NULL)
+ p += sprintf(p, "%s,", devtoname(cp->cn_dev));
+ }
+ error = sysctl_handle_string(oidp, name, len, req);
+ if (error == 0 && req->newptr != NULL) {
+ p = name;
+ error = ENXIO;
+ delete = 0;
+ if (*p == '-') {
+ delete = 1;
+ p++;
+ }
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (cp->cn_dev == NULL ||
+ strcmp(p, devtoname(cp->cn_dev)) != 0)
+ continue;
+ if (delete) {
+ cnremove(cp);
+ error = 0;
+ } else {
+ error = cnadd(cp);
+ if (error == 0)
+ cnselect(cp);
+ }
+ break;
+ }
+ }
+ FREE(name, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING|CTLFLAG_RW,
+ 0, 0, sysctl_kern_console, "A", "Console device control");
+
+/*
+ * User has changed the state of the console muting.
+ * This may require us to open or close the device in question.
+ */
+static int
+sysctl_kern_consmute(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int ocn_mute;
+
+ ocn_mute = cn_mute;
+ error = sysctl_handle_int(oidp, &cn_mute, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (ocn_mute && !cn_mute && cn_is_open)
+ error = cnopen(NODEV, openflag, 0, curthread);
+ else if (!ocn_mute && cn_mute && cn_is_open) {
+ error = cnclose(NODEV, openflag, 0, curthread);
+ cn_is_open = 1; /* XXX hack */
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof(cn_mute), sysctl_kern_consmute, "I", "");
+
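+/*
+ * (Re)open the vnode backing a console device, looking it up by name
+ * under /dev if we do not already have a usable one.
+ */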
+static int
+cn_devopen(struct cn_device *cnd, struct thread *td, int forceopen)
+{
+ char path[CNDEVPATHMAX];
+ struct nameidata nd;
+ struct vnode *vp;
+ dev_t dev;
+ int error;
+
+ if ((vp = cnd->cnd_vp) != NULL) {
+ if (!forceopen && vp->v_type != VBAD) {
+ dev = vp->v_rdev;
+ return ((*devsw(dev)->d_open)(dev, openflag, 0, td));
+ }
+ cnd->cnd_vp = NULL;
+ vn_close(vp, openflag, td->td_ucred, td);
+ }
+ if (cnd->cnd_name[0] == '\0')
+ strncpy(cnd->cnd_name, devtoname(cnd->cnd_cn->cn_dev),
+ sizeof(cnd->cnd_name));
+ snprintf(path, sizeof(path), "/dev/%s", cnd->cnd_name);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
+ error = vn_open(&nd, &openflag, 0);
+ if (error == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ if (nd.ni_vp->v_type == VCHR)
+ cnd->cnd_vp = nd.ni_vp;
+ else
+ vn_close(nd.ni_vp, openflag, td->td_ucred, td);
+ }
+ return (cnd->cnd_vp != NULL);
+}
+
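+/*
+ * Open of the virtual console: record the open flags and open each
+ * attached physical console unless the console is muted.
+ */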
+static int
+cnopen(dev_t dev, int flag, int mode, struct thread *td)
+{
+ struct cn_device *cnd;
+
+ openflag = flag | FWRITE; /* XXX */
+ cn_is_open = 1; /* console is logically open */
+ if (cn_mute)
+ return (0);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ cn_devopen(cnd, td, 0);
+ return (0);
+}
+
+static int
+cnclose(dev_t dev, int flag, int mode, struct thread *td)
+{
+ struct cn_device *cnd;
+ struct vnode *vp;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if ((vp = cnd->cnd_vp) == NULL)
+ continue;
+ cnd->cnd_vp = NULL;
+ vn_close(vp, openflag, td->td_ucred, td);
+ }
+ cn_is_open = 0;
+ return (0);
+}
+
+static int
+cnread(dev_t dev, struct uio *uio, int flag)
+{
+ struct cn_device *cnd;
+
+ cnd = STAILQ_FIRST(&cn_devlist);
+ if (cn_mute || CND_INVALID(cnd, curthread))
+ return (0);
+ dev = cnd->cnd_vp->v_rdev;
+ return ((*devsw(dev)->d_read)(dev, uio, flag));
+}
+
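+/*
+ * Write to the virtual console: log the output and send it to constty
+ * if console output has been redirected with TIOCCONS, otherwise to
+ * the preferred physical console.
+ */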
+static int
+cnwrite(dev_t dev, struct uio *uio, int flag)
+{
+ struct cn_device *cnd;
+
+ cnd = STAILQ_FIRST(&cn_devlist);
+ if (cn_mute || CND_INVALID(cnd, curthread))
+ goto done;
+ if (constty)
+ dev = constty->t_dev;
+ else
+ dev = cnd->cnd_vp->v_rdev;
+ if (dev != NULL) {
+ log_console(uio);
+ return ((*devsw(dev)->d_write)(dev, uio, flag));
+ }
+done:
+ uio->uio_resid = 0; /* dump the data */
+ return (0);
+}
+
+static int
+cnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+ struct cn_device *cnd;
+ int error;
+
+ cnd = STAILQ_FIRST(&cn_devlist);
+ if (cn_mute || CND_INVALID(cnd, td))
+ return (0);
+ /*
+ * Superuser can always use this to wrest control of console
+ * output from the "virtual" console.
+ */
+ if (cmd == TIOCCONS && constty) {
+ error = suser(td);
+ if (error)
+ return (error);
+ constty = NULL;
+ return (0);
+ }
+ dev = cnd->cnd_vp->v_rdev;
+ if (dev != NULL)
+ return ((*devsw(dev)->d_ioctl)(dev, cmd, data, flag, td));
+ return (0);
+}
+
+/*
+ * XXX
+ * poll/kqfilter do not appear to be correct
+ */
+static int
+cnpoll(dev_t dev, int events, struct thread *td)
+{
+ struct cn_device *cnd;
+
+ cnd = STAILQ_FIRST(&cn_devlist);
+ if (cn_mute || CND_INVALID(cnd, td))
+ return (0);
+ dev = cnd->cnd_vp->v_rdev;
+ if (dev != NULL)
+ return ((*devsw(dev)->d_poll)(dev, events, td));
+ return (0);
+}
+
+static int
+cnkqfilter(dev_t dev, struct knote *kn)
+{
+ struct cn_device *cnd;
+
+ cnd = STAILQ_FIRST(&cn_devlist);
+ if (cn_mute || CND_INVALID(cnd, curthread))
+ return (1);
+ dev = cnd->cnd_vp->v_rdev;
+ if (dev != NULL)
+ return ((*devsw(dev)->d_kqfilter)(dev, kn));
+ return (1);
+}
+
+/*
+ * Low level console routines.
+ */
+int
+cngetc(void)
+{
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ while ((c = cncheckc()) == -1)
+ ;
+ if (c == '\r')
+ c = '\n'; /* console input is always ICRNL */
+ return (c);
+}
+
+int
+cncheckc(void)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ c = cn->cn_checkc(cn->cn_dev);
+ if (c != -1) {
+ return (c);
+ }
+ }
+ return (-1);
+}
+
+void
+cnputc(int c)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ char *cp;
+
+ if (cn_mute || c == '\0')
+ return;
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (c == '\n')
+ cn->cn_putc(cn->cn_dev, '\r');
+ cn->cn_putc(cn->cn_dev, c);
+ }
+#ifdef DDB
+ if (console_pausing && !db_active && (c == '\n')) {
+#else
+ if (console_pausing && (c == '\n')) {
+#endif
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(*cp);
+ if (cngetc() == '.')
+ console_pausing = 0;
+ cnputc('\r');
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(' ');
+ cnputc('\r');
+ }
+}
+
+void
+cndbctl(int on)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ static int refcount;
+
+ if (!on)
+ refcount--;
+ if (refcount == 0)
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (cn->cn_dbctl != NULL)
+ cn->cn_dbctl(cn->cn_dev, on);
+ }
+ if (on)
+ refcount++;
+}
+
+static void
+cn_drvinit(void *unused)
+{
+
+ cn_devfsdev = make_dev(&cn_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "console");
+}
+
+SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL)
diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c
new file mode 100644
index 0000000..7d6e736
--- /dev/null
+++ b/sys/kern/tty_pty.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95
+ * $FreeBSD$
+ */
+
+/*
+ * Pseudo-teletype Driver
+ * (Actually two drivers, requiring two entries in 'cdevsw')
+ */
+#include "opt_compat.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+
+static MALLOC_DEFINE(M_PTY, "ptys", "pty data structures");
+
+static void ptsstart(struct tty *tp);
+static void ptsstop(struct tty *tp, int rw);
+static void ptcwakeup(struct tty *tp, int flag);
+static dev_t ptyinit(dev_t cdev);
+
+static d_open_t ptsopen;
+static d_close_t ptsclose;
+static d_read_t ptsread;
+static d_write_t ptswrite;
+static d_ioctl_t ptyioctl;
+static d_open_t ptcopen;
+static d_close_t ptcclose;
+static d_read_t ptcread;
+static d_write_t ptcwrite;
+static d_poll_t ptcpoll;
+
+#define CDEV_MAJOR_S 5
+static struct cdevsw pts_cdevsw = {
+ /* open */ ptsopen,
+ /* close */ ptsclose,
+ /* read */ ptsread,
+ /* write */ ptswrite,
+ /* ioctl */ ptyioctl,
+ /* poll */ ttypoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "pts",
+ /* maj */ CDEV_MAJOR_S,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ D_TTY | D_KQFILTER,
+ /* kqfilter */ ttykqfilter,
+};
+
+#define CDEV_MAJOR_C 6
+static struct cdevsw ptc_cdevsw = {
+ /* open */ ptcopen,
+ /* close */ ptcclose,
+ /* read */ ptcread,
+ /* write */ ptcwrite,
+ /* ioctl */ ptyioctl,
+ /* poll */ ptcpoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "ptc",
+ /* maj */ CDEV_MAJOR_C,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ D_TTY | D_KQFILTER,
+ /* kqfilter */ ttykqfilter,
+};
+
+#define BUFSIZ 100 /* Chunk size iomoved to/from user */
+
+struct pt_ioctl {
+ int pt_flags;
+ struct selinfo pt_selr, pt_selw;
+ u_char pt_send;
+ u_char pt_ucntl;
+ struct tty pt_tty;
+ dev_t devs, devc;
+ struct prison *pt_prison;
+};
+
+#define PF_PKT 0x08 /* packet mode */
+#define PF_STOPPED 0x10 /* user told stopped */
+#define PF_REMOTE 0x20 /* remote and flow controlled input */
+#define PF_NOSTOP 0x40
+#define PF_UCNTL 0x80 /* user control mode */
+
+static char *names = "pqrsPQRS";
+/*
+ * This function creates and initializes a pts/ptc pair
+ *
+ * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
+ * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
+ *
+ * XXX: define and add mapping of upper minor bits to allow more
+ * than 256 ptys.
+ */
+static dev_t
+ptyinit(dev_t devc)
+{
+ dev_t devs;
+ struct pt_ioctl *pt;
+ int n;
+
+ n = minor(devc);
+ /* For now we only map the lower 8 bits of the minor */
+ if (n & ~0xff)
+ return (NODEV);
+
+ devc->si_flags &= ~SI_CHEAPCLONE;
+
+ pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO);
+ pt->devs = devs = make_dev(&pts_cdevsw, n,
+ UID_ROOT, GID_WHEEL, 0666, "tty%c%r", names[n / 32], n % 32);
+ pt->devc = devc;
+
+ devs->si_drv1 = devc->si_drv1 = pt;
+ devs->si_tty = devc->si_tty = &pt->pt_tty;
+ pt->pt_tty.t_dev = devs;
+ ttyregister(&pt->pt_tty);
+ return (devc);
+}
+
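+/*
+ * Open the slave side of a pty: set up default termios state on first
+ * open and, unless non-blocking, wait for the controller to assert
+ * carrier before calling the line discipline open routine.
+ */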
+/*ARGSUSED*/
+static int
+ptsopen(dev, flag, devtype, td)
+ dev_t dev;
+ int flag, devtype;
+ struct thread *td;
+{
+ register struct tty *tp;
+ int error;
+ struct pt_ioctl *pti;
+
+ if (!dev->si_drv1)
+ return(ENXIO);
+ pti = dev->si_drv1;
+ tp = dev->si_tty;
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ ttychars(tp); /* Set up default chars */
+ tp->t_iflag = TTYDEF_IFLAG;
+ tp->t_oflag = TTYDEF_OFLAG;
+ tp->t_lflag = TTYDEF_LFLAG;
+ tp->t_cflag = TTYDEF_CFLAG;
+ tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED;
+ } else if (tp->t_state & TS_XCLUDE && suser(td)) {
+ return (EBUSY);
+ } else if (pti->pt_prison != td->td_ucred->cr_prison) {
+ return (EBUSY);
+ }
+ if (tp->t_oproc) /* Ctrlr still around. */
+ (void)(*linesw[tp->t_line].l_modem)(tp, 1);
+ while ((tp->t_state & TS_CARR_ON) == 0) {
+ if (flag&FNONBLOCK)
+ break;
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ptsopn", 0);
+ if (error)
+ return (error);
+ }
+ error = (*linesw[tp->t_line].l_open)(dev, tp);
+ if (error == 0)
+ ptcwakeup(tp, FREAD|FWRITE);
+ return (error);
+}
+
+static int
+ptsclose(dev, flag, mode, td)
+ dev_t dev;
+ int flag, mode;
+ struct thread *td;
+{
+ register struct tty *tp;
+ int err;
+
+ tp = dev->si_tty;
+ err = (*linesw[tp->t_line].l_close)(tp, flag);
+ ptsstop(tp, FREAD|FWRITE);
+ (void) ttyclose(tp);
+ return (err);
+}
+
+static int
+ptsread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ register struct tty *tp = dev->si_tty;
+ register struct pt_ioctl *pti = dev->si_drv1;
+ struct pgrp *pg;
+ int error = 0;
+
+again:
+ if (pti->pt_flags & PF_REMOTE) {
+ while (isbackground(p, tp)) {
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+ if (SIGISMEMBER(p->p_sigignore, SIGTTIN) ||
+ SIGISMEMBER(p->p_sigmask, SIGTTIN) ||
+ p->p_pgrp->pg_jobc == 0 || p->p_flag & P_PPWAIT) {
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ return (EIO);
+ }
+ pg = p->p_pgrp;
+ PROC_UNLOCK(p);
+ PGRP_LOCK(pg);
+ sx_sunlock(&proctree_lock);
+ pgsignal(pg, SIGTTIN, 1);
+ PGRP_UNLOCK(pg);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg",
+ 0);
+ if (error)
+ return (error);
+ }
+ if (tp->t_canq.c_cc == 0) {
+ if (flag & IO_NDELAY)
+ return (EWOULDBLOCK);
+ error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH,
+ "ptsin", 0);
+ if (error)
+ return (error);
+ goto again;
+ }
+ while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0)
+ if (ureadc(getc(&tp->t_canq), uio) < 0) {
+ error = EFAULT;
+ break;
+ }
+ if (tp->t_canq.c_cc == 1)
+ (void) getc(&tp->t_canq);
+ if (tp->t_canq.c_cc)
+ return (error);
+ } else
+ if (tp->t_oproc)
+ error = (*linesw[tp->t_line].l_read)(tp, uio, flag);
+ ptcwakeup(tp, FWRITE);
+ return (error);
+}
+
+/*
+ * Write to pseudo-tty.
+ * Wakeups of controlling tty will happen
+ * indirectly, when tty driver calls ptsstart.
+ */
+static int
+ptswrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ register struct tty *tp;
+
+ tp = dev->si_tty;
+ if (tp->t_oproc == 0)
+ return (EIO);
+ return ((*linesw[tp->t_line].l_write)(tp, uio, flag));
+}
+
+/*
+ * Start output on pseudo-tty.
+ * Wake up process selecting or sleeping for input from controlling tty.
+ */
+static void
+ptsstart(tp)
+ struct tty *tp;
+{
+ register struct pt_ioctl *pti = tp->t_dev->si_drv1;
+
+ if (tp->t_state & TS_TTSTOP)
+ return;
+ if (pti->pt_flags & PF_STOPPED) {
+ pti->pt_flags &= ~PF_STOPPED;
+ pti->pt_send = TIOCPKT_START;
+ }
+ ptcwakeup(tp, FREAD);
+}
+
+static void
+ptcwakeup(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+ struct pt_ioctl *pti = tp->t_dev->si_drv1;
+
+ if (flag & FREAD) {
+ selwakeup(&pti->pt_selr);
+ wakeup(TSA_PTC_READ(tp));
+ }
+ if (flag & FWRITE) {
+ selwakeup(&pti->pt_selw);
+ wakeup(TSA_PTC_WRITE(tp));
+ }
+}
+
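+/*
+ * Open the controller (master) side of a pty: create the pair on first
+ * use, fail if it is already open, and assert carrier on the slave.
+ */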
+static int
+ptcopen(dev, flag, devtype, td)
+ dev_t dev;
+ int flag, devtype;
+ struct thread *td;
+{
+ register struct tty *tp;
+ struct pt_ioctl *pti;
+
+ if (!dev->si_drv1)
+ ptyinit(dev);
+ if (!dev->si_drv1)
+ return(ENXIO);
+ tp = dev->si_tty;
+ if (tp->t_oproc)
+ return (EIO);
+ tp->t_timeout = -1;
+ tp->t_oproc = ptsstart;
+ tp->t_stop = ptsstop;
+ (void)(*linesw[tp->t_line].l_modem)(tp, 1);
+ tp->t_lflag &= ~EXTPROC;
+ pti = dev->si_drv1;
+ pti->pt_prison = td->td_ucred->cr_prison;
+ pti->pt_flags = 0;
+ pti->pt_send = 0;
+ pti->pt_ucntl = 0;
+ return (0);
+}
+
+static int
+ptcclose(dev, flags, fmt, td)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct thread *td;
+{
+ register struct tty *tp;
+
+ tp = dev->si_tty;
+ (void)(*linesw[tp->t_line].l_modem)(tp, 0);
+
+ /*
+ * XXX MDMBUF makes no sense for ptys but would inhibit the above
+ * l_modem(). CLOCAL makes sense but isn't supported. Special
+ * l_modem()s that ignore carrier drop make no sense for ptys but
+ * may be in use because other parts of the line discipline make
+ * sense for ptys. Recover by doing everything that a normal
+ * ttymodem() would have done except for sending a SIGHUP.
+ */
+ if (tp->t_state & TS_ISOPEN) {
+ tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED);
+ tp->t_state |= TS_ZOMBIE;
+ ttyflush(tp, FREAD | FWRITE);
+ }
+
+ tp->t_oproc = 0; /* mark closed */
+ return (0);
+}
+
+static int
+ptcread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ register struct tty *tp = dev->si_tty;
+ struct pt_ioctl *pti = dev->si_drv1;
+ char buf[BUFSIZ];
+ int error = 0, cc;
+
+ /*
+ * We want to block until the slave
+ * is open, and there's something to read;
+ * but if we lost the slave or we're NBIO,
+ * then return the appropriate error instead.
+ */
+ for (;;) {
+ if (tp->t_state&TS_ISOPEN) {
+ if (pti->pt_flags&PF_PKT && pti->pt_send) {
+ error = ureadc((int)pti->pt_send, uio);
+ if (error)
+ return (error);
+ if (pti->pt_send & TIOCPKT_IOCTL) {
+ cc = min(uio->uio_resid,
+ sizeof(tp->t_termios));
+ uiomove((caddr_t)&tp->t_termios, cc,
+ uio);
+ }
+ pti->pt_send = 0;
+ return (0);
+ }
+ if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) {
+ error = ureadc((int)pti->pt_ucntl, uio);
+ if (error)
+ return (error);
+ pti->pt_ucntl = 0;
+ return (0);
+ }
+ if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0)
+ break;
+ }
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (0); /* EOF */
+ if (flag & IO_NDELAY)
+ return (EWOULDBLOCK);
+ error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0);
+ if (error)
+ return (error);
+ }
+ if (pti->pt_flags & (PF_PKT|PF_UCNTL))
+ error = ureadc(0, uio);
+ while (uio->uio_resid > 0 && error == 0) {
+ cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ));
+ if (cc <= 0)
+ break;
+ error = uiomove(buf, cc, uio);
+ }
+ ttwwakeup(tp);
+ return (error);
+}
+
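+/*
+ * Stop handler for the slave side: translate the request into packet
+ * mode notifications (TIOCPKT_STOP or the flush bits) and wake up the
+ * controller with the read/write sense inverted.
+ */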
+static void
+ptsstop(tp, flush)
+ register struct tty *tp;
+ int flush;
+{
+ struct pt_ioctl *pti = tp->t_dev->si_drv1;
+ int flag;
+
+ /* note: FLUSHREAD and FLUSHWRITE already ok */
+ if (flush == 0) {
+ flush = TIOCPKT_STOP;
+ pti->pt_flags |= PF_STOPPED;
+ } else
+ pti->pt_flags &= ~PF_STOPPED;
+ pti->pt_send |= flush;
+ /* change of perspective */
+ flag = 0;
+ if (flush & FREAD)
+ flag |= FWRITE;
+ if (flush & FWRITE)
+ flag |= FREAD;
+ ptcwakeup(tp, flag);
+}
+
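+/*
+ * Poll on the controller side: readable when the slave has output or a
+ * pending packet/user-control byte, writable when there is room in the
+ * slave's input queues.
+ */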
+static int
+ptcpoll(dev, events, td)
+ dev_t dev;
+ int events;
+ struct thread *td;
+{
+ register struct tty *tp = dev->si_tty;
+ struct pt_ioctl *pti = dev->si_drv1;
+ int revents = 0;
+ int s;
+
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (seltrue(dev, events, td) | POLLHUP);
+
+ /*
+ * Need to block timeouts (ttrstart).
+ */
+ s = spltty();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if ((tp->t_state & TS_ISOPEN) &&
+ ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) ||
+ ((pti->pt_flags & PF_PKT) && pti->pt_send) ||
+ ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (tp->t_state & TS_ISOPEN &&
+ ((pti->pt_flags & PF_REMOTE) ?
+ (tp->t_canq.c_cc == 0) :
+ ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) ||
+ (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON)))))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & POLLHUP)
+ if ((tp->t_state & TS_CARR_ON) == 0)
+ revents |= POLLHUP;
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLRDNORM))
+ selrecord(td, &pti->pt_selr);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ selrecord(td, &pti->pt_selw);
+ }
+ splx(s);
+
+ return (revents);
+}
+
+static int
+ptcwrite(dev, uio, flag)
+ dev_t dev;
+ register struct uio *uio;
+ int flag;
+{
+ register struct tty *tp = dev->si_tty;
+ register u_char *cp = 0;
+ register int cc = 0;
+ u_char locbuf[BUFSIZ];
+ int cnt = 0;
+ struct pt_ioctl *pti = dev->si_drv1;
+ int error = 0;
+
+again:
+ if ((tp->t_state&TS_ISOPEN) == 0)
+ goto block;
+ if (pti->pt_flags & PF_REMOTE) {
+ if (tp->t_canq.c_cc)
+ goto block;
+ while ((uio->uio_resid > 0 || cc > 0) &&
+ tp->t_canq.c_cc < TTYHOG - 1) {
+ if (cc == 0) {
+ cc = min(uio->uio_resid, BUFSIZ);
+ cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc);
+ cp = locbuf;
+ error = uiomove((caddr_t)cp, cc, uio);
+ if (error)
+ return (error);
+ /* check again for safety */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust as usual */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ }
+ if (cc > 0) {
+ cc = b_to_q((char *)cp, cc, &tp->t_canq);
+ /*
+ * XXX we don't guarantee that the canq size
+ * is >= TTYHOG, so the above b_to_q() may
+ * leave some bytes uncopied. However, space
+ * is guaranteed for the null terminator if
+ * we don't fail here since (TTYHOG - 1) is
+ * not a multiple of CBSIZE.
+ */
+ if (cc > 0)
+ break;
+ }
+ }
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ (void) putc(0, &tp->t_canq);
+ ttwakeup(tp);
+ wakeup(TSA_PTS_READ(tp));
+ return (0);
+ }
+ while (uio->uio_resid > 0 || cc > 0) {
+ if (cc == 0) {
+ cc = min(uio->uio_resid, BUFSIZ);
+ cp = locbuf;
+ error = uiomove((caddr_t)cp, cc, uio);
+ if (error)
+ return (error);
+ /* check again for safety */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ }
+ while (cc > 0) {
+ if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 &&
+ (tp->t_canq.c_cc > 0 || !(tp->t_lflag&ICANON))) {
+ wakeup(TSA_HUP_OR_INPUT(tp));
+ goto block;
+ }
+ (*linesw[tp->t_line].l_rint)(*cp++, tp);
+ cnt++;
+ cc--;
+ }
+ cc = 0;
+ }
+ return (0);
+block:
+ /*
+ * Come here to wait for slave to open, for space
+ * in outq, or space in rawq, or an empty canq.
+ */
+ if ((tp->t_state & TS_CONNECTED) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ if (flag & IO_NDELAY) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ if (cnt == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+ error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0);
+ if (error) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (error);
+ }
+ goto again;
+}
+
+/*ARGSUSED*/
+static int
+ptyioctl(dev, cmd, data, flag, td)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flag;
+ struct thread *td;
+{
+ register struct tty *tp = dev->si_tty;
+ register struct pt_ioctl *pti = dev->si_drv1;
+ register u_char *cc = tp->t_cc;
+ int stop, error;
+
+ if (devsw(dev)->d_open == ptcopen) {
+ switch (cmd) {
+
+ case TIOCGPGRP:
+ /*
+ * We avoid calling ttioctl on the controller since,
+ * in that case, tp must be the controlling terminal.
+ */
+ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+ return (0);
+
+ case TIOCPKT:
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_UCNTL)
+ return (EINVAL);
+ pti->pt_flags |= PF_PKT;
+ } else
+ pti->pt_flags &= ~PF_PKT;
+ return (0);
+
+ case TIOCUCNTL:
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_PKT)
+ return (EINVAL);
+ pti->pt_flags |= PF_UCNTL;
+ } else
+ pti->pt_flags &= ~PF_UCNTL;
+ return (0);
+
+ case TIOCREMOTE:
+ if (*(int *)data)
+ pti->pt_flags |= PF_REMOTE;
+ else
+ pti->pt_flags &= ~PF_REMOTE;
+ ttyflush(tp, FREAD|FWRITE);
+ return (0);
+ }
+
+ /*
+ * The rest of the ioctls shouldn't be called until
+ * the slave is open.
+ */
+ if ((tp->t_state & TS_ISOPEN) == 0)
+ return (EAGAIN);
+
+ switch (cmd) {
+#ifdef COMPAT_43
+ case TIOCSETP:
+ case TIOCSETN:
+#endif
+ case TIOCSETD:
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+ /*
+ * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG.
+ * ttywflush(tp) will hang if there are characters in
+ * the outq.
+ */
+ ndflush(&tp->t_outq, tp->t_outq.c_cc);
+ break;
+
+ case TIOCSIG:
+ if (*(unsigned int *)data >= NSIG ||
+ *(unsigned int *)data == 0)
+ return(EINVAL);
+ if ((tp->t_lflag&NOFLSH) == 0)
+ ttyflush(tp, FREAD|FWRITE);
+ if (tp->t_pgrp != NULL) {
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, *(unsigned int *)data, 1);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+ if ((*(unsigned int *)data == SIGINFO) &&
+ ((tp->t_lflag&NOKERNINFO) == 0))
+ ttyinfo(tp);
+ return(0);
+ }
+ }
+ if (cmd == TIOCEXT) {
+ /*
+ * When the EXTPROC bit is being toggled, we need
+ * to send an TIOCPKT_IOCTL if the packet driver
+ * is turned on.
+ */
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_PKT) {
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag |= EXTPROC;
+ } else {
+ if ((tp->t_lflag & EXTPROC) &&
+ (pti->pt_flags & PF_PKT)) {
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag &= ~EXTPROC;
+ }
+ return(0);
+ }
+ error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, td);
+ if (error == ENOIOCTL)
+ error = ttioctl(tp, cmd, data, flag);
+ if (error == ENOIOCTL) {
+ if (pti->pt_flags & PF_UCNTL &&
+ (cmd & ~0xff) == UIOCCMD(0)) {
+ if (cmd & 0xff) {
+ pti->pt_ucntl = (u_char)cmd;
+ ptcwakeup(tp, FREAD);
+ }
+ return (0);
+ }
+ error = ENOTTY;
+ }
+ /*
+ * If external processing and packet mode, send an ioctl packet.
+ */
+ if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) {
+ switch(cmd) {
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+#ifdef COMPAT_43
+ case TIOCSETP:
+ case TIOCSETN:
+#endif
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+#endif
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ default:
+ break;
+ }
+ }
+ stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
+ && CCEQ(cc[VSTART], CTRL('q'));
+ if (pti->pt_flags & PF_NOSTOP) {
+ if (stop) {
+ pti->pt_send &= ~TIOCPKT_NOSTOP;
+ pti->pt_send |= TIOCPKT_DOSTOP;
+ pti->pt_flags &= ~PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ } else {
+ if (!stop) {
+ pti->pt_send &= ~TIOCPKT_DOSTOP;
+ pti->pt_send |= TIOCPKT_NOSTOP;
+ pti->pt_flags |= PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ }
+ return (error);
+}
+
+
+static void ptc_drvinit(void *unused);
+
+static void pty_clone(void *arg, char *name, int namelen, dev_t *dev);
+
+static void
+pty_clone(arg, name, namelen, dev)
+ void *arg;
+ char *name;
+ int namelen;
+ dev_t *dev;
+{
+ int u;
+
+ if (*dev != NODEV)
+ return;
+ if (bcmp(name, "pty", 3) != 0)
+ return;
+ if (name[5] != '\0')
+ return;
+ switch (name[3]) {
+ case 'p': u = 0; break;
+ case 'q': u = 32; break;
+ case 'r': u = 64; break;
+ case 's': u = 96; break;
+ case 'P': u = 128; break;
+ case 'Q': u = 160; break;
+ case 'R': u = 192; break;
+ case 'S': u = 224; break;
+ default: return;
+ }
+ if (name[4] >= '0' && name[4] <= '9')
+ u += name[4] - '0';
+ else if (name[4] >= 'a' && name[4] <= 'v')
+ u += name[4] - 'a' + 10;
+ else
+ return;
+ *dev = make_dev(&ptc_cdevsw, u,
+ UID_ROOT, GID_WHEEL, 0666, "pty%c%r", names[u / 32], u % 32);
+ (*dev)->si_flags |= SI_CHEAPCLONE;
+ return;
+}
+
+static void
+ptc_drvinit(unused)
+ void *unused;
+{
+ EVENTHANDLER_REGISTER(dev_clone, pty_clone, 0, 1000);
+ cdevsw_add(&pts_cdevsw);
+ cdevsw_add(&ptc_cdevsw);
+}
+
+SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL)
diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c
new file mode 100644
index 0000000..78bb231
--- /dev/null
+++ b/sys/kern/tty_subr.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * clist support routines
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/tty.h>
+#include <sys/clist.h>
+
+static void clist_init(void *);
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
+
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc(void);
+static void cblock_alloc_cblocks(int number);
+static void cblock_free(struct cblock *cblockp);
+static void cblock_free_cblocks(int number);
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
+{
+ int cbsize = CBSIZE;
+
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount,
+ cfreecount - cslushcount * cbsize, cslushcount * cbsize);
+}
+#endif /* DDB */
+
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
+ /*
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
+ */
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
+}
+
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static __inline struct cblock *
+cblock_alloc()
+{
+ struct cblock *cblockp;
+
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
+}
+
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static __inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
+{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
+
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
+}
+
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
+{
+ int dcbr;
+
+ /*
+ * Allow for wasted space at the head.
+ */
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
+}
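For context, a hedged, illustrative sketch of how a caller is expected to use this allocation policy: a tty-style driver reserves clist storage for each of its queues when it opens and releases it again when it closes. Only clist_alloc_cblocks()/clist_free_cblocks() are real entry points here; the helper names and sizing constants are placeholders.

/*
 * Illustrative sketch only.  Reserve clist space at open time and
 * release it at close; per the comments in this file, both calls must
 * be made at spltty() and in process context.
 */
static void
example_queue_open(struct clist *q)
{
	int s;

	s = spltty();
	/* allow up to 512 bytes, with 256 guaranteed (reserved) */
	clist_alloc_cblocks(q, 512, 256);
	splx(s);
}

static void
example_queue_close(struct clist *q)
{
	int s;

	s = spltty();
	/* the queue must already be drained: c_cbcount must be 0 here */
	clist_free_cblocks(q);
	splx(s);
}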
+
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
+void
+clist_free_cblocks(clistp)
+ struct clist *clistp;
+{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
+
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
+}
+
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist. Return -1 if no
+ * more cblocks are available, or 0 for success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
+
+ /*
+ * If this character is quoted, set the quote bit, if not, clear it.
+ */
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
+ return (0);
+}
+
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
+
+ /*
+ * Avoid allocating an initial cblock and then not using it.
+ * c_cc == 0 must imply c_cbcount == 0.
+ */
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+ * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
+
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so advancing cblockp by one
+ * (to just past the end of this cblock) prepares for the
+ * 'prev = cblockp - 1' assignment at the top of the next
+ * iteration.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
+}
+
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
+char *
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
+{
+ struct cblock *cblockp;
+
+ ++cp;
+ /*
+ * See if the next character is beyond the end of
+ * the clist.
+ */
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((intptr_t)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((intptr_t)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+ *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
+{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
+
+ /*
+ * If there are no more characters on the list, then
+ * free the last cblock.
+ */
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
+void
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
+{
+ int chr, s;
+
+ s = spltty();
+ /*
+ * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
+
+ /*
+ * XXX This should probably be optimized to more than one
+ * character at a time.
+ */
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
+}
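To tie the primitives together, a minimal, hedged usage sketch: stage a buffer into a clist with b_to_q() and drain it back with q_to_b(). It assumes the clist already has a reservation from clist_alloc_cblocks() and that the caller runs in process context; the function name is hypothetical.

/*
 * Illustrative only: push 'len' bytes through a clist and read them back.
 * b_to_q() returns the number of bytes NOT copied; q_to_b() returns the
 * number of bytes copied out.
 */
static int
example_clist_roundtrip(struct clist *cl, char *buf, int len)
{
	char out[CBSIZE];
	int left, got;

	if (len > (int)sizeof(out))
		return (EINVAL);
	left = b_to_q(buf, len, cl);
	if (left != 0)
		return (ENOSPC);	/* reservation/cbmax too small */
	got = q_to_b(cl, out, len);
	return (got == len ? 0 : EIO);
}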
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
new file mode 100644
index 0000000..e1e03bd
--- /dev/null
+++ b/sys/kern/tty_tty.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93
+ * $FreeBSD$
+ */
+
+/*
+ * Indirect driver for controlling tty.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/proc.h>
+#include <sys/ttycom.h>
+#include <sys/vnode.h>
+
+static d_open_t cttyopen;
+static d_read_t cttyread;
+static d_write_t cttywrite;
+static d_ioctl_t cttyioctl;
+static d_poll_t cttypoll;
+
+#define CDEV_MAJOR 1
+
+static struct cdevsw ctty_cdevsw = {
+ /* open */ cttyopen,
+ /* close */ nullclose,
+ /* read */ cttyread,
+ /* write */ cttywrite,
+ /* ioctl */ cttyioctl,
+ /* poll */ cttypoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ "ctty",
+ /* maj */ CDEV_MAJOR,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ D_TTY,
+};
+
+#define cttyvp(td) ((td)->td_proc->p_flag & P_CONTROLT ? (td)->td_proc->p_session->s_ttyvp : NULL)
+
+/*ARGSUSED*/
+static int
+cttyopen(dev, flag, mode, td)
+ dev_t dev;
+ int flag, mode;
+ struct thread *td;
+{
+ struct vnode *ttyvp;
+ int error;
+
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ ttyvp = cttyvp(td);
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+
+ if (ttyvp == NULL)
+ return (ENXIO);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_OPEN(ttyvp, flag, NOCRED, td);
+ VOP_UNLOCK(ttyvp, 0, td);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttyread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ struct thread *td = uio->uio_td;
+ register struct vnode *ttyvp;
+ int error;
+
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ ttyvp = cttyvp(td);
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+
+ if (ttyvp == NULL)
+ return (EIO);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_READ(ttyvp, uio, flag, NOCRED);
+ VOP_UNLOCK(ttyvp, 0, td);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttywrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ struct thread *td = uio->uio_td;
+ struct vnode *ttyvp;
+ struct mount *mp;
+ int error;
+
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ ttyvp = cttyvp(td);
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+
+ if (ttyvp == NULL)
+ return (EIO);
+ mp = NULL;
+ if (ttyvp->v_type != VCHR &&
+ (error = vn_start_write(ttyvp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_WRITE(ttyvp, uio, flag, NOCRED);
+ VOP_UNLOCK(ttyvp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttyioctl(dev, cmd, addr, flag, td)
+ dev_t dev;
+ u_long cmd;
+ caddr_t addr;
+ int flag;
+ struct thread *td;
+{
+ struct vnode *ttyvp;
+ int error;
+
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ ttyvp = cttyvp(td);
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+
+ if (ttyvp == NULL)
+ return (EIO);
+ if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */
+ return EINVAL; /* to controlling tty -- infinite recursion */
+ if (cmd == TIOCNOTTY) {
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ error = 0;
+ if (!SESS_LEADER(td->td_proc))
+ td->td_proc->p_flag &= ~P_CONTROLT;
+ else
+ error = EINVAL;
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+ return (error);
+ }
+ return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, td));
+}
+
+/*ARGSUSED*/
+static int
+cttypoll(dev, events, td)
+ dev_t dev;
+ int events;
+ struct thread *td;
+{
+ struct vnode *ttyvp;
+
+ PROC_LOCK(td->td_proc);
+ SESS_LOCK(td->td_proc->p_session);
+ ttyvp = cttyvp(td);
+ SESS_UNLOCK(td->td_proc->p_session);
+ PROC_UNLOCK(td->td_proc);
+
+ if (ttyvp == NULL)
+ /* try operation to get EOF/failure */
+ return (seltrue(dev, events, td));
+ return (VOP_POLL(ttyvp, events, td->td_ucred, td));
+}
+
+static void ctty_clone(void *arg, char *name, int namelen, dev_t *dev);
+
+static dev_t ctty;
+
+static void
+ctty_clone(void *arg, char *name, int namelen, dev_t *dev)
+{
+ struct vnode *vp;
+
+ if (*dev != NODEV)
+ return;
+ if (strcmp(name, "tty"))
+ return;
+ vp = cttyvp(curthread);
+ if (vp == NULL) {
+ if (ctty)
+ *dev = ctty;
+ } else
+ *dev = vp->v_rdev;
+}
+
+
+static void ctty_drvinit(void *unused);
+static void
+ctty_drvinit(unused)
+ void *unused;
+{
+
+ if (devfs_present) {
+ EVENTHANDLER_REGISTER(dev_clone, ctty_clone, 0, 1000);
+ ctty = make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "ctty");
+ } else {
+ make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "tty");
+ }
+}
+
+SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL)
diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c
new file mode 100644
index 0000000..b31026a
--- /dev/null
+++ b/sys/kern/uipc_accf.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2000 Paycounter, Inc.
+ * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define ACCEPT_FILTER_MOD
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+static SLIST_HEAD(, accept_filter) accept_filtlsthd =
+ SLIST_HEAD_INITIALIZER(&accept_filtlsthd);
+
+MALLOC_DEFINE(M_ACCF, "accf", "accept filter data");
+
+static int unloadable = 0;
+
+SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */
+SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters");
+SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of accept filters (not recommended)");
+
+/*
+ * Must be passed a malloc'd structure so we don't explode if the kld
+ * is unloaded; we leak the struct on deallocation to deal with this.
+ * If a filter is loaded with the same name as a leaked one, we re-use
+ * the entry.
+ */
+int
+accept_filt_add(struct accept_filter *filt)
+{
+ struct accept_filter *p;
+
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, filt->accf_name) == 0) {
+ if (p->accf_callback != NULL) {
+ return (EEXIST);
+ } else {
+ p->accf_callback = filt->accf_callback;
+ FREE(filt, M_ACCF);
+ return (0);
+ }
+ }
+
+ if (p == NULL)
+ SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
+ return (0);
+}
+
+int
+accept_filt_del(char *name)
+{
+ struct accept_filter *p;
+
+ p = accept_filt_get(name);
+ if (p == NULL)
+ return (ENOENT);
+
+ p->accf_callback = NULL;
+ return (0);
+}
+
+struct accept_filter *
+accept_filt_get(char *name)
+{
+ struct accept_filter *p;
+
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, name) == 0)
+ return (p);
+
+ return (NULL);
+}
+
+int
+accept_filt_generic_mod_event(module_t mod, int event, void *data)
+{
+ struct accept_filter *p;
+ struct accept_filter *accfp = (struct accept_filter *) data;
+ int s, error;
+
+ switch (event) {
+ case MOD_LOAD:
+ MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, M_WAITOK);
+ bcopy(accfp, p, sizeof(*p));
+ s = splnet();
+ error = accept_filt_add(p);
+ splx(s);
+ break;
+
+ case MOD_UNLOAD:
+ /*
+ * Do not support unloading yet: we don't keep track of refcounts,
+ * and unloading an accept filter callback and then having it called
+ * is a bad thing. A simple fix would be to track the refcount
+ * in struct accept_filter.
+ */
+ if (unloadable != 0) {
+ s = splnet();
+ error = accept_filt_del(accfp->accf_name);
+ splx(s);
+ } else
+ error = EOPNOTSUPP;
+ break;
+
+ case MOD_SHUTDOWN:
+ error = 0;
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
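For orientation, a hedged sketch of how a filter module plugs into this: it fills in a struct accept_filter and hands accept_filt_generic_mod_event() to the module system. This mirrors the pattern of the in-tree accf_* modules; the "example" names and the callback body are hypothetical.

/* Hypothetical filter: only let accept(2) see the connection when ready. */
static void
example_accf_callback(struct socket *so, void *arg, int waitflag)
{
	/* e.g. check so->so_rcv for data before completing the accept */
}

static struct accept_filter example_accf = {
	"examplefilter",	/* accf_name, selected via SO_ACCEPTFILTER */
	example_accf_callback,	/* accf_callback */
	NULL,			/* accf_create */
	NULL			/* accf_destroy */
};

static moduledata_t example_accf_mod = {
	"accf_example",
	accept_filt_generic_mod_event,
	&example_accf
};

DECLARE_MODULE(accf_example, example_accf_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);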
diff --git a/sys/kern/uipc_cow.c b/sys/kern/uipc_cow.c
new file mode 100644
index 0000000..239e7c5
--- /dev/null
+++ b/sys/kern/uipc_cow.c
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 1997, Duke University
+ * All rights reserved.
+ *
+ * Author:
+ * Andrew Gallatin <gallatin@cs.duke.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgements:
+ * This product includes software developed by Duke University
+ * 4. The name of Duke University may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This is a set of routines for enabling and disabling copy on write
+ * protection for data written into sockets.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#if 0
+#include <vm/vm_pager.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
+#include <vm/swap_pager.h>
+#endif
+
+
+struct netsend_cow_stats {
+ int attempted;
+ int fail_not_mapped;
+ int fail_wired;
+ int fail_not_anon;
+ int fail_pmap_cow;
+ int fail_pg_error;
+ int fail_kva;
+ int free_post_exit;
+ int success;
+ int iodone;
+ int freed;
+};
+
+static struct netsend_cow_stats socow_stats = {0,0,0,0,0,0,0,0,0,0,0};
+
+extern struct sf_buf *sf_bufs;
+extern vm_offset_t sf_base;
+#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
+void sf_buf_free(caddr_t addr, void *args);
+struct sf_buf *sf_buf_alloc(void);
+static void socow_iodone(caddr_t addr, void *args);
+
+static void
+socow_iodone(caddr_t addr, void *args)
+{
+ int s;
+ struct sf_buf *sf;
+
+ vm_offset_t paddr;
+ vm_page_t pp;
+
+ sf = dtosf(addr);
+ paddr = vtophys((vm_offset_t)addr);
+ pp = PHYS_TO_VM_PAGE(paddr);
+ s = splvm();
+ /* remove COW mapping */
+ vm_page_cowclear(pp);
+ vm_object_deallocate(pp->object);
+ splx(s);
+ /* note that sf_buf_free() unwires the page for us*/
+ sf_buf_free(addr, NULL);
+ socow_stats.iodone++;
+}
+
+int
+socow_setup(struct mbuf *m0, struct uio *uio)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+ vm_offset_t pa;
+ struct iovec *iov;
+ struct vmspace *vmspace;
+ struct vm_map *map;
+ vm_offset_t uva;
+ int s;
+
+ vmspace = curproc->p_vmspace;
+ map = &vmspace->vm_map;
+ uva = (vm_offset_t) uio->uio_iov->iov_base;
+
+ s = splvm();
+
+ /*
+ * verify page is mapped & not already wired for i/o
+ */
+ socow_stats.attempted++;
+ pa=pmap_extract(map->pmap, uva);
+ if(!pa) {
+ socow_stats.fail_not_mapped++;
+ splx(s);
+ return(0);
+ }
+ pp = PHYS_TO_VM_PAGE(pa);
+
+ sf = sf_buf_alloc();
+ sf->m = pp;
+ pmap_qenter(sf->kva, &pp, 1);
+
+ /*
+ * set up COW
+ */
+ vm_page_cowsetup(pp);
+
+ /*
+ * wire the page for I/O
+ */
+ vm_page_wire(pp);
+
+ /*
+ * prevent the process from exiting on us.
+ */
+ vm_object_reference(pp->object);
+
+ /*
+ * attach to mbuf
+ */
+ m0->m_data = (caddr_t)sf->kva;
+ m0->m_len = PAGE_SIZE;
+ MEXTADD(m0, sf->kva, PAGE_SIZE, socow_iodone, NULL, 0, EXT_SFBUF);
+ socow_stats.success++;
+
+ iov = uio->uio_iov;
+ iov->iov_base += PAGE_SIZE;
+ iov->iov_len -= PAGE_SIZE;
+ uio->uio_resid -= PAGE_SIZE;
+ uio->uio_offset += PAGE_SIZE;
+ if (iov->iov_len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+
+ splx(s);
+ return(1);
+}
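A hedged sketch of the intended caller side (nothing below is defined in this file): a socket send path tries socow_setup() on a page-sized chunk of user data and falls back to ordinary copying when it returns 0. The function name and fallback details are illustrative.

/*
 * Illustrative only: zero-copy one page of user data into an mbuf if
 * possible, otherwise fall back to the normal copy path.
 */
static struct mbuf *
example_send_page(struct uio *uio)
{
	struct mbuf *m;

	MGETHDR(m, M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	if (uio->uio_iov->iov_len >= PAGE_SIZE && socow_setup(m, uio) != 0)
		return (m);	/* page is wired, COW-protected, attached */
	/* fallback: copy into ordinary mbuf storage */
	m->m_len = min(MHLEN, uio->uio_resid);
	if (uiomove(mtod(m, caddr_t), m->m_len, uio) != 0) {
		m_freem(m);
		return (NULL);
	}
	return (m);
}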
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 0000000..b8321eb
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socketvar.h>
+#include <sys/systm.h>
+#include <vm/uma.h>
+
+/*
+ * System initialization
+ *
+ * Note: domain initialization takes place on a per domain basis
+ * as a result of traversing a SYSINIT linker set. Most likely,
+ * each domain would want to call DOMAIN_SET(9) itself, which
+ * would cause the domain to be added just after domaininit()
+ * is called during startup.
+ *
+ * See DOMAIN_SET(9) for details on its use.
+ */
+
+static void domaininit(void *);
+SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL)
+
+static struct callout pffast_callout;
+static struct callout pfslow_callout;
+
+static void pffasttimo(void *);
+static void pfslowtimo(void *);
+
+struct domain *domains;
+
+/*
+ * Initialize a new protocol domain: run its dom_init and pr_init hooks.
+ * Note: you can't unload a domain again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+static void
+net_init_domain(struct domain *dp)
+{
+ register struct protosw *pr;
+ int s;
+
+ s = splnet();
+ if (dp->dom_init)
+ (*dp->dom_init)();
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){
+ if (pr->pr_usrreqs == 0)
+ panic("domaininit: %ssw[%d] has no usrreqs!",
+ dp->dom_name,
+ (int)(pr - dp->dom_protosw));
+ if (pr->pr_init)
+ (*pr->pr_init)();
+ }
+ /*
+ * Update global information about maximums.
+ */
+ max_hdr = max_linkhdr + max_protohdr;
+ max_datalen = MHLEN - max_hdr;
+ splx(s);
+}
+
+/*
+ * Add a new protocol domain to the list of supported domains
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+void
+net_add_domain(void *data)
+{
+ int s;
+ struct domain *dp;
+
+ dp = (struct domain *)data;
+ s = splnet();
+ dp->dom_next = domains;
+ domains = dp;
+ splx(s);
+ net_init_domain(dp);
+}
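As the comments above note, protocol families normally do not call net_add_domain() directly; they declare a struct domain and let DOMAIN_SET(9) generate the SYSINIT that registers it. A hedged, schematic sketch follows: the family value and names are placeholders, the protosw range is left empty for brevity, and the field order mirrors the in-tree families.

/*
 * Hypothetical protocol family registration.  A real family fills in
 * the protosw array, and every entry must set pr_usrreqs or
 * net_init_domain() panics.
 */
static struct protosw exampleprotosw[1];

struct domain exampledomain = {
	123,			/* dom_family: placeholder AF_* value */
	"example",		/* dom_name */
	NULL,			/* dom_init */
	NULL,			/* dom_externalize */
	NULL,			/* dom_dispose */
	&exampleprotosw[0],	/* dom_protosw */
	&exampleprotosw[0]	/* dom_protoswNPROTOSW: empty range here */
};

DOMAIN_SET(example);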
+
+/* ARGSUSED*/
+static void
+domaininit(void *dummy)
+{
+ /*
+ * Before we do any setup, make sure to initialize the
+ * zone allocator we get struct sockets from.
+ */
+
+ socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(socket_zone, maxsockets);
+
+ if (max_linkhdr < 16) /* XXX */
+ max_linkhdr = 16;
+
+ callout_init(&pffast_callout, 0);
+ callout_init(&pfslow_callout, 0);
+
+ callout_reset(&pffast_callout, 1, pffasttimo, NULL);
+ callout_reset(&pfslow_callout, 1, pfslowtimo, NULL);
+}
+
+
+struct protosw *
+pffindtype(family, type)
+ int family;
+ int type;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ goto found;
+ return (0);
+found:
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_type && pr->pr_type == type)
+ return (pr);
+ return (0);
+}
+
+struct protosw *
+pffindproto(family, protocol, type)
+ int family;
+ int protocol;
+ int type;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+ struct protosw *maybe = 0;
+
+ if (family == 0)
+ return (0);
+ for (dp = domains; dp; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ goto found;
+ return (0);
+found:
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
+ return (pr);
+
+ if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
+ pr->pr_protocol == 0 && maybe == (struct protosw *)0)
+ maybe = pr;
+ }
+ return (maybe);
+}
+
+void
+pfctlinput(cmd, sa)
+ int cmd;
+ struct sockaddr *sa;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, (void *)0);
+}
+
+void
+pfctlinput2(cmd, sa, ctlparam)
+ int cmd;
+ struct sockaddr *sa;
+ void *ctlparam;
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ if (!sa)
+ return;
+ for (dp = domains; dp; dp = dp->dom_next) {
+ /*
+ * The check must be made by xx_ctlinput() anyway, to
+ * make sure we use the data item pointed to by ctlparam in the
+ * correct way. The following check is made just for safety.
+ */
+ if (dp->dom_family != sa->sa_family)
+ continue;
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, ctlparam);
+ }
+}
+
+static void
+pfslowtimo(arg)
+ void *arg;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_slowtimo)
+ (*pr->pr_slowtimo)();
+ callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL);
+}
+
+static void
+pffasttimo(arg)
+ void *arg;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_fasttimo)
+ (*pr->pr_fasttimo)();
+ callout_reset(&pffast_callout, hz/5, pffasttimo, NULL);
+}
diff --git a/sys/kern/uipc_jumbo.c b/sys/kern/uipc_jumbo.c
new file mode 100644
index 0000000..4625752
--- /dev/null
+++ b/sys/kern/uipc_jumbo.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 1997, Duke University
+ * All rights reserved.
+ *
+ * Author:
+ * Andrew Gallatin <gallatin@cs.duke.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgements:
+ * This product includes software developed by Duke University
+ * 4. The name of Duke University may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This is a set of routines for allocating large-sized mbuf payload
+ * areas, and is primarily intended for use in receive side mbuf
+ * allocation.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/sockio.h>
+#include <sys/uio.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <sys/proc.h>
+#include <sys/jumbo.h>
+
+/*
+ * XXX this may be too high or too low.
+ */
+#define JUMBO_MAX_PAGES 3072
+
+struct jumbo_kmap {
+ vm_offset_t kva;
+ SLIST_ENTRY(jumbo_kmap) entries; /* Singly-linked List. */
+};
+
+static SLIST_HEAD(jumbo_kmap_head, jumbo_kmap) jumbo_kmap_free,
+ jumbo_kmap_inuse;
+
+static struct mtx jumbo_mutex;
+MTX_SYSINIT(jumbo_lock, &jumbo_mutex, "jumbo mutex", MTX_DEF);
+
+static struct vm_object *jumbo_vm_object;
+static unsigned long jumbo_vmuiomove_pgs_freed = 0;
+#if 0
+static int jumbo_vm_wakeup_wanted = 0;
+#endif
+vm_offset_t jumbo_basekva;
+
+int
+jumbo_vm_init(void)
+{
+ int i;
+ struct jumbo_kmap *entry;
+
+ mtx_lock(&jumbo_mutex);
+
+ if (jumbo_vm_object != NULL) {
+ mtx_unlock(&jumbo_mutex);
+ return (1);
+ }
+
+ /* allocate our object */
+ jumbo_vm_object = vm_object_allocate_wait(OBJT_DEFAULT, JUMBO_MAX_PAGES,
+ M_NOWAIT);
+
+ if (jumbo_vm_object == NULL) {
+ mtx_unlock(&jumbo_mutex);
+ return (0);
+ }
+
+ SLIST_INIT(&jumbo_kmap_free);
+ SLIST_INIT(&jumbo_kmap_inuse);
+
+ /* grab some kernel virtual address space */
+ jumbo_basekva = kmem_alloc_pageable(kernel_map,
+ PAGE_SIZE * JUMBO_MAX_PAGES);
+ if (jumbo_basekva == 0) {
+ vm_object_deallocate(jumbo_vm_object);
+ jumbo_vm_object = NULL;
+ mtx_unlock(&jumbo_mutex);
+ return 0;
+ }
+ for (i = 0; i < JUMBO_MAX_PAGES; i++) {
+ entry = malloc(sizeof(struct jumbo_kmap), M_TEMP, M_NOWAIT);
+ if (!entry && !i) {
+ mtx_unlock(&jumbo_mutex);
+ panic("jumbo_vm_init: unable to allocated kvas");
+ } else if (!entry) {
+ printf("warning: jumbo_vm_init allocated only %d kva\n",
+ i);
+ mtx_unlock(&jumbo_mutex);
+ return 1;
+ }
+ entry->kva = jumbo_basekva + (vm_offset_t)i * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries);
+ }
+ mtx_unlock(&jumbo_mutex);
+ return 1;
+}
+
+void
+jumbo_freem(caddr_t addr, void *args)
+{
+ vm_page_t frame;
+
+ frame = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)addr));
+
+ /*
+ * Need giant for looking at the hold count below. Convert this
+ * to the vm mutex once the VM code has been moved out from under
+ * giant.
+ */
+ GIANT_REQUIRED;
+
+ if (frame->hold_count == 0)
+ jumbo_pg_free((vm_offset_t)addr);
+ else
+ printf("jumbo_freem: hold count for %p is %d!!??\n",
+ frame, frame->hold_count);
+}
+
+void
+jumbo_pg_steal(vm_page_t pg)
+{
+ vm_offset_t addr;
+ struct jumbo_kmap *entry;
+
+ addr = ptoa(pg->pindex) + jumbo_basekva;
+
+ if (pg->object != jumbo_vm_object)
+ panic("stealing a non jumbo_vm_object page");
+ vm_page_remove(pg);
+
+ mtx_lock(&jumbo_mutex);
+
+ pmap_qremove(addr,1);
+ entry = SLIST_FIRST(&jumbo_kmap_inuse);
+ entry->kva = addr;
+ SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries);
+ SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries);
+
+ mtx_unlock(&jumbo_mutex);
+
+#if 0
+ if (jumbo_vm_wakeup_wanted)
+ wakeup(jumbo_vm_object);
+#endif
+}
+
+
+vm_page_t
+jumbo_pg_alloc(void)
+{
+ vm_page_t pg;
+ vm_pindex_t pindex;
+ struct jumbo_kmap *entry;
+
+ pg = NULL;
+ mtx_lock(&jumbo_mutex);
+
+ entry = SLIST_FIRST(&jumbo_kmap_free);
+ if (entry != NULL){
+ pindex = atop(entry->kva - jumbo_basekva);
+ pg = vm_page_alloc(jumbo_vm_object, pindex, VM_ALLOC_INTERRUPT);
+ if (pg != NULL) {
+ SLIST_REMOVE_HEAD(&jumbo_kmap_free, entries);
+ SLIST_INSERT_HEAD(&jumbo_kmap_inuse, entry, entries);
+ pmap_qenter(entry->kva, &pg, 1);
+ }
+ }
+ mtx_unlock(&jumbo_mutex);
+ return(pg);
+}
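A hedged sketch of how a receive path might consume this API (nothing below is defined in this file): allocate a page, point an mbuf's external storage at its KVA, and let jumbo_freem() reclaim it. The MEXTADD argument order follows the socow_setup() call earlier in this commit; the EXT_DISPOSABLE type and the driver-side framing are assumptions.

/* Illustrative only: back an mbuf with one jumbo page. */
static struct mbuf *
example_jumbo_mbuf(void)
{
	struct mbuf *m;
	vm_page_t pg;
	caddr_t va;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	pg = jumbo_pg_alloc();
	if (pg == NULL) {
		m_freem(m);
		return (NULL);
	}
	/* the page's KVA is fixed by its pindex, as in jumbo_pg_steal() */
	va = (caddr_t)(jumbo_basekva + ptoa(pg->pindex));
	MEXTADD(m, va, PAGE_SIZE, jumbo_freem, NULL, 0, EXT_DISPOSABLE);
	m->m_data = va;
	m->m_len = 0;		/* filled in by the driver after DMA */
	return (m);
}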
+
+void
+jumbo_pg_free(vm_offset_t addr)
+{
+ struct jumbo_kmap *entry;
+ vm_offset_t paddr;
+ vm_page_t pg;
+
+ paddr = pmap_kextract((vm_offset_t)addr);
+ pg = PHYS_TO_VM_PAGE(paddr);
+
+ if (pg->object != jumbo_vm_object) {
+ jumbo_vmuiomove_pgs_freed++;
+/* if(vm_page_lookup(jumbo_vm_object, atop(addr - jumbo_basekva)))
+ panic("vm_page_rename didn't");
+ printf("freeing uiomoved pg:\t pindex = %d, padd = 0x%lx\n",
+ atop(addr - jumbo_basekva), paddr);
+*/
+ } else {
+ vm_page_busy(pg); /* vm_page_free wants pages to be busy*/
+ vm_page_free(pg);
+ }
+
+ mtx_lock(&jumbo_mutex);
+
+ pmap_qremove(addr,1);
+ entry = SLIST_FIRST(&jumbo_kmap_inuse);
+ entry->kva = addr;
+ SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries);
+ SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries);
+
+ mtx_unlock(&jumbo_mutex);
+
+#if 0
+ if (jumbo_vm_wakeup_wanted)
+ wakeup(jumbo_vm_object);
+#endif
+}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
new file mode 100644
index 0000000..27ca156
--- /dev/null
+++ b/sys/kern/uipc_mbuf.c
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+
+int max_linkhdr;
+int max_protohdr;
+int max_hdr;
+int max_datalen;
+
+/*
+ * sysctl(8) exported objects
+ */
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
+ &max_linkhdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
+ &max_protohdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
+ &max_datalen, 0, "");
+
+/*
+ * struct mbuf *
+ * m_getm(m, len, how, type)
+ *
+ * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If m is
+ * non-null, then we assume that it is a single mbuf or an mbuf chain to
+ * which we want len bytes worth of mbufs and/or clusters attached, and so
+ * if we succeed in allocating it, we will just return a pointer to m.
+ *
+ * If we happen to fail at any point during the allocation, we will free
+ * up everything we have already allocated and return NULL.
+ *
+ */
+struct mbuf *
+m_getm(struct mbuf *m, int len, int how, int type)
+{
+ struct mbuf *top, *tail, *mp, *mtail = NULL;
+
+ KASSERT(len >= 0, ("len is < 0 in m_getm"));
+
+ MGET(mp, how, type);
+ if (mp == NULL)
+ return (NULL);
+ else if (len > MINCLSIZE) {
+ MCLGET(mp, how);
+ if ((mp->m_flags & M_EXT) == 0) {
+ m_free(mp);
+ return (NULL);
+ }
+ }
+ mp->m_len = 0;
+ len -= M_TRAILINGSPACE(mp);
+
+ if (m != NULL)
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
+ else
+ m = mp;
+
+ top = tail = mp;
+ while (len > 0) {
+ MGET(mp, how, type);
+ if (mp == NULL)
+ goto failed;
+
+ tail->m_next = mp;
+ tail = mp;
+ if (len > MINCLSIZE) {
+ MCLGET(mp, how);
+ if ((mp->m_flags & M_EXT) == 0)
+ goto failed;
+ }
+
+ mp->m_len = 0;
+ len -= M_TRAILINGSPACE(mp);
+ }
+
+ if (mtail != NULL)
+ mtail->m_next = top;
+ return (m);
+
+failed:
+ m_freem(top);
+ return (NULL);
+}
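+
+/*
+ * Illustrative usage sketch (hypothetical caller, not from the original
+ * sources; "totlen" is assumed to hold the number of bytes needed).
+ * Allocating a fresh chain large enough for totlen bytes:
+ */
+#if 0
+	struct mbuf *m;
+
+	m = m_getm(NULL, totlen, M_DONTWAIT, MT_DATA);
+	if (m == NULL)
+		return (ENOBUFS);	/* nothing is left allocated */
+	/* fill the chain, bumping m_len of each mbuf as data is added */
+#endif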
+
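+/*
+ * Free an entire mbuf chain, one mbuf at a time.
+ */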
+void
+m_freem(struct mbuf *m)
+{
+ while (m) {
+ m = m_free(m);
+ }
+}
+
+/*
+ * Lesser-used path for M_PREPEND:
+ * allocate new mbuf to prepend to chain,
+ * copy junk along.
+ */
+struct mbuf *
+m_prepend(struct mbuf *m, int len, int how)
+{
+ struct mbuf *mn;
+
+ MGET(mn, how, m->m_type);
+ if (mn == NULL) {
+ m_freem(m);
+ return (NULL);
+ }
+ if (m->m_flags & M_PKTHDR) {
+ M_COPY_PKTHDR(mn, m);
+ m->m_flags &= ~M_PKTHDR;
+ }
+ mn->m_next = m;
+ m = mn;
+ if (len < MHLEN)
+ MH_ALIGN(m, len);
+ m->m_len = len;
+ return (m);
+}
+
+/*
+ * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
+ * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
+ * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ */
+struct mbuf *
+m_copym(struct mbuf *m, int off0, int len, int wait)
+{
+ struct mbuf *n, **np;
+ int off = off0;
+ struct mbuf *top;
+ int copyhdr = 0;
+
+ KASSERT(off >= 0, ("m_copym, negative off %d", off));
+ KASSERT(len >= 0, ("m_copym, negative len %d", len));
+ if (off == 0 && m->m_flags & M_PKTHDR)
+ copyhdr = 1;
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ np = &top;
+ top = 0;
+ while (len > 0) {
+ if (m == NULL) {
+ KASSERT(len == M_COPYALL,
+ ("m_copym, length > size of mbuf chain"));
+ break;
+ }
+ MGET(n, wait, m->m_type);
+ *np = n;
+ if (n == NULL)
+ goto nospace;
+ if (copyhdr) {
+ M_COPY_PKTHDR(n, m);
+ if (len == M_COPYALL)
+ n->m_pkthdr.len -= off0;
+ else
+ n->m_pkthdr.len = len;
+ copyhdr = 0;
+ }
+ n->m_len = min(len, m->m_len - off);
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + off;
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ MEXT_ADD_REF(m);
+ } else
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (unsigned)n->m_len);
+ if (len != M_COPYALL)
+ len -= n->m_len;
+ off = 0;
+ m = m->m_next;
+ np = &n->m_next;
+ }
+ if (top == NULL)
+ mbstat.m_mcfail++; /* XXX: No consistency. */
+
+ return (top);
+nospace:
+ m_freem(top);
+ mbstat.m_mcfail++; /* XXX: No consistency. */
+ return (NULL);
+}
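+
+/*
+ * Illustrative usage sketch (hypothetical caller, not from the original
+ * sources).  A reliable protocol that must keep "m" queued for possible
+ * retransmission would transmit a read-only copy of the whole chain:
+ */
+#if 0
+	struct mbuf *n;
+
+	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
+	if (n == NULL)
+		return (ENOBUFS);	/* "m" itself is untouched */
+	/* hand "n" to the output path; clusters are shared, not duplicated */
+#endif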
+
+/*
+ * Copy an entire packet, including header (which must be present).
+ * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ * Preserve alignment of the first mbuf so if the creator has left
+ * some room at the beginning (e.g. for inserting protocol headers)
+ * the copies still have the room available.
+ */
+struct mbuf *
+m_copypacket(struct mbuf *m, int how)
+{
+ struct mbuf *top, *n, *o;
+
+ MGET(n, how, m->m_type);
+ top = n;
+ if (n == NULL)
+ goto nospace;
+
+ M_COPY_PKTHDR(n, m);
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ MEXT_ADD_REF(m);
+ } else {
+		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ while (m) {
+ MGET(o, how, m->m_type);
+ if (o == NULL)
+ goto nospace;
+
+ n->m_next = o;
+ n = n->m_next;
+
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ MEXT_ADD_REF(m);
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ }
+ return top;
+nospace:
+ m_freem(top);
+ mbstat.m_mcfail++; /* XXX: No consistency. */
+ return (NULL);
+}
+
+/*
+ * Copy data from an mbuf chain starting "off" bytes from the beginning,
+ * continuing for "len" bytes, into the indicated buffer.
+ */
+void
+m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ unsigned count;
+
+ KASSERT(off >= 0, ("m_copydata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copydata, negative len %d", len));
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
+ count = min(m->m_len - off, len);
+ bcopy(mtod(m, caddr_t) + off, cp, count);
+ len -= count;
+ cp += count;
+ off = 0;
+ m = m->m_next;
+ }
+}
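+
+/*
+ * Illustrative usage sketch (hypothetical caller; "struct example_hdr" and
+ * "off" are made-up names).  Pulling a small header into a local buffer,
+ * no matter how the chain happens to be fragmented:
+ */
+#if 0
+	struct example_hdr hdr;
+
+	/* the caller must know the chain holds off + sizeof(hdr) bytes */
+	m_copydata(m, off, sizeof(hdr), (caddr_t)&hdr);
+#endif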
+
+/*
+ * Copy a packet header mbuf chain into a completely new chain, including
+ * copying any mbuf clusters. Use this instead of m_copypacket() when
+ * you need a writable copy of an mbuf chain.
+ */
+struct mbuf *
+m_dup(struct mbuf *m, int how)
+{
+ struct mbuf **p, *top = NULL;
+ int remain, moff, nsize;
+
+ /* Sanity check */
+ if (m == NULL)
+ return (NULL);
+ KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
+
+ /* While there's more data, get a new mbuf, tack it on, and fill it */
+ remain = m->m_pkthdr.len;
+ moff = 0;
+ p = &top;
+ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
+ struct mbuf *n;
+
+ /* Get the next new mbuf */
+ MGET(n, how, m->m_type);
+ if (n == NULL)
+ goto nospace;
+ if (top == NULL) { /* first one, must be PKTHDR */
+ M_COPY_PKTHDR(n, m);
+ nsize = MHLEN;
+ } else /* not the first one */
+ nsize = MLEN;
+ if (remain >= MINCLSIZE) {
+ MCLGET(n, how);
+ if ((n->m_flags & M_EXT) == 0) {
+ (void)m_free(n);
+ goto nospace;
+ }
+ nsize = MCLBYTES;
+ }
+ n->m_len = 0;
+
+ /* Link it into the new chain */
+ *p = n;
+ p = &n->m_next;
+
+ /* Copy data from original mbuf(s) into new mbuf */
+ while (n->m_len < nsize && m != NULL) {
+ int chunk = min(nsize - n->m_len, m->m_len - moff);
+
+ bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
+ moff += chunk;
+ n->m_len += chunk;
+ remain -= chunk;
+ if (moff == m->m_len) {
+ m = m->m_next;
+ moff = 0;
+ }
+ }
+
+ /* Check correct total mbuf length */
+ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
+ ("%s: bogus m_pkthdr.len", __func__));
+ }
+ return (top);
+
+nospace:
+ m_freem(top);
+ mbstat.m_mcfail++; /* XXX: No consistency. */
+ return (NULL);
+}
+
+/*
+ * Concatenate mbuf chain n to m.
+ * Both chains must be of the same type (e.g. MT_DATA).
+ * Any m_pkthdr is not updated.
+ */
+void
+m_cat(struct mbuf *m, struct mbuf *n)
+{
+ while (m->m_next)
+ m = m->m_next;
+ while (n) {
+ if (m->m_flags & M_EXT ||
+ m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
+ /* just join the two chains */
+ m->m_next = n;
+ return;
+ }
+ /* splat the data from one into the other */
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)n->m_len);
+ m->m_len += n->m_len;
+ n = m_free(n);
+ }
+}
+
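+/*
+ * Trim req_len bytes of data from an mbuf chain: from the head if req_len
+ * is positive, from the tail if it is negative.  If the chain carries a
+ * packet header, its total length is adjusted as well.
+ */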
+void
+m_adj(struct mbuf *mp, int req_len)
+{
+ int len = req_len;
+ struct mbuf *m;
+ int count;
+
+ if ((m = mp) == NULL)
+ return;
+ if (len >= 0) {
+ /*
+ * Trim from head.
+ */
+ while (m != NULL && len > 0) {
+ if (m->m_len <= len) {
+ len -= m->m_len;
+ m->m_len = 0;
+ m = m->m_next;
+ } else {
+ m->m_len -= len;
+ m->m_data += len;
+ len = 0;
+ }
+ }
+ m = mp;
+ if (mp->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= (req_len - len);
+ } else {
+ /*
+ * Trim from tail. Scan the mbuf chain,
+ * calculating its length and finding the last mbuf.
+ * If the adjustment only affects this mbuf, then just
+ * adjust and return. Otherwise, rescan and truncate
+ * after the remaining size.
+ */
+ len = -len;
+ count = 0;
+ for (;;) {
+ count += m->m_len;
+ if (m->m_next == (struct mbuf *)0)
+ break;
+ m = m->m_next;
+ }
+ if (m->m_len >= len) {
+ m->m_len -= len;
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= len;
+ return;
+ }
+ count -= len;
+ if (count < 0)
+ count = 0;
+ /*
+ * Correct length for chain is "count".
+ * Find the mbuf with last data, adjust its length,
+ * and toss data from remaining mbufs on chain.
+ */
+ m = mp;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len = count;
+ for (; m; m = m->m_next) {
+ if (m->m_len >= count) {
+ m->m_len = count;
+ break;
+ }
+ count -= m->m_len;
+ }
+ while (m->m_next)
+			(m = m->m_next)->m_len = 0;
+ }
+}
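+
+/*
+ * Illustrative usage sketch (hypothetical caller; "hdrlen" and "padlen"
+ * are made-up names).  A positive count trims from the front of the
+ * chain, a negative count trims from the tail:
+ */
+#if 0
+	m_adj(m, hdrlen);	/* strip a leading header */
+	m_adj(m, -padlen);	/* drop trailing padding */
+#endif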
+
+/*
+ * Rearrange an mbuf chain so that len bytes are contiguous
+ * and in the data area of an mbuf (so that mtod and dtom
+ * will work for a structure of size len). Returns the resulting
+ * mbuf chain on success, frees it and returns null on failure.
+ * If there is room, it will add up to max_protohdr-len extra bytes to the
+ * contiguous region in an attempt to avoid being called next time.
+ */
+struct mbuf *
+m_pullup(struct mbuf *n, int len)
+{
+ struct mbuf *m;
+ int count;
+ int space;
+
+ /*
+ * If first mbuf has no cluster, and has room for len bytes
+ * without shifting current data, pullup into it,
+ * otherwise allocate a new mbuf to prepend to the chain.
+ */
+ if ((n->m_flags & M_EXT) == 0 &&
+ n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
+ if (n->m_len >= len)
+ return (n);
+ m = n;
+ n = n->m_next;
+ len -= m->m_len;
+ } else {
+ if (len > MHLEN)
+ goto bad;
+ MGET(m, M_DONTWAIT, n->m_type);
+ if (m == NULL)
+ goto bad;
+ m->m_len = 0;
+ if (n->m_flags & M_PKTHDR) {
+ M_COPY_PKTHDR(m, n);
+ n->m_flags &= ~M_PKTHDR;
+ }
+ }
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (unsigned)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+bad:
+ m_freem(n);
+ mbstat.m_mpfail++; /* XXX: No consistency. */
+ return (NULL);
+}
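+
+/*
+ * Illustrative usage sketch (hypothetical caller; "struct example_hdr" is
+ * a made-up name and must fit in MHLEN).  The classical pattern before
+ * casting the front of a chain to a structure:
+ */
+#if 0
+	struct example_hdr *eh;
+
+	if (m->m_len < sizeof(*eh) &&
+	    (m = m_pullup(m, sizeof(*eh))) == NULL)
+		return;			/* the chain was freed for us */
+	eh = mtod(m, struct example_hdr *);
+#endif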
+
+/*
+ * Partition an mbuf chain in two pieces, returning the tail --
+ * all but the first len0 bytes. In case of failure, it returns NULL and
+ * attempts to restore the chain to its original state.
+ *
+ * Note that the resulting mbufs might be read-only, because the new
+ * mbuf can end up sharing an mbuf cluster with the original mbuf if
+ * the "breaking point" happens to lie within a cluster mbuf. Use the
+ * M_WRITABLE() macro to check for this case.
+ */
+struct mbuf *
+m_split(struct mbuf *m0, int len0, int wait)
+{
+ struct mbuf *m, *n;
+ unsigned len = len0, remain;
+
+ for (m = m0; m && len > m->m_len; m = m->m_next)
+ len -= m->m_len;
+ if (m == NULL)
+ return (NULL);
+ remain = m->m_len - len;
+ if (m0->m_flags & M_PKTHDR) {
+ MGETHDR(n, wait, m0->m_type);
+ if (n == NULL)
+ return (NULL);
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ if (m->m_flags & M_EXT)
+ goto extpacket;
+ if (remain > MHLEN) {
+ /* m can't be the lead packet */
+ MH_ALIGN(n, 0);
+ n->m_next = m_split(m, len, wait);
+ if (n->m_next == NULL) {
+ (void) m_free(n);
+ return (NULL);
+ } else {
+ n->m_len = 0;
+ return (n);
+ }
+ } else
+ MH_ALIGN(n, remain);
+ } else if (remain == 0) {
+ n = m->m_next;
+ m->m_next = NULL;
+ return (n);
+ } else {
+ MGET(n, wait, m->m_type);
+ if (n == NULL)
+ return (NULL);
+ M_ALIGN(n, remain);
+ }
+extpacket:
+ if (m->m_flags & M_EXT) {
+ n->m_flags |= M_EXT;
+ n->m_ext = m->m_ext;
+ MEXT_ADD_REF(m);
+ n->m_data = m->m_data + len;
+ } else {
+ bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
+ }
+ n->m_len = remain;
+ m->m_len = len;
+ n->m_next = m->m_next;
+ m->m_next = NULL;
+ return (n);
+}
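+
+/*
+ * Illustrative usage sketch (hypothetical caller; "len0" is a made-up
+ * name).  Splitting a packet so that the first len0 bytes stay in "m" and
+ * the remainder is handled separately:
+ */
+#if 0
+	struct mbuf *tail;
+
+	tail = m_split(m, len0, M_DONTWAIT);
+	if (tail == NULL)
+		return (ENOBUFS);	/* "m" is left intact (best effort) */
+	/* "m" now carries len0 bytes, "tail" carries the rest */
+#endif
+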
+/*
+ * Routine to copy from device local memory into mbufs.
+ * Note that the `off' argument is the offset into the first mbuf of the
+ * target chain at which to begin copying the data.
+ */
+struct mbuf *
+m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
+ void (*copy)(char *from, caddr_t to, u_int len))
+{
+ struct mbuf *m;
+ struct mbuf *top = 0, **mp = &top;
+ int len;
+
+ if (off < 0 || off > MHLEN)
+ return (NULL);
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
+ len = MHLEN;
+
+ while (totlen > 0) {
+ if (top) {
+ MGET(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ m_freem(top);
+ return (NULL);
+ }
+ len = MLEN;
+ }
+ if (totlen + off >= MINCLSIZE) {
+ MCLGET(m, M_DONTWAIT);
+ if (m->m_flags & M_EXT)
+ len = MCLBYTES;
+ } else {
+ /*
+ * Place initial small packet/header at end of mbuf.
+ */
+ if (top == NULL && totlen + off + max_linkhdr <= len) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (off) {
+ m->m_data += off;
+ len -= off;
+ off = 0;
+ }
+ m->m_len = len = min(totlen, len);
+ if (copy)
+ copy(buf, mtod(m, caddr_t), (unsigned)len);
+ else
+ bcopy(buf, mtod(m, caddr_t), (unsigned)len);
+ buf += len;
+ *mp = m;
+ mp = &m->m_next;
+ totlen -= len;
+ }
+ return (top);
+}
+
+/*
+ * Copy data from a buffer back into the indicated mbuf chain,
+ * starting "off" bytes from the beginning, extending the mbuf
+ * chain if necessary.
+ */
+void
+m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
+{
+ int mlen;
+ struct mbuf *m = m0, *n;
+ int totlen = 0;
+
+ if (m0 == NULL)
+ return;
+ while (off > (mlen = m->m_len)) {
+ off -= mlen;
+ totlen += mlen;
+ if (m->m_next == NULL) {
+ n = m_get_clrd(M_DONTWAIT, m->m_type);
+ if (n == NULL)
+ goto out;
+ n->m_len = min(MLEN, len + off);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+ while (len > 0) {
+ mlen = min (m->m_len - off, len);
+ bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
+ cp += mlen;
+ len -= mlen;
+ mlen += off;
+ off = 0;
+ totlen += mlen;
+ if (len == 0)
+ break;
+ if (m->m_next == NULL) {
+ n = m_get(M_DONTWAIT, m->m_type);
+ if (n == NULL)
+ break;
+ n->m_len = min(MLEN, len);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
+ m->m_pkthdr.len = totlen;
+}
+
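+/*
+ * Debugging aid: dump each mbuf of a packet header chain to the console.
+ */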
+void
+m_print(const struct mbuf *m)
+{
+ int len;
+ const struct mbuf *m2;
+
+ len = m->m_pkthdr.len;
+ m2 = m;
+ while (len) {
+ printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
+ len -= m2->m_len;
+ m2 = m2->m_next;
+ }
+ return;
+}
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
new file mode 100644
index 0000000..37ee53e
--- /dev/null
+++ b/sys/kern/uipc_mbuf2.c
@@ -0,0 +1,404 @@
+/* $FreeBSD$ */
+/* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */
+/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */
+
+/*
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
+ */
+
+/*#define PULLDOWN_DEBUG*/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+
+/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */
+static struct mbuf *m_dup1(struct mbuf *, int, int, int);
+
+/*
+ * ensure that [off, off + len) is contiguous on the mbuf chain "m".
+ * packet chain before "off" is kept untouched.
+ * if offp == NULL, the target will start at <retval, 0> on resulting chain.
+ * if offp != NULL, the target will start at <retval, *offp> on resulting chain.
+ *
+ * on error return (NULL return value), original "m" will be freed.
+ *
+ * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf.
+ */
+struct mbuf *
+m_pulldown(struct mbuf *m, int off, int len, int *offp)
+{
+ struct mbuf *n, *o;
+ int hlen, tlen, olen;
+ int writable;
+
+ /* check invalid arguments. */
+ if (m == NULL)
+ panic("m == NULL in m_pulldown()");
+ if (len > MCLBYTES) {
+ m_freem(m);
+ return NULL; /* impossible */
+ }
+
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("before:");
+ for (t = m; t; t = t->m_next)
+ printf(" %d", t->m_len);
+ printf("\n");
+ }
+#endif
+ n = m;
+ while (n != NULL && off > 0) {
+ if (n->m_len > off)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+	/* be sure to point at a non-empty mbuf */
+ while (n != NULL && n->m_len == 0)
+ n = n->m_next;
+ if (!n) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * XXX: This code is flawed because it considers a "writable" mbuf
+ * data region to require all of the following:
+ * (i) mbuf _has_ to have M_EXT set; if it is just a regular
+ * mbuf, it is still not considered "writable."
+ * (ii) since mbuf has M_EXT, the ext_type _has_ to be
+ * EXT_CLUSTER. Anything else makes it non-writable.
+ * (iii) M_WRITABLE() must evaluate true.
+ * Ideally, the requirement should only be (iii).
+ *
+ * If we're writable, we're sure we're writable, because the ref. count
+	 * cannot increase from 1, as that would require possession of mbuf
+	 * n by someone else (which is impossible). However, if we're _not_
+	 * writable, we may eventually become writable (if the ref. count drops
+ * to 1), but we'll fail to notice it unless we re-evaluate
+ * M_WRITABLE(). For now, we only evaluate once at the beginning and
+ * live with this.
+ */
+ /*
+ * XXX: This is dumb. If we're just a regular mbuf with no M_EXT,
+ * then we're not "writable," according to this code.
+ */
+ writable = 0;
+ if ((n->m_flags & M_EXT) == 0 ||
+ (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n)))
+ writable = 1;
+
+ /*
+ * the target data is on <n, off>.
+ * if we got enough data on the mbuf "n", we're done.
+ */
+ if ((off == 0 || offp) && len <= n->m_len - off && writable)
+ goto ok;
+
+ /*
+ * when len <= n->m_len - off and off != 0, it is a special case.
+ * len bytes from <n, off> sits in single mbuf, but the caller does
+ * not like the starting position (off).
+ * chop the current mbuf into two pieces, set off to 0.
+ */
+ if (len <= n->m_len - off) {
+ o = m_dup1(n, off, n->m_len - off, M_DONTWAIT);
+ if (o == NULL) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ n->m_len = off;
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+ * we need to take hlen from <n, off> and tlen from <n->m_next, 0>,
+ * and construct contiguous mbuf with m_len == len.
+ * note that hlen + tlen == len, and tlen > 0.
+ */
+ hlen = n->m_len - off;
+ tlen = len - hlen;
+
+ /*
+ * ensure that we have enough trailing data on mbuf chain.
+ * if not, we can do nothing about the chain.
+ */
+ olen = 0;
+ for (o = n->m_next; o != NULL; o = o->m_next)
+ olen += o->m_len;
+ if (hlen + olen < len) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * easy cases first.
+ * we need to use m_copydata() to get data from <n->m_next, 0>.
+ */
+ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen
+ && writable) {
+ m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len);
+ n->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ goto ok;
+ }
+ if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen
+ && writable) {
+ n->m_next->m_data -= hlen;
+ n->m_next->m_len += hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen);
+ n->m_len -= hlen;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+	 * now, we need to do it the hard way. don't m_copy as there's no room
+	 * on either end.
+ */
+ MGET(o, M_DONTWAIT, m->m_type);
+ if (o && len > MLEN) {
+ MCLGET(o, M_DONTWAIT);
+ if ((o->m_flags & M_EXT) == 0) {
+ m_free(o);
+ o = NULL;
+ }
+ }
+ if (!o) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ /* get hlen from <n, off> into <o, 0> */
+ o->m_len = hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen);
+ n->m_len -= hlen;
+ /* get tlen from <n->m_next, 0> into <o, hlen> */
+ m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len);
+ o->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = o;
+ off = 0;
+
+ok:
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("after:");
+ for (t = m; t; t = t->m_next)
+ printf("%c%d", t == n ? '*' : ' ', t->m_len);
+ printf(" (off=%d)\n", off);
+ }
+#endif
+ if (offp)
+ *offp = off;
+ return n;
+}
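+
+/*
+ * Illustrative usage sketch (hypothetical caller; "struct example_opt" and
+ * "off" are made-up names).  The KAME-style pattern for making a header at
+ * an arbitrary offset contiguous before looking at it:
+ */
+#if 0
+	struct example_opt *p;
+	struct mbuf *n;
+	int noff;
+
+	n = m_pulldown(m, off, sizeof(*p), &noff);
+	if (n == NULL)
+		return;			/* the whole chain was freed */
+	p = (struct example_opt *)(mtod(n, caddr_t) + noff);
+#endif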
+
+static struct mbuf *
+m_dup1(struct mbuf *m, int off, int len, int wait)
+{
+ struct mbuf *n;
+ int l;
+ int copyhdr;
+
+ if (len > MCLBYTES)
+ return NULL;
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
+ copyhdr = 1;
+ MGETHDR(n, wait, m->m_type);
+ l = MHLEN;
+ } else {
+ copyhdr = 0;
+ MGET(n, wait, m->m_type);
+ l = MLEN;
+ }
+ if (n && len > l) {
+ MCLGET(n, wait);
+ if ((n->m_flags & M_EXT) == 0) {
+ m_free(n);
+ n = NULL;
+ }
+ }
+ if (!n)
+ return NULL;
+
+ if (copyhdr)
+ M_COPY_PKTHDR(n, m);
+ m_copydata(m, off, len, mtod(n, caddr_t));
+ return n;
+}
+
+/*
+ * pkthdr.aux chain manipulation.
+ * we don't allow clusters at this moment.
+ */
+struct mbuf *
+m_aux_add2(struct mbuf *m, int af, int type, void *p)
+{
+ struct mbuf *n;
+ struct mauxtag *t;
+
+ if ((m->m_flags & M_PKTHDR) == 0)
+ return NULL;
+
+ n = m_aux_find(m, af, type);
+ if (n)
+ return n;
+
+ MGET(n, M_DONTWAIT, m->m_type);
+ if (n == NULL)
+ return NULL;
+
+ t = mtod(n, struct mauxtag *);
+ bzero(t, sizeof(*t));
+ t->af = af;
+ t->type = type;
+ t->p = p;
+ n->m_data += sizeof(struct mauxtag);
+ n->m_len = 0;
+ n->m_next = m->m_pkthdr.aux;
+ m->m_pkthdr.aux = n;
+ return n;
+}
+
+struct mbuf *
+m_aux_find2(struct mbuf *m, int af, int type, void *p)
+{
+ struct mbuf *n;
+ struct mauxtag *t;
+
+ if ((m->m_flags & M_PKTHDR) == 0)
+ return NULL;
+
+ for (n = m->m_pkthdr.aux; n; n = n->m_next) {
+ t = (struct mauxtag *)n->m_dat;
+ if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) {
+ printf("m_aux_find: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data);
+ continue;
+ }
+ if (t->af == af && t->type == type && t->p == p)
+ return n;
+ }
+ return NULL;
+}
+
+struct mbuf *
+m_aux_find(struct mbuf *m, int af, int type)
+{
+
+ return m_aux_find2(m, af, type, NULL);
+}
+
+struct mbuf *
+m_aux_add(struct mbuf *m, int af, int type)
+{
+
+ return m_aux_add2(m, af, type, NULL);
+}
+
+void
+m_aux_delete(struct mbuf *m, struct mbuf *victim)
+{
+ struct mbuf *n, *prev, *next;
+ struct mauxtag *t;
+
+ if ((m->m_flags & M_PKTHDR) == 0)
+ return;
+
+ prev = NULL;
+ n = m->m_pkthdr.aux;
+ while (n) {
+ t = (struct mauxtag *)n->m_dat;
+ next = n->m_next;
+ if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) {
+ printf("m_aux_delete: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data);
+ prev = n;
+ n = next;
+ continue;
+ }
+ if (n == victim) {
+ if (prev)
+ prev->m_next = n->m_next;
+ else
+ m->m_pkthdr.aux = n->m_next;
+ n->m_next = NULL;
+ m_free(n);
+ return;
+ } else
+ prev = n;
+ n = next;
+ }
+}
diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c
new file mode 100644
index 0000000..74dab78
--- /dev/null
+++ b/sys/kern/uipc_proto.c
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/un.h>
+
+#include <net/raw_cb.h>
+
+/*
+ * Definitions of protocols supported in the LOCAL domain.
+ */
+
+static struct protosw localsw[] = {
+{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+ 0, 0, 0, &uipc_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ &uipc_usrreqs
+},
+{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+ 0, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &uipc_usrreqs
+},
+{ 0, 0, 0, 0,
+ 0, 0, raw_ctlinput, 0,
+ 0,
+ raw_init, 0, 0, 0,
+ &raw_usrreqs
+}
+};
+
+struct domain localdomain =
+ { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose,
+ localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] };
+DOMAIN_SET(local);
+
+SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
+SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
new file mode 100644
index 0000000..1e68f83
--- /dev/null
+++ b/sys/kern/uipc_sockbuf.c
@@ -0,0 +1,983 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/domain.h>
+#include <sys/event.h>
+#include <sys/file.h> /* for maxfiles */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+int maxsockets;
+
+void (*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on sockets and socket buffers
+ */
+
+u_long sb_max = SB_MAX; /* XXX should be static */
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+/*
+ * Procedures to manipulate state flags of socket
+ * and do appropriate wakeups. Normal sequence from the
+ * active (originating) side is that soisconnecting() is
+ * called during processing of connect() call,
+ * resulting in an eventual call to soisconnected() if/when the
+ * connection is established. When the connection is torn down
+ * soisdisconnecting() is called during processing of disconnect() call,
+ * and soisdisconnected() is called when the connection to the peer
+ * is totally severed. The semantics of these routines are such that
+ * connectionless protocols can call soisconnected() and soisdisconnected()
+ * only, bypassing the in-progress calls when setting up a ``connection''
+ * takes no time.
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_incomp for connections in progress
+ * and so_comp for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_incomp by calling sonewconn(). When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_comp, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_incomp or so_comp, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
+
+void
+soisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+}
+
+void
+soisconnected(so)
+ struct socket *so;
+{
+ struct socket *head = so->so_head;
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ if (head && (so->so_state & SS_INCOMP)) {
+ if ((so->so_options & SO_ACCEPTFILTER) != 0) {
+ so->so_upcall = head->so_accf->so_accept_filter->accf_callback;
+ so->so_upcallarg = head->so_accf->so_accept_filter_arg;
+ so->so_rcv.sb_flags |= SB_UPCALL;
+ so->so_options &= ~SO_ACCEPTFILTER;
+ so->so_upcall(so, so->so_upcallarg, 0);
+ return;
+ }
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ so->so_state |= SS_COMP;
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+ }
+}
+
+void
+soisdisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup(&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+void
+soisdisconnected(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
+ wakeup(&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called. If the
+ * connection is possible (subject to space constraints, etc.),
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
+ *
+ * note: the ref count on the socket is 0 on return
+ */
+struct socket *
+sonewconn(head, connstatus)
+ register struct socket *head;
+ int connstatus;
+{
+ register struct socket *so;
+
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
+ return ((struct socket *)0);
+ so = soalloc(0);
+ if (so == NULL)
+ return ((struct socket *)0);
+ if ((head->so_options & SO_ACCEPTFILTER) != 0)
+ connstatus = 0;
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_timeo = head->so_timeo;
+ so->so_cred = crhold(head->so_cred);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
+ (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sotryfree(so);
+ return ((struct socket *)0);
+ }
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ head->so_qlen++;
+ } else {
+ if (head->so_incqlen > head->so_qlimit) {
+ struct socket *sp;
+ sp = TAILQ_FIRST(&head->so_incomp);
+ (void) soabort(sp);
+ }
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup(&head->so_timeo);
+ so->so_state |= connstatus;
+ }
+ return (so);
+}
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it would normally be applied to a socket when the user
+ * informs the system that no more data is to be sent, by the protocol
+ * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+void
+socantsendmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTSENDMORE;
+ sowwakeup(so);
+}
+
+void
+socantrcvmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTRCVMORE;
+ sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(sb)
+ struct sockbuf *sb;
+{
+
+ sb->sb_flags |= SB_WAIT;
+ return (tsleep(&sb->sb_cc,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+/*
+ * Lock a sockbuf already known to be locked;
+ * return any error returned from sleep (EINTR).
+ */
+int
+sb_lock(sb)
+ register struct sockbuf *sb;
+{
+ int error;
+
+ while (sb->sb_flags & SB_LOCK) {
+ sb->sb_flags |= SB_WANT;
+ error = tsleep(&sb->sb_flags,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
+ "sblock", 0);
+ if (error)
+ return (error);
+ }
+ sb->sb_flags |= SB_LOCK;
+ return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+void
+sowakeup(so, sb)
+ register struct socket *so;
+ register struct sockbuf *sb;
+{
+
+ selwakeup(&sb->sb_sel);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup(&sb->sb_cc);
+ }
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGIO, 0);
+ if (sb->sb_flags & SB_UPCALL)
+ (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+ if (sb->sb_flags & SB_AIO)
+ aio_swake(so, sb);
+ KNOTE(&sb->sb_sel.si_note, 0);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data. Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field. Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ * name, then a record containing that name must be present before
+ * any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ * just additional data associated with the message), and there are
+ * ``rights'' to be received, then a record containing this data
+ * should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ * a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
+
+int
+soreserve(so, sndcc, rcvcc)
+ register struct socket *so;
+ u_long sndcc, rcvcc;
+{
+ struct thread *td = curthread;
+
+ if (sbreserve(&so->so_snd, sndcc, so, td) == 0)
+ goto bad;
+ if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ return (0);
+bad2:
+ sbrelease(&so->so_snd, so);
+bad:
+ return (ENOBUFS);
+}
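+
+/*
+ * Illustrative usage sketch (hypothetical caller; "sendspace" and
+ * "recvspace" stand for per-protocol constants).  A protocol's attach
+ * routine typically reserves socket buffer space like this:
+ */
+#if 0
+	error = soreserve(so, sendspace, recvspace);
+	if (error)
+		return (error);
+#endif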
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ */
+int
+sbreserve(sb, cc, so, td)
+ struct sockbuf *sb;
+ u_long cc;
+ struct socket *so;
+ struct thread *td;
+{
+
+ /*
+ * td will only be NULL when we're in an interrupt
+ * (e.g. in tcp_input())
+ */
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ return (0);
+ if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+ td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) {
+ return (0);
+ }
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease(sb, so)
+ struct sockbuf *sb;
+ struct socket *so;
+{
+
+ sbflush(sb);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
+ sb->sb_mbmax = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added. sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used. To place
+ * access rights and data in a socket receive buffer, sbappendrights()
+ * should be used. In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement. Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and is then removed from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb. The additional space associated with
+ * the mbuf chain is recorded in sb. Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(sb, m)
+ struct sockbuf *sb;
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == 0)
+ return;
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ }
+ sbcompress(sb, m, n);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+ register struct mbuf *n = 0;
+ register u_long len = 0, mbcnt = 0;
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+void
+sbappendrecord(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+void
+sbinsertoob(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+ register struct mbuf **mp;
+
+ if (m0 == 0)
+ return;
+ for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
+ again:
+ switch (m->m_type) {
+
+ case MT_OOBDATA:
+ continue; /* WANT next train */
+
+ case MT_CONTROL:
+ m = m->m_next;
+ if (m)
+ goto again; /* inspect THIS train further */
+ }
+ break;
+ }
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ m0->m_nextpkt = *mp;
+ *mp = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket. If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+int
+sbappendaddr(sb, asa, m0, control)
+ register struct sockbuf *sb;
+ struct sockaddr *asa;
+ struct mbuf *m0, *control;
+{
+ register struct mbuf *m, *n;
+ int space = asa->sa_len;
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ for (n = control; n; n = n->m_next) {
+ space += n->m_len;
+ if (n->m_next == 0) /* keep pointer to last control buf */
+ break;
+ }
+ if (space > sbspace(sb))
+ return (0);
+ if (asa->sa_len > MLEN)
+ return (0);
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n; n = n->m_next)
+ sballoc(sb, n);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = m;
+ } else
+ sb->sb_mb = m;
+ return (1);
+}
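+
+/*
+ * Illustrative usage sketch (hypothetical caller; "from" stands for a
+ * sockaddr describing the sender).  A datagram protocol delivering "m"
+ * plus optional control data to a receiving socket:
+ */
+#if 0
+	if (sbappendaddr(&so->so_rcv, from, m, control) == 0) {
+		m_freem(m);		/* no room; nothing was consumed */
+		if (control != NULL)
+			m_freem(control);
+	} else
+		sorwakeup(so);
+#endif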
+
+int
+sbappendcontrol(sb, m0, control)
+ struct sockbuf *sb;
+ struct mbuf *control, *m0;
+{
+ register struct mbuf *m, *n;
+ int space = 0;
+
+ if (control == 0)
+ panic("sbappendcontrol");
+ for (m = control; ; m = m->m_next) {
+ space += m->m_len;
+ if (m->m_next == 0)
+ break;
+ }
+ n = m; /* save pointer to last control buffer */
+ for (m = m0; m; m = m->m_next)
+ space += m->m_len;
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+ for (m = control; m; m = m->m_next)
+ sballoc(sb, m);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = control;
+ } else
+ sb->sb_mb = control;
+ return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n. If n
+ * is null, the buffer is presumed empty.
+ */
+void
+sbcompress(sb, m, n)
+ register struct sockbuf *sb;
+ register struct mbuf *m, *n;
+{
+ register int eor = 0;
+ register struct mbuf *o;
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(n) &&
+ m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ m->m_len <= M_TRAILINGSPACE(n) &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ if (n)
+ n->m_flags |= eor;
+ else
+ printf("semi-panic: sbcompress\n");
+ }
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+void
+sbflush(sb)
+ register struct sockbuf *sb;
+{
+
+ if (sb->sb_flags & SB_LOCK)
+ panic("sbflush: locked");
+ while (sb->sb_mbcnt) {
+ /*
+ * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+ * we would loop forever. Panic instead.
+ */
+ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+ break;
+ sbdrop(sb, (int)sb->sb_cc);
+ }
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop(sb, len)
+ register struct sockbuf *sb;
+ register int len;
+{
+ register struct mbuf *m;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+void
+sbdroprecord(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ m = m_free(m);
+ } while (m);
+ }
+}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if (CMSG_SPACE((u_int)size) > MCLBYTES)
+ return ((struct mbuf *) NULL);
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ if (CMSG_SPACE((u_int)size) > MLEN) {
+ MCLGET(m, M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ return ((struct mbuf *) NULL);
+ }
+ }
+ cp = mtod(m, struct cmsghdr *);
+ m->m_len = 0;
+ KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+ ("sbcreatecontrol: short mbuf"));
+ if (p != NULL)
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ m->m_len = CMSG_SPACE(size);
+ cp->cmsg_len = CMSG_LEN(size);
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
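+
+/*
+ * Illustrative usage sketch (hypothetical caller, assuming the standard
+ * SCM_TIMESTAMP/SOL_SOCKET definitions).  Building a timestamp control
+ * mbuf, e.g. to hand to sbappendaddr() above:
+ */
+#if 0
+	struct timeval tv;
+	struct mbuf *control;
+
+	microtime(&tv);
+	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
+	    SCM_TIMESTAMP, SOL_SOCKET);
+	if (control == NULL)
+		return (0);		/* deliver the data without it */
+#endif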
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+dup_sockaddr(sa, canwait)
+ struct sockaddr *sa;
+ int canwait;
+{
+ struct sockaddr *sa2;
+
+ MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
+ canwait ? M_WAITOK : M_NOWAIT);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information
+ * in the kernel-format socket structure pointed to by so. This is done
+ * to reduce the spew of irrelevant information over this interface,
+ * to isolate user code from changes in the kernel structure, and
+ * potentially to provide information-hiding if we decide that
+ * some of this information should be hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_cred->cr_uid;
+}
+
+/*
+ * This does the same for sockbufs. Note that the xsockbuf structure,
+ * since it is always embedded in a socket, does not include a self
+ * pointer nor a length. We make this entry point public in case
+ * some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
+ &sb_max, 0, "Maximum socket buffer size");
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
+ &maxsockets, 0, "Maximum number of sockets available");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
+
+/*
+ * Initialise maxsockets
+ */
+static void init_maxsockets(void *ignored)
+{
+ TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+ maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
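+
+/*
+ * The initial value may also be preset from the loader, e.g. with a line
+ * such as kern.ipc.maxsockets="16384" in /boot/loader.conf (the number
+ * shown is only an example).
+ */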
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
new file mode 100644
index 0000000..d596294
--- /dev/null
+++ b/sys/kern/uipc_socket.c
@@ -0,0 +1,1792 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_zero.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/domain.h>
+#include <sys/file.h> /* for struct knote */
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/event.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/jail.h>
+
+#include <vm/uma.h>
+
+#include <machine/limits.h>
+
+#ifdef INET
+static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
+#endif
+
+static void filt_sordetach(struct knote *kn);
+static int filt_soread(struct knote *kn, long hint);
+static void filt_sowdetach(struct knote *kn);
+static int filt_sowrite(struct knote *kn, long hint);
+static int filt_solisten(struct knote *kn, long hint);
+
+static struct filterops solisten_filtops =
+ { 1, NULL, filt_sordetach, filt_solisten };
+static struct filterops soread_filtops =
+ { 1, NULL, filt_sordetach, filt_soread };
+static struct filterops sowrite_filtops =
+ { 1, NULL, filt_sowdetach, filt_sowrite };
+
+uma_zone_t socket_zone;
+so_gen_t so_gencnt; /* generation count for sockets */
+
+MALLOC_DEFINE(M_SONAME, "soname", "socket name");
+MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
+
+SYSCTL_DECL(_kern_ipc);
+
+static int somaxconn = SOMAXCONN;
+SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
+ &somaxconn, 0, "Maximum pending socket connection queue size");
+static int numopensockets;
+SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
+ &numopensockets, 0, "Number of open sockets");
+#ifdef ZERO_COPY_SOCKETS
+/* These aren't static because they're used in other files. */
+int so_zero_copy_send = 1;
+int so_zero_copy_receive = 1;
+SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
+ "Zero copy controls");
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
+ &so_zero_copy_receive, 0, "Enable zero copy receive");
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
+ &so_zero_copy_send, 0, "Enable zero copy send");
+#endif /* ZERO_COPY_SOCKETS */
+
+
+/*
+ * Socket operation routines.
+ * These routines are called by the routines in
+ * sys_socket.c or from a system process, and
+ * implement the semantics of socket operations by
+ * switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it.
+ * Note that it would probably be better to allocate socket
+ * and PCB at the same time, but I'm not convinced that all
+ * the protocols can be easily modified to do this.
+ *
+ * soalloc() returns a socket with a ref count of 0.
+ */
+struct socket *
+soalloc(waitok)
+ int waitok;
+{
+ struct socket *so;
+ int flag;
+
+ if (waitok == 1)
+ flag = M_WAITOK;
+ else
+ flag = M_NOWAIT;
+ flag |= M_ZERO;
+ so = uma_zalloc(socket_zone, flag);
+ if (so) {
+ /* XXX race condition for reentrant kernel */
+ so->so_gencnt = ++so_gencnt;
+ /* sx_init(&so->so_sxlock, "socket sxlock"); */
+ TAILQ_INIT(&so->so_aiojobq);
+ ++numopensockets;
+ }
+ return so;
+}
+
+/*
+ * socreate returns a socket with a ref count of 1. The socket should be
+ * closed with soclose().
+ */
+int
+socreate(dom, aso, type, proto, cred, td)
+ int dom;
+ struct socket **aso;
+ register int type;
+ int proto;
+ struct ucred *cred;
+ struct thread *td;
+{
+ register struct protosw *prp;
+ register struct socket *so;
+ register int error;
+
+ if (proto)
+ prp = pffindproto(dom, proto, type);
+ else
+ prp = pffindtype(dom, type);
+
+ if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
+ return (EPROTONOSUPPORT);
+
+ if (jailed(td->td_ucred) && jail_socket_unixiproute_only &&
+ prp->pr_domain->dom_family != PF_LOCAL &&
+ prp->pr_domain->dom_family != PF_INET &&
+ prp->pr_domain->dom_family != PF_ROUTE) {
+ return (EPROTONOSUPPORT);
+ }
+
+ if (prp->pr_type != type)
+ return (EPROTOTYPE);
+ so = soalloc(M_NOWAIT);
+ if (so == NULL)
+ return (ENOBUFS);
+
+ TAILQ_INIT(&so->so_incomp);
+ TAILQ_INIT(&so->so_comp);
+ so->so_type = type;
+ so->so_cred = crhold(cred);
+ so->so_proto = prp;
+ soref(so);
+ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
+ if (error) {
+ so->so_state |= SS_NOFDREF;
+ sorele(so);
+ return (error);
+ }
+ *aso = so;
+ return (0);
+}
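+
+/*
+ * Example: a minimal sketch of how in-kernel code might obtain and later
+ * release a socket via socreate()/soclose().  The wrapper name and the
+ * choice of a UDP socket are illustrative assumptions, not part of this
+ * interface.
+ *
+ *	static int
+ *	example_make_udp_socket(struct thread *td, struct socket **sop)
+ *	{
+ *		int error;
+ *
+ *		error = socreate(AF_INET, sop, SOCK_DGRAM, IPPROTO_UDP,
+ *		    td->td_ucred, td);
+ *		if (error)
+ *			return (error);
+ *		return (0);
+ *	}
+ *
+ * The reference obtained here is dropped with soclose(*sop) when the
+ * caller is finished with the socket.
+ */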
+
+int
+sobind(so, nam, td)
+ struct socket *so;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ int s = splnet();
+ int error;
+
+ error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
+ splx(s);
+ return (error);
+}
+
+static void
+sodealloc(struct socket *so)
+{
+
+ KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
+ so->so_gencnt = ++so_gencnt;
+ if (so->so_rcv.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+ if (so->so_snd.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+#ifdef INET
+ if (so->so_accf != NULL) {
+ if (so->so_accf->so_accept_filter != NULL &&
+ so->so_accf->so_accept_filter->accf_destroy != NULL) {
+ so->so_accf->so_accept_filter->accf_destroy(so);
+ }
+ if (so->so_accf->so_accept_filter_str != NULL)
+ FREE(so->so_accf->so_accept_filter_str, M_ACCF);
+ FREE(so->so_accf, M_ACCF);
+ }
+#endif
+ crfree(so->so_cred);
+ /* sx_destroy(&so->so_sxlock); */
+ uma_zfree(socket_zone, so);
+ --numopensockets;
+}
+
+int
+solisten(so, backlog, td)
+ register struct socket *so;
+ int backlog;
+ struct thread *td;
+{
+ int s, error;
+
+ s = splnet();
+ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ if (TAILQ_EMPTY(&so->so_comp))
+ so->so_options |= SO_ACCEPTCONN;
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
+ splx(s);
+ return (0);
+}
+
+void
+sofree(so)
+ register struct socket *so;
+{
+ struct socket *head = so->so_head;
+
+ KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
+
+ if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
+ return;
+ if (head != NULL) {
+ if (so->so_state & SS_INCOMP) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ } else if (so->so_state & SS_COMP) {
+ /*
+ * We must not decommission a socket that's
+ * on the accept(2) queue. If we do, then
+ * accept(2) may hang after select(2) indicated
+ * that the listening socket was ready.
+ */
+ return;
+ } else {
+ panic("sofree: not queued");
+ }
+ so->so_state &= ~SS_INCOMP;
+ so->so_head = NULL;
+ }
+ sbrelease(&so->so_snd, so);
+ sorflush(so);
+ sodealloc(so);
+}
+
+/*
+ * Close a socket on last file table reference removal.
+ * Initiate disconnect if connected.
+ * Free socket when disconnect complete.
+ *
+ * This function will sorele() the socket. Note that soclose() may be
+ * called prior to the ref count reaching zero. The actual socket
+ * structure will not be freed until the ref count reaches zero.
+ */
+int
+soclose(so)
+ register struct socket *so;
+{
+ int s = splnet(); /* conservative */
+ int error = 0;
+
+ funsetown(&so->so_sigio);
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct socket *sp, *sonext;
+
+ sp = TAILQ_FIRST(&so->so_incomp);
+ for (; sp != NULL; sp = sonext) {
+ sonext = TAILQ_NEXT(sp, so_list);
+ (void) soabort(sp);
+ }
+ for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
+ sonext = TAILQ_NEXT(sp, so_list);
+ /* Dequeue from so_comp since sofree() won't do it */
+ TAILQ_REMOVE(&so->so_comp, sp, so_list);
+ so->so_qlen--;
+ sp->so_state &= ~SS_COMP;
+ sp->so_head = NULL;
+ (void) soabort(sp);
+ }
+ }
+ if (so->so_pcb == 0)
+ goto discard;
+ if (so->so_state & SS_ISCONNECTED) {
+ if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+ error = sodisconnect(so);
+ if (error)
+ goto drop;
+ }
+ if (so->so_options & SO_LINGER) {
+ if ((so->so_state & SS_ISDISCONNECTING) &&
+ (so->so_state & SS_NBIO))
+ goto drop;
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep(&so->so_timeo,
+ PSOCK | PCATCH, "soclos", so->so_linger * hz);
+ if (error)
+ break;
+ }
+ }
+ }
+drop:
+ if (so->so_pcb) {
+ int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
+ if (error == 0)
+ error = error2;
+ }
+discard:
+ if (so->so_state & SS_NOFDREF)
+ panic("soclose: NOFDREF");
+ so->so_state |= SS_NOFDREF;
+ sorele(so);
+ splx(s);
+ return (error);
+}
+
+/*
+ * Must be called at splnet...
+ */
+int
+soabort(so)
+ struct socket *so;
+{
+ int error;
+
+ error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
+ if (error) {
+ sotryfree(so); /* note: does not decrement the ref count */
+ return error;
+ }
+ return (0);
+}
+
+int
+soaccept(so, nam)
+ register struct socket *so;
+ struct sockaddr **nam;
+{
+ int s = splnet();
+ int error;
+
+ if ((so->so_state & SS_NOFDREF) == 0)
+ panic("soaccept: !NOFDREF");
+ so->so_state &= ~SS_NOFDREF;
+ error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
+ splx(s);
+ return (error);
+}
+
+int
+soconnect(so, nam, td)
+ register struct socket *so;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ int s;
+ int error;
+
+ if (so->so_options & SO_ACCEPTCONN)
+ return (EOPNOTSUPP);
+ s = splnet();
+ /*
+ * If protocol is connection-based, can only connect once.
+ * Otherwise, if connected, try to disconnect first.
+ * This allows user to disconnect by connecting to, e.g.,
+ * a null address.
+ */
+ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
+ ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
+ (error = sodisconnect(so))))
+ error = EISCONN;
+ else
+ error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
+ splx(s);
+ return (error);
+}
+
+int
+soconnect2(so1, so2)
+ register struct socket *so1;
+ struct socket *so2;
+{
+ int s = splnet();
+ int error;
+
+ error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
+ splx(s);
+ return (error);
+}
+
+int
+sodisconnect(so)
+ register struct socket *so;
+{
+ int s = splnet();
+ int error;
+
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto bad;
+ }
+ if (so->so_state & SS_ISDISCONNECTING) {
+ error = EALREADY;
+ goto bad;
+ }
+ error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
+bad:
+ splx(s);
+ return (error);
+}
+
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+/*
+ * Send on a socket.
+ * If send must go all at once and message is larger than
+ * send buffering, then hard error.
+ * Lock against other senders.
+ * If must go all at once and not enough room now, then
+ * inform user that this would block and do nothing.
+ * Otherwise, if nonblocking, send as much as possible.
+ * The data to be sent is described by "uio" if nonzero,
+ * otherwise by the mbuf chain "top" (which must be null
+ * if uio is not). Data provided in mbuf chain must be small
+ * enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers
+ * must check for short counts if EINTR/ERESTART are returned.
+ * Data and control buffers are freed on return.
+ */
+
+#ifdef ZERO_COPY_SOCKETS
+struct so_zerocopy_stats{
+ int size_ok;
+ int align_ok;
+ int found_ifp;
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+#include <netinet/in.h>
+#include <net/route.h>
+#include <netinet/in_pcb.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#endif /*ZERO_COPY_SOCKETS*/
+
+int
+sosend(so, addr, uio, top, control, flags, td)
+ register struct socket *so;
+ struct sockaddr *addr;
+ struct uio *uio;
+ struct mbuf *top;
+ struct mbuf *control;
+ int flags;
+ struct thread *td;
+{
+ struct mbuf **mp;
+ register struct mbuf *m;
+ register long space, len, resid;
+ int clen = 0, error, s, dontroute, mlen;
+ int atomic = sosendallatonce(so) || top;
+#ifdef ZERO_COPY_SOCKETS
+ int cow_send;
+#endif /* ZERO_COPY_SOCKETS */
+
+ if (uio)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned.
+ * However, space must be signed, as it might be less than 0
+ * if we over-committed, and we must use a signed comparison
+ * of space and resid. On the other hand, a negative resid
+ * causes us to loop sending 0-length segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
+ (so->so_proto->pr_flags & PR_ATOMIC);
+ if (td)
+ td->td_proc->p_stats->p_ru.ru_msgsnd++;
+ if (control)
+ clen = control->m_len;
+#define snderr(errno) { error = errno; splx(s); goto release; }
+
+restart:
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+ do {
+ s = splnet();
+ if (so->so_state & SS_CANTSENDMORE)
+ snderr(EPIPE);
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ splx(s);
+ goto release;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-
+ * based socket if it supports implied connect.
+ * Return ENOTCONN if not connected and no address is
+ * supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0))
+ snderr(ENOTCONN);
+ } else if (addr == 0)
+ snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
+ ENOTCONN : EDESTADDRREQ);
+ }
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ if ((atomic && resid > so->so_snd.sb_hiwat) ||
+ clen > so->so_snd.sb_hiwat)
+ snderr(EMSGSIZE);
+ if (space < resid + clen &&
+ (atomic || space < so->so_snd.sb_lowat || space < clen)) {
+ if (so->so_state & SS_NBIO)
+ snderr(EWOULDBLOCK);
+ sbunlock(&so->so_snd);
+ error = sbwait(&so->so_snd);
+ splx(s);
+ if (error)
+ goto out;
+ goto restart;
+ }
+ splx(s);
+ mp = &top;
+ space -= clen;
+ do {
+ if (uio == NULL) {
+ /*
+ * Data is prepackaged in "top".
+ */
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else do {
+#ifdef ZERO_COPY_SOCKETS
+ cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+ if (top == 0) {
+ MGETHDR(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ mlen = MHLEN;
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else {
+ MGET(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ mlen = MLEN;
+ }
+ if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+ if (so_zero_copy_send &&
+ resid>=PAGE_SIZE &&
+ space>=PAGE_SIZE &&
+ uio->uio_iov->iov_len>=PAGE_SIZE) {
+ so_zerocp_stats.size_ok++;
+ if (!((vm_offset_t)
+ uio->uio_iov->iov_base & PAGE_MASK)){
+ so_zerocp_stats.align_ok++;
+ cow_send = socow_setup(m, uio);
+ }
+ }
+ if (!cow_send){
+#endif /* ZERO_COPY_SOCKETS */
+ MCLGET(m, M_TRYWAIT);
+ if ((m->m_flags & M_EXT) == 0)
+ goto nopages;
+ mlen = MCLBYTES;
+ len = min(min(mlen, resid), space);
+ } else {
+#ifdef ZERO_COPY_SOCKETS
+ len = PAGE_SIZE;
+ }
+
+ } else {
+#endif /* ZERO_COPY_SOCKETS */
+nopages:
+ len = min(min(mlen, resid), space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && top == 0 && len < mlen)
+ MH_ALIGN(m, len);
+ }
+ space -= len;
+#ifdef ZERO_COPY_SOCKETS
+ if (cow_send)
+ error = 0;
+ else
+#endif /* ZERO_COPY_SOCKETS */
+ error = uiomove(mtod(m, caddr_t), (int)len, uio);
+ resid = uio->uio_resid;
+ m->m_len = len;
+ *mp = m;
+ top->m_pkthdr.len += len;
+ if (error)
+ goto release;
+ mp = &m->m_next;
+ if (resid <= 0) {
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ break;
+ }
+ } while (space > 0 && atomic);
+ if (dontroute)
+ so->so_options |= SO_DONTROUTE;
+ s = splnet(); /* XXX */
+ /*
+ * XXX all the SS_CANTSENDMORE checks previously
+ * done could be out of date. We could have received
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the splnet() protection
+ * here, but there are probably other places that this
+ * also happens. We must rethink this.
+ */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol
+ * understands this flag and nothing left to
+ * send then use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ splx(s);
+ if (dontroute)
+ so->so_options &= ~SO_DONTROUTE;
+ clen = 0;
+ control = 0;
+ top = 0;
+ mp = &top;
+ if (error)
+ goto release;
+ } while (resid && space > 0);
+ } while (resid);
+
+release:
+ sbunlock(&so->so_snd);
+out:
+ if (top)
+ m_freem(top);
+ if (control)
+ m_freem(control);
+ return (error);
+}
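+
+/*
+ * Example: a rough sketch of sending a prebuilt mbuf chain on a connected
+ * socket with sosend().  Passing a null uio means the data is described
+ * by the "top" chain, as noted above; the helper name is an assumption.
+ *
+ *	static int
+ *	example_send_chain(struct socket *so, struct mbuf *top,
+ *	    struct thread *td)
+ *	{
+ *
+ *		return (sosend(so, (struct sockaddr *)NULL,
+ *		    (struct uio *)NULL, top, (struct mbuf *)NULL, 0, td));
+ *	}
+ *
+ * The chain is consumed in either case (freed on error), so the caller
+ * must not reference "top" after the call.
+ */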
+
+/*
+ * Implement receive operations on a socket.
+ * We depend on the way that records are added to the sockbuf
+ * by sbappend*. In particular, each record (mbufs linked through m_next)
+ * must begin with an address if the protocol so specifies,
+ * followed by an optional mbuf or mbufs containing ancillary data,
+ * and then zero or more mbufs of data.
+ * In order to avoid blocking network interrupts for the entire time here,
+ * we splx() while doing the actual copy to user space.
+ * Although the sockbuf is locked, new data may still be appended,
+ * and thus we must maintain consistency of the sockbuf during that time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying
+ * an mbuf **mp0 for use in returning the chain. The uio is then used
+ * only for the count in uio_resid.
+ */
+int
+soreceive(so, psa, uio, mp0, controlp, flagsp)
+ register struct socket *so;
+ struct sockaddr **psa;
+ struct uio *uio;
+ struct mbuf **mp0;
+ struct mbuf **controlp;
+ int *flagsp;
+{
+ struct mbuf *m, **mp;
+ register int flags, len, error, s, offset;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+ int moff, type = 0;
+ int orig_resid = uio->uio_resid;
+
+ mp = mp0;
+ if (psa)
+ *psa = 0;
+ if (controlp)
+ *controlp = 0;
+ if (flagsp)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB) {
+ m = m_get(M_TRYWAIT, MT_DATA);
+ if (m == NULL)
+ return (ENOBUFS);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+ if (error)
+ goto bad;
+ do {
+#ifdef ZERO_COPY_SOCKETS
+ if (so_zero_copy_receive) {
+ vm_page_t pg;
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
+ if (uio->uio_offset == -1)
+ uio->uio_offset =IDX_TO_OFF(pg->pindex);
+
+ error = uiomoveco(mtod(m, caddr_t),
+ min(uio->uio_resid, m->m_len),
+ uio, pg->object,
+ disposable);
+ } else
+#endif /* ZERO_COPY_SOCKETS */
+ error = uiomove(mtod(m, caddr_t),
+ (int) min(uio->uio_resid, m->m_len), uio);
+ m = m_free(m);
+ } while (uio->uio_resid && error == 0 && m);
+bad:
+ if (m)
+ m_freem(m);
+ return (error);
+ }
+ if (mp)
+ *mp = (struct mbuf *)0;
+ if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
+ (*pr->pr_usrreqs->pru_rcvd)(so, 0);
+
+restart:
+ error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+ s = splnet();
+
+ m = so->so_rcv.sb_mb;
+ /*
+ * If we have less data than requested, block awaiting more
+ * (subject to any timeout) if:
+ * 1. the current count is less than the low water mark, or
+ * 2. MSG_WAITALL is set, and it is possible to do the entire
+ * receive operation at once if we block (resid <= hiwat).
+ * 3. MSG_DONTWAIT is not set
+ * If MSG_WAITALL is set but resid is larger than the receive buffer,
+ * we have to do the receive in sections, and thus risk returning
+ * a short count if a timeout or signal occurs after we start.
+ */
+ if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
+ so->so_rcv.sb_cc < uio->uio_resid) &&
+ (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
+ ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
+ m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
+ KASSERT(m != 0 || !so->so_rcv.sb_cc,
+ ("receive: m == %p so->so_rcv.sb_cc == %lu",
+ m, so->so_rcv.sb_cc));
+ if (so->so_error) {
+ if (m)
+ goto dontblock;
+ error = so->so_error;
+ if ((flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ goto release;
+ }
+ if (so->so_state & SS_CANTRCVMORE) {
+ if (m)
+ goto dontblock;
+ else
+ goto release;
+ }
+ for (; m; m = m->m_next)
+ if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
+ m = so->so_rcv.sb_mb;
+ goto dontblock;
+ }
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ error = ENOTCONN;
+ goto release;
+ }
+ if (uio->uio_resid == 0)
+ goto release;
+ if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ sbunlock(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ splx(s);
+ if (error)
+ return (error);
+ goto restart;
+ }
+dontblock:
+ if (uio->uio_td)
+ uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
+ nextrecord = m->m_nextpkt;
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ orig_resid = 0;
+ if (psa)
+ *psa = dup_sockaddr(mtod(m, struct sockaddr *),
+ mp0 == 0);
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+ }
+ while (m && m->m_type == MT_CONTROL && error == 0) {
+ if (flags & MSG_PEEK) {
+ if (controlp)
+ *controlp = m_copy(m, 0, m->m_len);
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m->m_next;
+ m->m_next = NULL;
+ if (pr->pr_domain->dom_externalize)
+ error =
+ (*pr->pr_domain->dom_externalize)(m, controlp);
+ else if (controlp)
+ *controlp = m;
+ else
+ m_freem(m);
+ m = so->so_rcv.sb_mb;
+ }
+ if (controlp) {
+ orig_resid = 0;
+ do
+ controlp = &(*controlp)->m_next;
+ while (*controlp != NULL);
+ }
+ }
+ if (m) {
+ if ((flags & MSG_PEEK) == 0)
+ m->m_nextpkt = nextrecord;
+ type = m->m_type;
+ if (type == MT_OOBDATA)
+ flags |= MSG_OOB;
+ }
+ moff = 0;
+ offset = 0;
+ while (m && uio->uio_resid > 0 && error == 0) {
+ if (m->m_type == MT_OOBDATA) {
+ if (type != MT_OOBDATA)
+ break;
+ } else if (type == MT_OOBDATA)
+ break;
+ else
+ KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
+ ("m->m_type == %d", m->m_type));
+ so->so_state &= ~SS_RCVATMARK;
+ len = uio->uio_resid;
+ if (so->so_oobmark && len > so->so_oobmark - offset)
+ len = so->so_oobmark - offset;
+ if (len > m->m_len - moff)
+ len = m->m_len - moff;
+ /*
+ * If mp is set, just pass back the mbufs.
+ * Otherwise copy them out via the uio, then free.
+ * Sockbuf must be consistent here (points to current mbuf,
+ * it points to next record) when we drop priority;
+ * we must note any additions to the sockbuf when we
+ * block interrupts again.
+ */
+ if (mp == 0) {
+ splx(s);
+#ifdef ZERO_COPY_SOCKETS
+ if (so_zero_copy_receive) {
+ vm_page_t pg;
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
+ moff));
+
+ if (uio->uio_offset == -1)
+ uio->uio_offset =IDX_TO_OFF(pg->pindex);
+
+ error = uiomoveco(mtod(m, caddr_t) + moff,
+ (int)len, uio,pg->object,
+ disposable);
+ } else
+#endif /* ZERO_COPY_SOCKETS */
+ error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
+ s = splnet();
+ if (error)
+ goto release;
+ } else
+ uio->uio_resid -= len;
+ if (len == m->m_len - moff) {
+ if (m->m_flags & M_EOR)
+ flags |= MSG_EOR;
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ moff = 0;
+ } else {
+ nextrecord = m->m_nextpkt;
+ sbfree(&so->so_rcv, m);
+ if (mp) {
+ *mp = m;
+ mp = &m->m_next;
+ so->so_rcv.sb_mb = m = m->m_next;
+ *mp = (struct mbuf *)0;
+ } else {
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+ if (m)
+ m->m_nextpkt = nextrecord;
+ }
+ } else {
+ if (flags & MSG_PEEK)
+ moff += len;
+ else {
+ if (mp)
+ *mp = m_copym(m, 0, len, M_TRYWAIT);
+ m->m_data += len;
+ m->m_len -= len;
+ so->so_rcv.sb_cc -= len;
+ }
+ }
+ if (so->so_oobmark) {
+ if ((flags & MSG_PEEK) == 0) {
+ so->so_oobmark -= len;
+ if (so->so_oobmark == 0) {
+ so->so_state |= SS_RCVATMARK;
+ break;
+ }
+ } else {
+ offset += len;
+ if (offset == so->so_oobmark)
+ break;
+ }
+ }
+ if (flags & MSG_EOR)
+ break;
+ /*
+ * If the MSG_WAITALL flag is set (for non-atomic socket),
+ * we must not quit until "uio->uio_resid == 0" or an error
+ * termination. If a signal/timeout occurs, return
+ * with a short count but without error.
+ * Keep sockbuf locked against other readers.
+ */
+ while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
+ !sosendallatonce(so) && !nextrecord) {
+ if (so->so_error || so->so_state & SS_CANTRCVMORE)
+ break;
+ /*
+ * Notify the protocol that some data has been
+ * drained before blocking.
+ */
+ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ sbunlock(&so->so_rcv);
+ splx(s);
+ return (0);
+ }
+ m = so->so_rcv.sb_mb;
+ if (m)
+ nextrecord = m->m_nextpkt;
+ }
+ }
+
+ if (m && pr->pr_flags & PR_ATOMIC) {
+ flags |= MSG_TRUNC;
+ if ((flags & MSG_PEEK) == 0)
+ (void) sbdroprecord(&so->so_rcv);
+ }
+ if ((flags & MSG_PEEK) == 0) {
+ if (m == 0)
+ so->so_rcv.sb_mb = nextrecord;
+ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ }
+ if (orig_resid == uio->uio_resid && orig_resid &&
+ (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
+ sbunlock(&so->so_rcv);
+ splx(s);
+ goto restart;
+ }
+
+ if (flagsp)
+ *flagsp |= flags;
+release:
+ sbunlock(&so->so_rcv);
+ splx(s);
+ return (error);
+}
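+
+/*
+ * Example: a rough sketch of reading socket data into a kernel buffer
+ * with soreceive(), using a single iovec/uio pair.  The helper name and
+ * buffer handling are illustrative assumptions.
+ *
+ *	static int
+ *	example_recv_bytes(struct socket *so, void *buf, int len,
+ *	    struct thread *td)
+ *	{
+ *		struct uio auio;
+ *		struct iovec aiov;
+ *		int flags = 0;
+ *
+ *		aiov.iov_base = buf;
+ *		aiov.iov_len = len;
+ *		auio.uio_iov = &aiov;
+ *		auio.uio_iovcnt = 1;
+ *		auio.uio_offset = 0;
+ *		auio.uio_resid = len;
+ *		auio.uio_segflg = UIO_SYSSPACE;
+ *		auio.uio_rw = UIO_READ;
+ *		auio.uio_td = td;
+ *		return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
+ *	}
+ */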
+
+int
+soshutdown(so, how)
+ register struct socket *so;
+ register int how;
+{
+ register struct protosw *pr = so->so_proto;
+
+ if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
+ return (EINVAL);
+
+ if (how != SHUT_WR)
+ sorflush(so);
+ if (how != SHUT_RD)
+ return ((*pr->pr_usrreqs->pru_shutdown)(so));
+ return (0);
+}
+
+void
+sorflush(so)
+ register struct socket *so;
+{
+ register struct sockbuf *sb = &so->so_rcv;
+ register struct protosw *pr = so->so_proto;
+ register int s;
+ struct sockbuf asb;
+
+ sb->sb_flags |= SB_NOINTR;
+ (void) sblock(sb, M_WAITOK);
+ s = splimp();
+ socantrcvmore(so);
+ sbunlock(sb);
+ asb = *sb;
+ bzero(sb, sizeof (*sb));
+ splx(s);
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
+ (*pr->pr_domain->dom_dispose)(asb.sb_mb);
+ sbrelease(&asb, so);
+}
+
+#ifdef INET
+static int
+do_setopt_accept_filter(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ struct accept_filter_arg *afap = NULL;
+ struct accept_filter *afp;
+ struct so_accf *af = so->so_accf;
+ int error = 0;
+
+ /* do not set/remove accept filters on non listen sockets */
+ if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* removing the filter */
+ if (sopt == NULL) {
+ if (af != NULL) {
+ if (af->so_accept_filter != NULL &&
+ af->so_accept_filter->accf_destroy != NULL) {
+ af->so_accept_filter->accf_destroy(so);
+ }
+ if (af->so_accept_filter_str != NULL) {
+ FREE(af->so_accept_filter_str, M_ACCF);
+ }
+ FREE(af, M_ACCF);
+ so->so_accf = NULL;
+ }
+ so->so_options &= ~SO_ACCEPTFILTER;
+ return (0);
+ }
+ /* adding a filter */
+ /* must remove previous filter first */
+ if (af != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ /* don't put large objects on the kernel stack */
+ MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
+ afap->af_name[sizeof(afap->af_name)-1] = '\0';
+ afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
+ if (error)
+ goto out;
+ afp = accept_filt_get(afap->af_name);
+ if (afp == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
+ if (afp->accf_create != NULL) {
+ if (afap->af_name[0] != '\0') {
+ int len = strlen(afap->af_name) + 1;
+
+ MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
+ strcpy(af->so_accept_filter_str, afap->af_name);
+ }
+ af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
+ if (af->so_accept_filter_arg == NULL) {
+ FREE(af->so_accept_filter_str, M_ACCF);
+ FREE(af, M_ACCF);
+ so->so_accf = NULL;
+ error = EINVAL;
+ goto out;
+ }
+ }
+ af->so_accept_filter = afp;
+ so->so_accf = af;
+ so->so_options |= SO_ACCEPTFILTER;
+out:
+ if (afap != NULL)
+ FREE(afap, M_TEMP);
+ return (error);
+}
+#endif /* INET */
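+
+/*
+ * Example: from userland a filter is attached to a listening socket by
+ * name via setsockopt(2); the "httpready" filter name below is only an
+ * illustration and must match a filter loaded in the kernel.
+ *
+ *	struct accept_filter_arg afa;
+ *
+ *	bzero(&afa, sizeof(afa));
+ *	strcpy(afa.af_name, "httpready");
+ *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
+ */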
+
+/*
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in
+ * an additional variant to handle the case where the option value needs
+ * to be some kind of integer, but not a specific size.
+ * In addition to their use here, these functions are also called by the
+ * protocol-level pr_ctloutput() routines.
+ */
+int
+sooptcopyin(sopt, buf, len, minlen)
+ struct sockopt *sopt;
+ void *buf;
+ size_t len;
+ size_t minlen;
+{
+ size_t valsize;
+
+ /*
+ * If the user gives us more than we wanted, we ignore it,
+ * but if we don't get the minimum length the caller
+ * wants, we return EINVAL. On success, sopt->sopt_valsize
+ * is set to however much we actually retrieved.
+ */
+ if ((valsize = sopt->sopt_valsize) < minlen)
+ return EINVAL;
+ if (valsize > len)
+ sopt->sopt_valsize = valsize = len;
+
+ if (sopt->sopt_td != 0)
+ return (copyin(sopt->sopt_val, buf, valsize));
+
+ bcopy(sopt->sopt_val, buf, valsize);
+ return 0;
+}
+
+int
+sosetopt(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+ u_long val;
+
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto && so->so_proto->pr_ctloutput)
+ return ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ error = ENOPROTOOPT;
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ error = do_setopt_accept_filter(so, sopt);
+ if (error)
+ goto bad;
+ break;
+#endif
+ case SO_LINGER:
+ error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
+ if (error)
+ goto bad;
+
+ so->so_linger = l.l_linger;
+ if (l.l_onoff)
+ so->so_options |= SO_LINGER;
+ else
+ so->so_options &= ~SO_LINGER;
+ break;
+
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_DONTROUTE:
+ case SO_USELOOPBACK:
+ case SO_BROADCAST:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ case SO_NOSIGPIPE:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+ if (optval)
+ so->so_options |= sopt->sopt_name;
+ else
+ so->so_options &= ~sopt->sopt_name;
+ break;
+
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ /*
+ * Values < 1 make no sense for any of these
+ * options, so disallow them.
+ */
+ if (optval < 1) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ switch (sopt->sopt_name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
+ &so->so_snd : &so->so_rcv, (u_long)optval,
+ so, curthread) == 0) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ break;
+
+ /*
+ * Make sure the low-water is never greater than
+ * the high-water.
+ */
+ case SO_SNDLOWAT:
+ so->so_snd.sb_lowat =
+ (optval > so->so_snd.sb_hiwat) ?
+ so->so_snd.sb_hiwat : optval;
+ break;
+ case SO_RCVLOWAT:
+ so->so_rcv.sb_lowat =
+ (optval > so->so_rcv.sb_hiwat) ?
+ so->so_rcv.sb_hiwat : optval;
+ break;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ error = sooptcopyin(sopt, &tv, sizeof tv,
+ sizeof tv);
+ if (error)
+ goto bad;
+
+ /* assert(hz > 0); */
+ if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
+ tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
+ error = EDOM;
+ goto bad;
+ }
+ /* assert(tick > 0); */
+ /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
+ val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
+ if (val > SHRT_MAX) {
+ error = EDOM;
+ goto bad;
+ }
+
+ switch (sopt->sopt_name) {
+ case SO_SNDTIMEO:
+ so->so_snd.sb_timeo = val;
+ break;
+ case SO_RCVTIMEO:
+ so->so_rcv.sb_timeo = val;
+ break;
+ }
+ break;
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
+ (void) ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ }
+ }
+bad:
+ return (error);
+}
+
+/* Helper routine for getsockopt */
+int
+sooptcopyout(sopt, buf, len)
+ struct sockopt *sopt;
+ void *buf;
+ size_t len;
+{
+ int error;
+ size_t valsize;
+
+ error = 0;
+
+ /*
+ * Documented get behavior is that we always return a value,
+ * possibly truncated to fit in the user's buffer.
+ * Traditional behavior is that we always tell the user
+ * precisely how much we copied, rather than something useful
+ * like the total amount we had available for her.
+ * Note that this interface is not idempotent; the entire answer must
+ * be generated ahead of time.
+ */
+ valsize = min(len, sopt->sopt_valsize);
+ sopt->sopt_valsize = valsize;
+ if (sopt->sopt_val != 0) {
+ if (sopt->sopt_td != 0)
+ error = copyout(buf, sopt->sopt_val, valsize);
+ else
+ bcopy(buf, sopt->sopt_val, valsize);
+ }
+ return error;
+}
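+
+/*
+ * Example: a sketch of how a protocol-level pr_ctloutput() routine might
+ * use these helpers for an integer-valued option.  The option handling
+ * shown (the switch on sopt->sopt_dir and the hypothetical pcb flag) is
+ * an illustrative assumption.
+ *
+ *	case SOPT_SET:
+ *		error = sooptcopyin(sopt, &optval, sizeof optval,
+ *		    sizeof optval);
+ *		if (error == 0)
+ *			pcb->hypothetical_flag = optval;
+ *		break;
+ *	case SOPT_GET:
+ *		optval = pcb->hypothetical_flag;
+ *		error = sooptcopyout(sopt, &optval, sizeof optval);
+ *		break;
+ */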
+
+int
+sogetopt(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+#ifdef INET
+ struct accept_filter_arg *afap;
+#endif
+
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto && so->so_proto->pr_ctloutput) {
+ return ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ } else
+ return (ENOPROTOOPT);
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ if ((so->so_options & SO_ACCEPTCONN) == 0)
+ return (EINVAL);
+ MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
+ M_TEMP, M_WAITOK | M_ZERO);
+ if ((so->so_options & SO_ACCEPTFILTER) != 0) {
+ strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
+ if (so->so_accf->so_accept_filter_str != NULL)
+ strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+ }
+ error = sooptcopyout(sopt, afap, sizeof(*afap));
+ FREE(afap, M_TEMP);
+ break;
+#endif
+
+ case SO_LINGER:
+ l.l_onoff = so->so_options & SO_LINGER;
+ l.l_linger = so->so_linger;
+ error = sooptcopyout(sopt, &l, sizeof l);
+ break;
+
+ case SO_USELOOPBACK:
+ case SO_DONTROUTE:
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_BROADCAST:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ case SO_NOSIGPIPE:
+ optval = so->so_options & sopt->sopt_name;
+integer:
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case SO_TYPE:
+ optval = so->so_type;
+ goto integer;
+
+ case SO_ERROR:
+ optval = so->so_error;
+ so->so_error = 0;
+ goto integer;
+
+ case SO_SNDBUF:
+ optval = so->so_snd.sb_hiwat;
+ goto integer;
+
+ case SO_RCVBUF:
+ optval = so->so_rcv.sb_hiwat;
+ goto integer;
+
+ case SO_SNDLOWAT:
+ optval = so->so_snd.sb_lowat;
+ goto integer;
+
+ case SO_RCVLOWAT:
+ optval = so->so_rcv.sb_lowat;
+ goto integer;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ optval = (sopt->sopt_name == SO_SNDTIMEO ?
+ so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
+
+ tv.tv_sec = optval / hz;
+ tv.tv_usec = (optval % hz) * tick;
+ error = sooptcopyout(sopt, &tv, sizeof tv);
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ return (error);
+ }
+}
+
+/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
+int
+soopt_getm(struct sockopt *sopt, struct mbuf **mp)
+{
+ struct mbuf *m, *m_prev;
+ int sopt_size = sopt->sopt_valsize;
+
+ MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
+ if (m == 0)
+ return ENOBUFS;
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ *mp = m;
+ m_prev = m;
+
+ while (sopt_size) {
+ MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
+ if (m == 0) {
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ m_prev->m_next = m;
+ m_prev = m;
+ }
+ return 0;
+}
+
+/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
+int
+soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+
+ if (sopt->sopt_val == NULL)
+ return 0;
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyin(sopt->sopt_val, mtod(m, char *),
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ (caddr_t)sopt->sopt_val += m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) /* should have been allocated with enough space at ip6_sooptmcopyin() */
+ panic("ip6_sooptmcopyin");
+ return 0;
+}
+
+/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
+int
+soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+ size_t valsize = 0;
+
+ if (sopt->sopt_val == NULL)
+ return 0;
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyout(mtod(m, char *), sopt->sopt_val,
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ (caddr_t)sopt->sopt_val += m->m_len;
+ valsize += m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) {
+ /* a large enough soopt buffer should be supplied from user-land */
+ m_freem(m0);
+ return(EINVAL);
+ }
+ sopt->sopt_valsize = valsize;
+ return 0;
+}
+
+void
+sohasoutofband(so)
+ register struct socket *so;
+{
+ if (so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGURG, 0);
+ selwakeup(&so->so_rcv.sb_sel);
+}
+
+int
+sopoll(struct socket *so, int events, struct ucred *cred, struct thread *td)
+{
+ int revents = 0;
+ int s = splnet();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if (soreadable(so))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & POLLINIGNEOF)
+ if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
+ !TAILQ_EMPTY(&so->so_comp) || so->so_error)
+ revents |= POLLINIGNEOF;
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (sowriteable(so))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & (POLLPRI | POLLRDBAND))
+ if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
+ revents |= events & (POLLPRI | POLLRDBAND);
+
+ if (revents == 0) {
+ if (events &
+ (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
+ POLLRDBAND)) {
+ selrecord(td, &so->so_rcv.sb_sel);
+ so->so_rcv.sb_flags |= SB_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &so->so_snd.sb_sel);
+ so->so_snd.sb_flags |= SB_SEL;
+ }
+ }
+
+ splx(s);
+ return (revents);
+}
+
+int
+sokqfilter(struct file *fp, struct knote *kn)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+ struct sockbuf *sb;
+ int s;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ if (so->so_options & SO_ACCEPTCONN)
+ kn->kn_fop = &solisten_filtops;
+ else
+ kn->kn_fop = &soread_filtops;
+ sb = &so->so_rcv;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &sowrite_filtops;
+ sb = &so->so_snd;
+ break;
+ default:
+ return (1);
+ }
+
+ s = splnet();
+ SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
+ sb->sb_flags |= SB_KNOTE;
+ splx(s);
+ return (0);
+}
+
+static void
+filt_sordetach(struct knote *kn)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+ int s = splnet();
+
+ SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
+ if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
+ so->so_rcv.sb_flags &= ~SB_KNOTE;
+ splx(s);
+}
+
+/*ARGSUSED*/
+static int
+filt_soread(struct knote *kn, long hint)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+
+ kn->kn_data = so->so_rcv.sb_cc;
+ if (so->so_state & SS_CANTRCVMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ }
+ if (so->so_error) /* temporary udp error */
+ return (1);
+ if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ return (kn->kn_data >= so->so_rcv.sb_lowat);
+}
+
+static void
+filt_sowdetach(struct knote *kn)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+ int s = splnet();
+
+ SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
+ if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
+ so->so_snd.sb_flags &= ~SB_KNOTE;
+ splx(s);
+}
+
+/*ARGSUSED*/
+static int
+filt_sowrite(struct knote *kn, long hint)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+
+ kn->kn_data = sbspace(&so->so_snd);
+ if (so->so_state & SS_CANTSENDMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ }
+ if (so->so_error) /* temporary udp error */
+ return (1);
+ if (((so->so_state & SS_ISCONNECTED) == 0) &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED))
+ return (0);
+ if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ return (kn->kn_data >= so->so_snd.sb_lowat);
+}
+
+/*ARGSUSED*/
+static int
+filt_solisten(struct knote *kn, long hint)
+{
+ struct socket *so = (struct socket *)kn->kn_fp->f_data;
+
+ kn->kn_data = so->so_qlen;
+ return (! TAILQ_EMPTY(&so->so_comp));
+}
+
+int
+socheckuid(struct socket *so, uid_t uid)
+{
+
+ if (so == NULL)
+ return (EPERM);
+ if (so->so_cred->cr_uid == uid)
+ return (0);
+ return (EPERM);
+}
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
new file mode 100644
index 0000000..1e68f83
--- /dev/null
+++ b/sys/kern/uipc_socket2.c
@@ -0,0 +1,983 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/domain.h>
+#include <sys/event.h>
+#include <sys/file.h> /* for maxfiles */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+int maxsockets;
+
+void (*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on sockets and socket buffers
+ */
+
+u_long sb_max = SB_MAX; /* XXX should be static */
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+/*
+ * Procedures to manipulate state flags of socket
+ * and do appropriate wakeups. Normal sequence from the
+ * active (originating) side is that soisconnecting() is
+ * called during processing of connect() call,
+ * resulting in an eventual call to soisconnected() if/when the
+ * connection is established. When the connection is torn down
+ * soisdisconnecting() is called during processing of disconnect() call,
+ * and soisdisconnected() is called when the connection to the peer
+ * is totally severed. The semantics of these routines are such that
+ * connectionless protocols can call soisconnected() and soisdisconnected()
+ * only, bypassing the in-progress calls when setting up a ``connection''
+ * takes no time.
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_incomp for connections in progress
+ * and so_comp for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_incomp by calling sonewconn(). When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_comp, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_incomp or so_comp, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
+
+void
+soisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+}
+
+void
+soisconnected(so)
+ struct socket *so;
+{
+ struct socket *head = so->so_head;
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ if (head && (so->so_state & SS_INCOMP)) {
+ if ((so->so_options & SO_ACCEPTFILTER) != 0) {
+ so->so_upcall = head->so_accf->so_accept_filter->accf_callback;
+ so->so_upcallarg = head->so_accf->so_accept_filter_arg;
+ so->so_rcv.sb_flags |= SB_UPCALL;
+ so->so_options &= ~SO_ACCEPTFILTER;
+ so->so_upcall(so, so->so_upcallarg, 0);
+ return;
+ }
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ so->so_state |= SS_COMP;
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+ }
+}
+
+void
+soisdisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup(&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+void
+soisdisconnected(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
+ wakeup(&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
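+
+/*
+ * Example: the typical ordering of the calls above for a
+ * connection-oriented protocol; the originating routines named in
+ * parentheses are illustrative.
+ *
+ *	soisconnecting(so);		(from the protocol's connect request)
+ *	...peer acknowledges the connection...
+ *	soisconnected(so);		(from the protocol's input path)
+ *	...
+ *	soisdisconnecting(so);		(disconnect initiated locally)
+ *	soisdisconnected(so);		(teardown complete)
+ *
+ * A connectionless protocol may call only soisconnected() and
+ * soisdisconnected(), as described earlier.
+ */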
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called. If the
+ * connection is possible (subject to space constraints, etc.)
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
+ *
+ * note: the ref count on the socket is 0 on return
+ */
+struct socket *
+sonewconn(head, connstatus)
+ register struct socket *head;
+ int connstatus;
+{
+ register struct socket *so;
+
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
+ return ((struct socket *)0);
+ so = soalloc(0);
+ if (so == NULL)
+ return ((struct socket *)0);
+ if ((head->so_options & SO_ACCEPTFILTER) != 0)
+ connstatus = 0;
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_timeo = head->so_timeo;
+ so->so_cred = crhold(head->so_cred);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
+ (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sotryfree(so);
+ return ((struct socket *)0);
+ }
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ head->so_qlen++;
+ } else {
+ if (head->so_incqlen > head->so_qlimit) {
+ struct socket *sp;
+ sp = TAILQ_FIRST(&head->so_incomp);
+ (void) soabort(sp);
+ }
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup(&head->so_timeo);
+ so->so_state |= connstatus;
+ }
+ return (so);
+}
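+
+/*
+ * Example: a sketch of how a connection-oriented protocol might use
+ * sonewconn() when a connection request arrives on a listening socket;
+ * the surrounding input processing is omitted.
+ *
+ *	so = sonewconn(head, 0);
+ *	if (so == NULL)
+ *		...drop the request; over the queue limit or out of space...
+ *	...attach protocol state to so and complete the handshake, then:
+ *	soisconnected(so);		(moves so from so_incomp to so_comp)
+ */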
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it would normally be applied to a socket when the user
+ * informs the system that no more data is to be sent, by the protocol
+ * code (in the PRU_SHUTDOWN case). Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+void
+socantsendmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTSENDMORE;
+ sowwakeup(so);
+}
+
+void
+socantrcvmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTRCVMORE;
+ sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(sb)
+ struct sockbuf *sb;
+{
+
+ sb->sb_flags |= SB_WAIT;
+ return (tsleep(&sb->sb_cc,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+/*
+ * Lock a sockbuf already known to be locked;
+ * return any error returned from sleep (EINTR).
+ */
+int
+sb_lock(sb)
+ register struct sockbuf *sb;
+{
+ int error;
+
+ while (sb->sb_flags & SB_LOCK) {
+ sb->sb_flags |= SB_WANT;
+ error = tsleep(&sb->sb_flags,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
+ "sblock", 0);
+ if (error)
+ return (error);
+ }
+ sb->sb_flags |= SB_LOCK;
+ return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+void
+sowakeup(so, sb)
+ register struct socket *so;
+ register struct sockbuf *sb;
+{
+
+ selwakeup(&sb->sb_sel);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup(&sb->sb_cc);
+ }
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGIO, 0);
+ if (sb->sb_flags & SB_UPCALL)
+ (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+ if (sb->sb_flags & SB_AIO)
+ aio_swake(so, sb);
+ KNOTE(&sb->sb_sel.si_note, 0);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data. Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field. Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ * name, then a record containing that name must be present before
+ * any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ * just additional data associated with the message), and there are
+ * ``rights'' to be received, then a record containing this data
+ * should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ * a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
+
+int
+soreserve(so, sndcc, rcvcc)
+ register struct socket *so;
+ u_long sndcc, rcvcc;
+{
+ struct thread *td = curthread;
+
+ if (sbreserve(&so->so_snd, sndcc, so, td) == 0)
+ goto bad;
+ if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ return (0);
+bad2:
+ sbrelease(&so->so_snd, so);
+bad:
+ return (ENOBUFS);
+}
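+
+/*
+ * Example: a sketch of a protocol attach routine reserving default buffer
+ * space with soreserve(); the space values are illustrative assumptions.
+ *
+ *	static u_long example_sendspace = 8192;
+ *	static u_long example_recvspace = 8192;
+ *
+ *	error = soreserve(so, example_sendspace, example_recvspace);
+ *	if (error)
+ *		return (error);
+ */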
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ */
+int
+sbreserve(sb, cc, so, td)
+ struct sockbuf *sb;
+ u_long cc;
+ struct socket *so;
+ struct thread *td;
+{
+
+ /*
+ * td will only be NULL when we're in an interrupt
+ * (e.g. in tcp_input())
+ */
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ return (0);
+ if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+ td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) {
+ return (0);
+ }
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
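+
+/*
+ * Worked example of the cap above (a sketch assuming the common i386
+ * values MSIZE = 256 and MCLBYTES = 2048): with sb_max = 262144 the
+ * largest acceptable cc is 262144 * 2048 / 2304 = 233016 bytes, which
+ * keeps mbuf overhead from pushing actual storage past sb_max.
+ */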
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease(sb, so)
+ struct sockbuf *sb;
+ struct socket *so;
+{
+
+ sbflush(sb);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
+ sb->sb_mbmax = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added. sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used. To place
+ * access rights and data in a socket receive buffer, sbappendcontrol()
+ * should be used. In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement. Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and then removed from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
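+
+/*
+ * Illustrative send-side pattern (a sketch, not any particular protocol):
+ *
+ *	if (sbspace(&so->so_snd) < (long)m->m_pkthdr.len)
+ *		... block in sbwait() or fail ...
+ *	sbappend(&so->so_snd, m);
+ *	... transmit a copy made with m_copy(); later, when the peer
+ *	    acknowledges "acked" bytes ...
+ *	sbdrop(&so->so_snd, acked);
+ */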
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb. The additional space associated
+ * with the mbuf chain is recorded in sb. Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(sb, m)
+ struct sockbuf *sb;
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == 0)
+ return;
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ }
+ sbcompress(sb, m, n);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+ register struct mbuf *n = 0;
+ register u_long len = 0, mbcnt = 0;
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+void
+sbappendrecord(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+void
+sbinsertoob(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+ register struct mbuf **mp;
+
+ if (m0 == 0)
+ return;
+ for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
+ again:
+ switch (m->m_type) {
+
+ case MT_OOBDATA:
+ continue; /* WANT next train */
+
+ case MT_CONTROL:
+ m = m->m_next;
+ if (m)
+ goto again; /* inspect THIS train further */
+ }
+ break;
+ }
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ m0->m_nextpkt = *mp;
+ *mp = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket. If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+int
+sbappendaddr(sb, asa, m0, control)
+ register struct sockbuf *sb;
+ struct sockaddr *asa;
+ struct mbuf *m0, *control;
+{
+ register struct mbuf *m, *n;
+ int space = asa->sa_len;
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ for (n = control; n; n = n->m_next) {
+ space += n->m_len;
+ if (n->m_next == 0) /* keep pointer to last control buf */
+ break;
+ }
+ if (space > sbspace(sb))
+ return (0);
+ if (asa->sa_len > MLEN)
+ return (0);
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n; n = n->m_next)
+ sballoc(sb, n);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = m;
+ } else
+ sb->sb_mb = m;
+ return (1);
+}
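+
+/*
+ * Illustrative datagram receive path (a sketch; "fromaddr" and "opts"
+ * stand in for whatever the protocol provides):
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&fromaddr,
+ *	    m, opts) == 0) {
+ *		m_freem(m);		no room or no mbufs
+ *	} else
+ *		sorwakeup(so);
+ */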
+
+int
+sbappendcontrol(sb, m0, control)
+ struct sockbuf *sb;
+ struct mbuf *control, *m0;
+{
+ register struct mbuf *m, *n;
+ int space = 0;
+
+ if (control == 0)
+ panic("sbappendcontrol");
+ for (m = control; ; m = m->m_next) {
+ space += m->m_len;
+ if (m->m_next == 0)
+ break;
+ }
+ n = m; /* save pointer to last control buffer */
+ for (m = m0; m; m = m->m_next)
+ space += m->m_len;
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+ for (m = control; m; m = m->m_next)
+ sballoc(sb, m);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = control;
+ } else
+ sb->sb_mb = control;
+ return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n. If n
+ * is null, the buffer is presumed empty.
+ */
+void
+sbcompress(sb, m, n)
+ register struct sockbuf *sb;
+ register struct mbuf *m, *n;
+{
+ register int eor = 0;
+ register struct mbuf *o;
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(n) &&
+ m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ m->m_len <= M_TRAILINGSPACE(n) &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ if (n)
+ n->m_flags |= eor;
+ else
+ printf("semi-panic: sbcompress\n");
+ }
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+void
+sbflush(sb)
+ register struct sockbuf *sb;
+{
+
+ if (sb->sb_flags & SB_LOCK)
+ panic("sbflush: locked");
+ while (sb->sb_mbcnt) {
+ /*
+ * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+ * we would loop forever. Panic instead.
+ */
+ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+ break;
+ sbdrop(sb, (int)sb->sb_cc);
+ }
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop(sb, len)
+ register struct sockbuf *sb;
+ register int len;
+{
+ register struct mbuf *m;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+void
+sbdroprecord(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ m = m_free(m);
+ } while (m);
+ }
+}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if (CMSG_SPACE((u_int)size) > MCLBYTES)
+ return ((struct mbuf *) NULL);
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ if (CMSG_SPACE((u_int)size) > MLEN) {
+ MCLGET(m, M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ return ((struct mbuf *) NULL);
+ }
+ }
+ cp = mtod(m, struct cmsghdr *);
+ m->m_len = 0;
+ KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+ ("sbcreatecontrol: short mbuf"));
+ if (p != NULL)
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ m->m_len = CMSG_SPACE(size);
+ cp->cmsg_len = CMSG_LEN(size);
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
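+
+/*
+ * Usage sketch (illustrative): building the kind of timestamp control
+ * message a protocol attaches when SO_TIMESTAMP is set:
+ *
+ *	struct timeval tv;
+ *
+ *	microtime(&tv);
+ *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
+ *	    SCM_TIMESTAMP, SOL_SOCKET);
+ */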
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, struct thread *td)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+dup_sockaddr(sa, canwait)
+ struct sockaddr *sa;
+ int canwait;
+{
+ struct sockaddr *sa2;
+
+ MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
+ canwait ? M_WAITOK : M_NOWAIT);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information
+ * in the kernel-format socket structure pointed to by so. This is done
+ * to reduce the spew of irrelevant information over this interface,
+ * to isolate user code from changes in the kernel structure, and
+ * potentially to provide information-hiding if we decide that
+ * some of this information should be hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_cred->cr_uid;
+}
+
+/*
+ * This does the same for sockbufs. Note that the xsockbuf structure,
+ * since it is always embedded in a socket, does not include a self
+ * pointer nor a length. We make this entry point public in case
+ * some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
+ &sb_max, 0, "Maximum socket buffer size");
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
+    &maxsockets, 0, "Maximum number of sockets available");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
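+
+/*
+ * These knobs are visible to userland through sysctl(8), e.g.
+ * (illustrative value):
+ *
+ *	sysctl kern.ipc.maxsockbuf
+ *	sysctl kern.ipc.maxsockbuf=524288
+ */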
+
+/*
+ * Initialise maxsockets
+ */
+static void init_maxsockets(void *ignored)
+{
+ TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+ maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
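+
+/*
+ * TUNABLE_INT_FETCH() above reads the kernel environment, so maxsockets
+ * can be raised above the computed floor at boot time, e.g.
+ * (illustrative value) in /boot/loader.conf:
+ *
+ *	kern.ipc.maxsockets="32768"
+ */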
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
new file mode 100644
index 0000000..1e9c5fa
--- /dev/null
+++ b/sys/kern/uipc_syscalls.c
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * sendfile(2) and related extensions:
+ * Copyright (c) 1998, David Greenman. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+static void sf_buf_init(void *arg);
+SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
+struct sf_buf *sf_buf_alloc(void);
+void sf_buf_free(void *addr, void *args);
+
+static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
+static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
+
+static int accept1(struct thread *td, struct accept_args *uap, int compat);
+static int getsockname1(struct thread *td, struct getsockname_args *uap,
+ int compat);
+static int getpeername1(struct thread *td, struct getpeername_args *uap,
+ int compat);
+
+/*
+ * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the
+ * sf_freelist head protected by the sf_lock mutex.
+ */
+static struct {
+ SLIST_HEAD(, sf_buf) sf_head;
+ struct mtx sf_lock;
+} sf_freelist;
+
+vm_offset_t sf_base;
+struct sf_buf *sf_bufs;
+u_int sf_buf_alloc_want;
+
+/*
+ * System call interface to the socket abstraction.
+ */
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#define COMPAT_OLDSOCK
+#endif
+
+extern struct fileops socketops;
+
+/*
+ * MPSAFE
+ */
+int
+socket(td, uap)
+ struct thread *td;
+ register struct socket_args /* {
+ int domain;
+ int type;
+ int protocol;
+ } */ *uap;
+{
+ struct filedesc *fdp;
+ struct socket *so;
+ struct file *fp;
+ int fd, error;
+
+ mtx_lock(&Giant);
+ fdp = td->td_proc->p_fd;
+ error = falloc(td, &fp, &fd);
+ if (error)
+ goto done2;
+ fhold(fp);
+ error = socreate(uap->domain, &so, uap->type, uap->protocol,
+ td->td_ucred, td);
+ FILEDESC_LOCK(fdp);
+ if (error) {
+ if (fdp->fd_ofiles[fd] == fp) {
+ fdp->fd_ofiles[fd] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ } else {
+ fp->f_data = so; /* already has ref count */
+ fp->f_flag = FREAD|FWRITE;
+ fp->f_ops = &socketops;
+ fp->f_type = DTYPE_SOCKET;
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = fd;
+ }
+ fdrop(fp, td);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+bind(td, uap)
+ struct thread *td;
+ register struct bind_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct socket *so;
+ struct sockaddr *sa;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
+ goto done2;
+ if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
+ goto done1;
+ error = sobind(so, sa, td);
+ FREE(sa, M_SONAME);
+done1:
+ fputsock(so);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+listen(td, uap)
+ struct thread *td;
+ register struct listen_args /* {
+ int s;
+ int backlog;
+ } */ *uap;
+{
+ struct socket *so;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
+ error = solisten(so, uap->backlog, td);
+ fputsock(so);
+ }
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+/*
+ * accept1()
+ * MPSAFE
+ */
+static int
+accept1(td, uap, compat)
+ struct thread *td;
+ register struct accept_args /* {
+ int s;
+ caddr_t name;
+ int *anamelen;
+ } */ *uap;
+ int compat;
+{
+ struct filedesc *fdp;
+ struct file *nfp = NULL;
+ struct sockaddr *sa;
+ int namelen, error, s;
+ struct socket *head, *so;
+ int fd;
+ u_int fflag;
+
+ mtx_lock(&Giant);
+ fdp = td->td_proc->p_fd;
+ if (uap->name) {
+ error = copyin(uap->anamelen, &namelen, sizeof (namelen));
+ if(error)
+ goto done2;
+ }
+ error = fgetsock(td, uap->s, &head, &fflag);
+ if (error)
+ goto done2;
+ s = splnet();
+ if ((head->so_options & SO_ACCEPTCONN) == 0) {
+ splx(s);
+ error = EINVAL;
+ goto done;
+ }
+ if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
+ splx(s);
+ error = EWOULDBLOCK;
+ goto done;
+ }
+ while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
+ if (head->so_state & SS_CANTRCVMORE) {
+ head->so_error = ECONNABORTED;
+ break;
+ }
+ error = tsleep(&head->so_timeo, PSOCK | PCATCH,
+ "accept", 0);
+ if (error) {
+ splx(s);
+ goto done;
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ splx(s);
+ goto done;
+ }
+
+ /*
+ * At this point we know that there is at least one connection
+ * ready to be accepted. Remove it from the queue prior to
+ * allocating the file descriptor for it since falloc() may
+ * block allowing another process to accept the connection
+ * instead.
+ */
+ so = TAILQ_FIRST(&head->so_comp);
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+
+ error = falloc(td, &nfp, &fd);
+ if (error) {
+ /*
+ * Probably ran out of file descriptors. Put the
+ * unaccepted connection back onto the queue and
+ * do another wakeup so some other process might
+ * have a chance at it.
+ */
+ TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ wakeup_one(&head->so_timeo);
+ splx(s);
+ goto done;
+ }
+ fhold(nfp);
+ td->td_retval[0] = fd;
+
+ /* connection has been removed from the listen queue */
+ KNOTE(&head->so_rcv.sb_sel.si_note, 0);
+
+ so->so_state &= ~SS_COMP;
+ so->so_head = NULL;
+ if (head->so_sigio != NULL)
+ fsetown(fgetown(head->so_sigio), &so->so_sigio);
+
+ FILE_LOCK(nfp);
+ soref(so); /* file descriptor reference */
+ nfp->f_data = so; /* nfp has ref count from falloc */
+ nfp->f_flag = fflag;
+ nfp->f_ops = &socketops;
+ nfp->f_type = DTYPE_SOCKET;
+ FILE_UNLOCK(nfp);
+ sa = 0;
+ error = soaccept(so, &sa);
+ if (error) {
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (uap->name != NULL) {
+ namelen = 0;
+ (void) copyout(&namelen,
+ uap->anamelen, sizeof(*uap->anamelen));
+ }
+ goto noconnection;
+ }
+ if (sa == NULL) {
+ namelen = 0;
+ if (uap->name)
+ goto gotnoname;
+ splx(s);
+ error = 0;
+ goto done;
+ }
+ if (uap->name) {
+ /* check sa_len before it is destroyed */
+ if (namelen > sa->sa_len)
+ namelen = sa->sa_len;
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family =
+ sa->sa_family;
+#endif
+ error = copyout(sa, uap->name, (u_int)namelen);
+ if (!error)
+gotnoname:
+ error = copyout(&namelen,
+ uap->anamelen, sizeof (*uap->anamelen));
+ }
+noconnection:
+ if (sa)
+ FREE(sa, M_SONAME);
+
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error) {
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[fd] == nfp) {
+ fdp->fd_ofiles[fd] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(nfp, td);
+ } else {
+ FILEDESC_UNLOCK(fdp);
+ }
+ }
+ splx(s);
+
+ /*
+ * Release explicitly held references before returning.
+ */
+done:
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fputsock(head);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE (accept1() is MPSAFE)
+ */
+int
+accept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE (accept1() is MPSAFE)
+ */
+int
+oaccept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+connect(td, uap)
+ struct thread *td;
+ register struct connect_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct socket *so;
+ struct sockaddr *sa;
+ int error, s;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
+ goto done2;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
+ error = EALREADY;
+ goto done1;
+ }
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error)
+ goto done1;
+ error = soconnect(so, sa, td);
+ if (error)
+ goto bad;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
+ FREE(sa, M_SONAME);
+ error = EINPROGRESS;
+ goto done1;
+ }
+ s = splnet();
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
+ if (error)
+ break;
+ }
+ if (error == 0) {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ splx(s);
+bad:
+ so->so_state &= ~SS_ISCONNECTING;
+ FREE(sa, M_SONAME);
+ if (error == ERESTART)
+ error = EINTR;
+done1:
+ fputsock(so);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+socketpair(td, uap)
+ struct thread *td;
+ register struct socketpair_args /* {
+ int domain;
+ int type;
+ int protocol;
+ int *rsv;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *fp1, *fp2;
+ struct socket *so1, *so2;
+ int fd, error, sv[2];
+
+ mtx_lock(&Giant);
+ error = socreate(uap->domain, &so1, uap->type, uap->protocol,
+ td->td_ucred, td);
+ if (error)
+ goto done2;
+ error = socreate(uap->domain, &so2, uap->type, uap->protocol,
+ td->td_ucred, td);
+ if (error)
+ goto free1;
+ error = falloc(td, &fp1, &fd);
+ if (error)
+ goto free2;
+ fhold(fp1);
+ sv[0] = fd;
+ fp1->f_data = so1; /* so1 already has ref count */
+ error = falloc(td, &fp2, &fd);
+ if (error)
+ goto free3;
+ fhold(fp2);
+ fp2->f_data = so2; /* so2 already has ref count */
+ sv[1] = fd;
+ error = soconnect2(so1, so2);
+ if (error)
+ goto free4;
+ if (uap->type == SOCK_DGRAM) {
+ /*
+ * Datagram socket connection is asymmetric.
+ */
+ error = soconnect2(so2, so1);
+ if (error)
+ goto free4;
+ }
+ FILE_LOCK(fp1);
+ fp1->f_flag = FREAD|FWRITE;
+ fp1->f_ops = &socketops;
+ fp1->f_type = DTYPE_SOCKET;
+ FILE_UNLOCK(fp1);
+ FILE_LOCK(fp2);
+ fp2->f_flag = FREAD|FWRITE;
+ fp2->f_ops = &socketops;
+ fp2->f_type = DTYPE_SOCKET;
+ FILE_UNLOCK(fp2);
+ error = copyout(sv, uap->rsv, 2 * sizeof (int));
+ fdrop(fp1, td);
+ fdrop(fp2, td);
+ goto done2;
+free4:
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[sv[1]] == fp2) {
+ fdp->fd_ofiles[sv[1]] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp2, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp2, td);
+free3:
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[sv[0]] == fp1) {
+ fdp->fd_ofiles[sv[0]] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp1, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp1, td);
+free2:
+ (void)soclose(so2);
+free1:
+ (void)soclose(so1);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+sendit(td, s, mp, flags)
+ register struct thread *td;
+ int s;
+ register struct msghdr *mp;
+ int flags;
+{
+ struct uio auio;
+ register struct iovec *iov;
+ register int i;
+ struct mbuf *control;
+ struct sockaddr *to = NULL;
+ int len, error;
+ struct socket *so;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+ int iovlen;
+#endif
+
+ if ((error = fgetsock(td, s, &so, NULL)) != 0)
+ return (error);
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ error = EINVAL;
+ goto bad;
+ }
+ }
+ if (mp->msg_name) {
+ error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
+ if (error)
+ goto bad;
+ }
+ if (mp->msg_control) {
+ if (mp->msg_controllen < sizeof(struct cmsghdr)
+#ifdef COMPAT_OLDSOCK
+ && mp->msg_flags != MSG_COMPAT
+#endif
+ ) {
+ error = EINVAL;
+ goto bad;
+ }
+ error = sockargs(&control, mp->msg_control,
+ mp->msg_controllen, MT_CONTROL);
+ if (error)
+ goto bad;
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags == MSG_COMPAT) {
+ register struct cmsghdr *cm;
+
+ M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
+ if (control == 0) {
+ error = ENOBUFS;
+ goto bad;
+ } else {
+ cm = mtod(control, struct cmsghdr *);
+ cm->cmsg_len = control->m_len;
+ cm->cmsg_level = SOL_SOCKET;
+ cm->cmsg_type = SCM_RIGHTS;
+ }
+ }
+#endif
+ } else {
+ control = 0;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO)) {
+ iovlen = auio.uio_iovcnt * sizeof (struct iovec);
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy(auio.uio_iov, ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ len = auio.uio_resid;
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
+ flags, td);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = td->td_retval[0];
+ ktrgenio(s, UIO_WRITE, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+bad:
+ fputsock(so);
+ if (to)
+ FREE(to, M_SONAME);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+sendto(td, uap)
+ struct thread *td;
+ register struct sendto_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t to;
+ int tolen;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ msg.msg_name = uap->to;
+ msg.msg_namelen = uap->tolen;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = 0;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ mtx_lock(&Giant);
+ error = sendit(td, uap->s, &msg, uap->flags);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE
+ */
+int
+osend(td, uap)
+ struct thread *td;
+ register struct osend_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = 0;
+ mtx_lock(&Giant);
+ error = sendit(td, uap->s, &msg, uap->flags);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+osendmsg(td, uap)
+ struct thread *td;
+ register struct osendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ mtx_lock(&Giant);
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error)
+ goto done2;
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
+ error = EMSGSIZE;
+ goto done2;
+ }
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else {
+ iov = aiov;
+ }
+ error = copyin(msg.msg_iov, iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ msg.msg_flags = MSG_COMPAT;
+ msg.msg_iov = iov;
+ error = sendit(td, uap->s, &msg, uap->flags);
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+sendmsg(td, uap)
+ struct thread *td;
+ register struct sendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ mtx_lock(&Giant);
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error)
+ goto done2;
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
+ error = EMSGSIZE;
+ goto done2;
+ }
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else {
+ iov = aiov;
+ }
+ if (msg.msg_iovlen &&
+ (error = copyin(msg.msg_iov, iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
+ goto done;
+ msg.msg_iov = iov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ error = sendit(td, uap->s, &msg, uap->flags);
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+recvit(td, s, mp, namelenp)
+ register struct thread *td;
+ int s;
+ register struct msghdr *mp;
+ void *namelenp;
+{
+ struct uio auio;
+ register struct iovec *iov;
+ register int i;
+ int len, error;
+ struct mbuf *m, *control = 0;
+ caddr_t ctlbuf;
+ struct socket *so;
+ struct sockaddr *fromsa = 0;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+ int iovlen;
+#endif
+
+ if ((error = fgetsock(td, s, &so, NULL)) != 0)
+ return (error);
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ fputsock(so);
+ return (EINVAL);
+ }
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO)) {
+ iovlen = auio.uio_iovcnt * sizeof (struct iovec);
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy(auio.uio_iov, ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ len = auio.uio_resid;
+ error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
+ (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
+ &mp->msg_flags);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = len - auio.uio_resid;
+ ktrgenio(s, UIO_READ, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ if (error)
+ goto out;
+ td->td_retval[0] = len - auio.uio_resid;
+ if (mp->msg_name) {
+ len = mp->msg_namelen;
+ if (len <= 0 || fromsa == 0)
+ len = 0;
+ else {
+#ifndef MIN
+#define MIN(a,b) ((a)>(b)?(b):(a))
+#endif
+ /* save sa_len before it is destroyed by MSG_COMPAT */
+ len = MIN(len, fromsa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ ((struct osockaddr *)fromsa)->sa_family =
+ fromsa->sa_family;
+#endif
+ error = copyout(fromsa, mp->msg_name, (unsigned)len);
+ if (error)
+ goto out;
+ }
+ mp->msg_namelen = len;
+ if (namelenp &&
+ (error = copyout(&len, namelenp, sizeof (int)))) {
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ error = 0; /* old recvfrom didn't check */
+ else
+#endif
+ goto out;
+ }
+ }
+ if (mp->msg_control) {
+#ifdef COMPAT_OLDSOCK
+ /*
+ * We assume that old recvmsg calls won't receive access
+ * rights and other control info, esp. as control info
+ * is always optional and those options didn't exist in 4.3.
+ * If we receive rights, trim the cmsghdr; anything else
+ * is tossed.
+ */
+ if (control && mp->msg_flags & MSG_COMPAT) {
+ if (mtod(control, struct cmsghdr *)->cmsg_level !=
+ SOL_SOCKET ||
+ mtod(control, struct cmsghdr *)->cmsg_type !=
+ SCM_RIGHTS) {
+ mp->msg_controllen = 0;
+ goto out;
+ }
+ control->m_len -= sizeof (struct cmsghdr);
+ control->m_data += sizeof (struct cmsghdr);
+ }
+#endif
+ len = mp->msg_controllen;
+ m = control;
+ mp->msg_controllen = 0;
+ ctlbuf = mp->msg_control;
+
+ while (m && len > 0) {
+ unsigned int tocopy;
+
+ if (len >= m->m_len)
+ tocopy = m->m_len;
+ else {
+ mp->msg_flags |= MSG_CTRUNC;
+ tocopy = len;
+ }
+
+ if ((error = copyout(mtod(m, caddr_t),
+ ctlbuf, tocopy)) != 0)
+ goto out;
+
+ ctlbuf += tocopy;
+ len -= tocopy;
+ m = m->m_next;
+ }
+ mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
+ }
+out:
+ fputsock(so);
+ if (fromsa)
+ FREE(fromsa, M_SONAME);
+ if (control)
+ m_freem(control);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+recvfrom(td, uap)
+ struct thread *td;
+ register struct recvfrom_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t from;
+ int *fromlenaddr;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ mtx_lock(&Giant);
+ if (uap->fromlenaddr) {
+ error = copyin(uap->fromlenaddr,
+ &msg.msg_namelen, sizeof (msg.msg_namelen));
+ if (error)
+ goto done2;
+ } else {
+ msg.msg_namelen = 0;
+ }
+ msg.msg_name = uap->from;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ error = recvit(td, uap->s, &msg, uap->fromlenaddr);
+done2:
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE
+ */
+int
+orecvfrom(td, uap)
+ struct thread *td;
+ struct recvfrom_args *uap;
+{
+
+ uap->flags |= MSG_COMPAT;
+ return (recvfrom(td, uap));
+}
+#endif
+
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE
+ */
+int
+orecv(td, uap)
+ struct thread *td;
+ register struct orecv_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ mtx_lock(&Giant);
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ error = recvit(td, uap->s, &msg, NULL);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Old recvmsg. This code takes advantage of the fact that the old msghdr
+ * overlays the new one, missing only the flags, and with the (old) access
+ * rights where the control fields are now.
+ *
+ * MPSAFE
+ */
+int
+orecvmsg(td, uap)
+ struct thread *td;
+ register struct orecvmsg_args /* {
+ int s;
+ struct omsghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error)
+ return (error);
+
+ mtx_lock(&Giant);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
+ error = EMSGSIZE;
+ goto done2;
+ }
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else {
+ iov = aiov;
+ }
+ msg.msg_flags = uap->flags | MSG_COMPAT;
+ error = copyin(msg.msg_iov, iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ msg.msg_iov = iov;
+ error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
+
+ if (msg.msg_controllen && error == 0)
+ error = copyout(&msg.msg_controllen,
+ &uap->msg->msg_accrightslen, sizeof (int));
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+recvmsg(td, uap)
+ struct thread *td;
+ register struct recvmsg_args /* {
+ int s;
+ struct msghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
+ register int error;
+
+ mtx_lock(&Giant);
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error)
+ goto done2;
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
+ error = EMSGSIZE;
+ goto done2;
+ }
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else {
+ iov = aiov;
+ }
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = uap->flags &~ MSG_COMPAT;
+#else
+ msg.msg_flags = uap->flags;
+#endif
+ uiov = msg.msg_iov;
+ msg.msg_iov = iov;
+ error = copyin(uiov, iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ error = recvit(td, uap->s, &msg, NULL);
+ if (!error) {
+ msg.msg_iov = uiov;
+ error = copyout(&msg, uap->msg, sizeof(msg));
+ }
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+shutdown(td, uap)
+ struct thread *td;
+ register struct shutdown_args /* {
+ int s;
+ int how;
+ } */ *uap;
+{
+ struct socket *so;
+ int error;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
+ error = soshutdown(so, uap->how);
+ fputsock(so);
+ }
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+setsockopt(td, uap)
+ struct thread *td;
+ register struct setsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int valsize;
+ } */ *uap;
+{
+ struct socket *so;
+ struct sockopt sopt;
+ int error;
+
+ if (uap->val == 0 && uap->valsize != 0)
+ return (EFAULT);
+ if (uap->valsize < 0)
+ return (EINVAL);
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = uap->level;
+ sopt.sopt_name = uap->name;
+ sopt.sopt_val = uap->val;
+ sopt.sopt_valsize = uap->valsize;
+ sopt.sopt_td = td;
+ error = sosetopt(so, &sopt);
+ fputsock(so);
+ }
+ mtx_unlock(&Giant);
+ return(error);
+}
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+getsockopt(td, uap)
+ struct thread *td;
+ register struct getsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int *avalsize;
+ } */ *uap;
+{
+ int valsize, error;
+ struct socket *so;
+ struct sockopt sopt;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
+ goto done2;
+ if (uap->val) {
+ error = copyin(uap->avalsize, &valsize, sizeof (valsize));
+ if (error)
+ goto done1;
+ if (valsize < 0) {
+ error = EINVAL;
+ goto done1;
+ }
+ } else {
+ valsize = 0;
+ }
+
+ sopt.sopt_dir = SOPT_GET;
+ sopt.sopt_level = uap->level;
+ sopt.sopt_name = uap->name;
+ sopt.sopt_val = uap->val;
+ sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
+ sopt.sopt_td = td;
+
+ error = sogetopt(so, &sopt);
+ if (error == 0) {
+ valsize = sopt.sopt_valsize;
+ error = copyout(&valsize, uap->avalsize, sizeof (valsize));
+ }
+done1:
+ fputsock(so);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * getsockname1() - Get socket name.
+ *
+ * MPSAFE
+ */
+/* ARGSUSED */
+static int
+getsockname1(td, uap, compat)
+ struct thread *td;
+ register struct getsockname_args /* {
+ int fdes;
+ caddr_t asa;
+ int *alen;
+ } */ *uap;
+ int compat;
+{
+ struct socket *so;
+ struct sockaddr *sa;
+ int len, error;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
+ goto done2;
+ error = copyin(uap->alen, &len, sizeof (len));
+ if (error)
+ goto done1;
+ sa = 0;
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
+ if (error)
+ goto bad;
+ if (sa == 0) {
+ len = 0;
+ goto gotnothing;
+ }
+
+ len = MIN(len, sa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ if (error == 0)
+gotnothing:
+ error = copyout(&len, uap->alen, sizeof (len));
+bad:
+ if (sa)
+ FREE(sa, M_SONAME);
+done1:
+ fputsock(so);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+getsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE
+ */
+int
+ogetsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/*
+ * getpeername1() - Get name of peer for connected socket.
+ *
+ * MPSAFE
+ */
+/* ARGSUSED */
+static int
+getpeername1(td, uap, compat)
+ struct thread *td;
+ register struct getpeername_args /* {
+ int fdes;
+ caddr_t asa;
+ int *alen;
+ } */ *uap;
+ int compat;
+{
+ struct socket *so;
+ struct sockaddr *sa;
+ int len, error;
+
+ mtx_lock(&Giant);
+ if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
+ goto done2;
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
+ error = ENOTCONN;
+ goto done1;
+ }
+ error = copyin(uap->alen, &len, sizeof (len));
+ if (error)
+ goto done1;
+ sa = 0;
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
+ if (error)
+ goto bad;
+ if (sa == 0) {
+ len = 0;
+ goto gotnothing;
+ }
+ len = MIN(len, sa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family =
+ sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ if (error)
+ goto bad;
+gotnothing:
+ error = copyout(&len, uap->alen, sizeof (len));
+bad:
+ if (sa)
+ FREE(sa, M_SONAME);
+done1:
+ fputsock(so);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * MPSAFE
+ */
+int
+getpeername(td, uap)
+ struct thread *td;
+ struct getpeername_args *uap;
+{
+
+ return (getpeername1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+/*
+ * MPSAFE
+ */
+int
+ogetpeername(td, uap)
+ struct thread *td;
+ struct ogetpeername_args *uap;
+{
+
+ /* XXX uap should have type `getpeername_args *' to begin with. */
+ return (getpeername1(td, (struct getpeername_args *)uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+int
+sockargs(mp, buf, buflen, type)
+ struct mbuf **mp;
+ caddr_t buf;
+ int buflen, type;
+{
+ register struct sockaddr *sa;
+ register struct mbuf *m;
+ int error;
+
+ if ((u_int)buflen > MLEN) {
+#ifdef COMPAT_OLDSOCK
+ if (type == MT_SONAME && (u_int)buflen <= 112)
+ buflen = MLEN; /* unix domain compat. hack */
+ else
+#endif
+ return (EINVAL);
+ }
+ m = m_get(M_TRYWAIT, type);
+ if (m == NULL)
+ return (ENOBUFS);
+ m->m_len = buflen;
+ error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
+ if (error)
+ (void) m_free(m);
+ else {
+ *mp = m;
+ if (type == MT_SONAME) {
+ sa = mtod(m, struct sockaddr *);
+
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = buflen;
+ }
+ }
+ return (error);
+}
+
+int
+getsockaddr(namp, uaddr, len)
+ struct sockaddr **namp;
+ caddr_t uaddr;
+ size_t len;
+{
+ struct sockaddr *sa;
+ int error;
+
+ if (len > SOCK_MAXADDRLEN)
+ return ENAMETOOLONG;
+ MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
+ error = copyin(uaddr, sa, len);
+ if (error) {
+ FREE(sa, M_SONAME);
+ } else {
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = len;
+ *namp = sa;
+ }
+ return error;
+}
+
+/*
+ * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
+ * XXX - The sf_buf functions are currently private to sendfile(2), so have
+ * been made static, but may be useful in the future for doing zero-copy in
+ * other parts of the networking code.
+ */
+static void
+sf_buf_init(void *arg)
+{
+ int i;
+
+ mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
+ mtx_lock(&sf_freelist.sf_lock);
+ SLIST_INIT(&sf_freelist.sf_head);
+ sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
+ sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
+ M_NOWAIT | M_ZERO);
+ for (i = 0; i < nsfbufs; i++) {
+ sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
+ }
+ sf_buf_alloc_want = 0;
+ mtx_unlock(&sf_freelist.sf_lock);
+}
+
+/*
+ * Get an sf_buf from the freelist. Will block if none are available.
+ */
+struct sf_buf *
+sf_buf_alloc()
+{
+ struct sf_buf *sf;
+ int error;
+
+ mtx_lock(&sf_freelist.sf_lock);
+ while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
+ sf_buf_alloc_want++;
+ error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
+ "sfbufa", 0);
+ sf_buf_alloc_want--;
+
+ /*
+ * If we got a signal, don't risk going back to sleep.
+ */
+ if (error)
+ break;
+ }
+ if (sf != NULL)
+ SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
+ mtx_unlock(&sf_freelist.sf_lock);
+ return (sf);
+}
+
+#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
+
+/*
+ * Detach mapped page and release resources back to the system.
+ */
+void
+sf_buf_free(void *addr, void *args)
+{
+ struct sf_buf *sf;
+ struct vm_page *m;
+
+ GIANT_REQUIRED;
+
+ sf = dtosf(addr);
+ pmap_qremove((vm_offset_t)addr, 1);
+ m = sf->m;
+ vm_page_unwire(m, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (m->wire_count == 0 && m->object == NULL)
+ vm_page_free(m);
+ sf->m = NULL;
+ mtx_lock(&sf_freelist.sf_lock);
+ SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
+ if (sf_buf_alloc_want > 0)
+ wakeup_one(&sf_freelist);
+ mtx_unlock(&sf_freelist.sf_lock);
+}
+
+/*
+ * sendfile(2)
+ *
+ * MPSAFE
+ *
+ * int sendfile(int fd, int s, off_t offset, size_t nbytes,
+ * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
+ *
+ * Send a file specified by 'fd' and starting at 'offset' to a socket
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if
+ * nbytes == 0. Optionally add a header and/or trailer to the socket
+ * output. If specified, write the total number of bytes sent into *sbytes.
+ *
+ */
+int
+sendfile(struct thread *td, struct sendfile_args *uap)
+{
+ struct vnode *vp;
+ struct vm_object *obj;
+ struct socket *so = NULL;
+ struct mbuf *m;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ struct writev_args nuap;
+ struct sf_hdtr hdtr;
+ off_t off, xfsize, hdtr_size, sbytes = 0;
+ int error, s;
+
+ mtx_lock(&Giant);
+
+ hdtr_size = 0;
+
+ /*
+ * The descriptor must be a regular file and have a backing VM object.
+ */
+ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
+ goto done;
+ if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
+ error = EINVAL;
+ goto done;
+ }
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
+ goto done;
+ if (so->so_type != SOCK_STREAM) {
+ error = EINVAL;
+ goto done;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+ if (uap->offset < 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * If specified, get the pointer to the sf_hdtr struct for
+ * any headers/trailers.
+ */
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error)
+ goto done;
+ /*
+ * Send any headers. Wimp out and use writev(2).
+ */
+ if (hdtr.headers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.headers;
+ nuap.iovcnt = hdtr.hdr_cnt;
+ error = writev(td, &nuap);
+ if (error)
+ goto done;
+ hdtr_size += td->td_retval[0];
+ }
+ }
+
+ /*
+ * Protect against multiple writers to the socket.
+ */
+ (void) sblock(&so->so_snd, M_WAITOK);
+
+ /*
+ * Loop through the pages in the file, starting with the requested
+ * offset. Get a file page (do I/O if necessary), map the file page
+ * into an sf_buf, attach an mbuf header to the sf_buf, and queue
+ * it on the socket.
+ */
+ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+
+ pindex = OFF_TO_IDX(off);
+retry_lookup:
+ /*
+ * Calculate the amount to transfer. Not to exceed a page,
+ * the EOF, or the passed in nbytes.
+ */
+ xfsize = obj->un_pager.vnp.vnp_size - off;
+ if (xfsize > PAGE_SIZE)
+ xfsize = PAGE_SIZE;
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ if (PAGE_SIZE - pgoff < xfsize)
+ xfsize = PAGE_SIZE - pgoff;
+ if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
+ xfsize = uap->nbytes - sbytes;
+ if (xfsize <= 0)
+ break;
+ /*
+ * Optimize the non-blocking case by looking at the socket space
+ * before going to the extra work of constituting the sf_buf.
+ */
+ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
+ if (so->so_state & SS_CANTSENDMORE)
+ error = EPIPE;
+ else
+ error = EAGAIN;
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ /*
+ * Attempt to look up the page.
+ *
+ * Allocate if not found
+ *
+ * Wait and loop if busy.
+ */
+ pg = vm_page_lookup(obj, pindex);
+
+ if (pg == NULL) {
+ pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
+ if (pg == NULL) {
+ VM_WAIT;
+ goto retry_lookup;
+ }
+ vm_page_wakeup(pg);
+ } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
+ goto retry_lookup;
+ }
+
+ /*
+ * Wire the page so it does not get ripped out from under
+ * us.
+ */
+
+ vm_page_wire(pg);
+
+ /*
+ * If page is not valid for what we need, initiate I/O
+ */
+
+ if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
+ int bsize;
+
+ /*
+ * Ensure that our page is still around when the I/O
+ * completes.
+ */
+ vm_page_io_start(pg);
+
+ /*
+ * Get the page from backing store.
+ */
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
+ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
+ trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+ IO_VMIO | ((MAXBSIZE / bsize) << 16),
+ td->td_ucred, NULL, td);
+ VOP_UNLOCK(vp, 0, td);
+ vm_page_flag_clear(pg, PG_ZERO);
+ vm_page_io_finish(pg);
+ if (error) {
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about this page.
+ * If not and it is not valid, then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ pg->busy == 0 && !(pg->flags & PG_BUSY) &&
+ pg->hold_count == 0) {
+ vm_page_busy(pg);
+ vm_page_free(pg);
+ }
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ }
+
+
+ /*
+ * Get a sendfile buf. We usually wait as long as necessary,
+ * but this wait can be interrupted.
+ */
+ if ((sf = sf_buf_alloc()) == NULL) {
+ vm_page_unwire(pg, 0);
+ if (pg->wire_count == 0 && pg->object == NULL)
+ vm_page_free(pg);
+ sbunlock(&so->so_snd);
+ error = EINTR;
+ goto done;
+ }
+
+ /*
+ * Allocate a kernel virtual page and insert the physical page
+ * into it.
+ */
+ sf->m = pg;
+ pmap_qenter(sf->kva, &pg, 1);
+ /*
+ * Get an mbuf header and set it up as having external storage.
+ */
+ MGETHDR(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ sf_buf_free((void *)sf->kva, NULL);
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ /*
+ * Setup external storage for mbuf.
+ */
+ MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
+ EXT_SFBUF);
+ m->m_data = (char *) sf->kva + pgoff;
+ m->m_pkthdr.len = m->m_len = xfsize;
+ /*
+ * Add the buffer to the socket buffer chain.
+ */
+ s = splnet();
+retry_space:
+ /*
+ * Make sure that the socket is still able to take more data.
+ * CANTSENDMORE being true usually means that the connection
+ * was closed. so_error is true when an error was sensed after
+ * a previous send.
+ * The state is checked after the page mapping and buffer
+ * allocation above since those operations may block and make
+ * any socket checks stale. From this point forward, nothing
+ * blocks before the pru_send (or more accurately, any blocking
+ * results in a loop back to here to re-check).
+ */
+ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ } else {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ /*
+ * Wait for socket space to become available. We do this just
+ * after checking the connection state above in order to avoid
+ * a race condition with sbwait().
+ */
+ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
+ if (so->so_state & SS_NBIO) {
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ error = EAGAIN;
+ goto done;
+ }
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
+ */
+ if (error) {
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ goto retry_space;
+ }
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
+ splx(s);
+ if (error) {
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ }
+ sbunlock(&so->so_snd);
+
+ /*
+ * Send trailers. Wimp out and use writev(2).
+ */
+ if (uap->hdtr != NULL && hdtr.trailers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.trailers;
+ nuap.iovcnt = hdtr.trl_cnt;
+ error = writev(td, &nuap);
+ if (error)
+ goto done;
+ hdtr_size += td->td_retval[0];
+ }
+
+done:
+ /*
+ * If there was no error we have to clear td->td_retval[0]
+ * because it may have been set by writev.
+ */
+ if (error == 0) {
+ td->td_retval[0] = 0;
+ }
+ if (uap->sbytes != NULL) {
+ sbytes += hdtr_size;
+ copyout(&sbytes, uap->sbytes, sizeof(off_t));
+ }
+ if (vp)
+ vrele(vp);
+ if (so)
+ fputsock(so);
+ mtx_unlock(&Giant);
+ return (error);
+}
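+
+/*
+ * Userland usage sketch for the interface implemented above (the file
+ * and socket descriptors are illustrative):
+ *
+ *	off_t sbytes;
+ *
+ *	if (sendfile(filefd, sockfd, 0, 0, NULL, &sbytes, 0) == -1)
+ *		err(1, "sendfile");
+ *	printf("sent %jd bytes\n", (intmax_t)sbytes);
+ */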
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
new file mode 100644
index 0000000..b227d91
--- /dev/null
+++ b/sys/kern/uipc_usrreq.c
@@ -0,0 +1,1503 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h> /* XXX must be before <sys/file.h> */
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/vnode.h>
+
+#include <vm/uma.h>
+
+static uma_zone_t unp_zone;
+static unp_gen_t unp_gencnt;
+static u_int unp_count;
+
+static struct unp_head unp_shead, unp_dhead;
+
+/*
+ * Unix communications domain.
+ *
+ * TODO:
+ * SEQPACKET, RDM
+ * rethink name space problems
+ * need a proper out-of-band
+ * lock pushdown
+ */
+static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
+static ino_t unp_ino; /* prototype for fake inode numbers */
+
+static int unp_attach(struct socket *);
+static void unp_detach(struct unpcb *);
+static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
+static int unp_connect(struct socket *,struct sockaddr *, struct thread *);
+static void unp_disconnect(struct unpcb *);
+static void unp_shutdown(struct unpcb *);
+static void unp_drop(struct unpcb *, int);
+static void unp_gc(void);
+static void unp_scan(struct mbuf *, void (*)(struct file *));
+static void unp_mark(struct file *);
+static void unp_discard(struct file *);
+static void unp_freerights(struct file **, int);
+static int unp_internalize(struct mbuf **, struct thread *);
+static int unp_listen(struct unpcb *, struct thread *);
+
+static int
+uipc_abort(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ unp_drop(unp, ECONNABORTED);
+ unp_detach(unp);
+ sotryfree(so);
+ return 0;
+}
+
+static int
+uipc_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ /*
+ * Pass back name of connected socket,
+ * if it was bound and we are still connected
+ * (our peer may have closed already!).
+ */
+ if (unp->unp_conn && unp->unp_conn->unp_addr) {
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
+ 1);
+ } else {
+ *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
+ }
+ return 0;
+}
+
+static int
+uipc_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp != 0)
+ return EISCONN;
+ return unp_attach(so);
+}
+
+static int
+uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ return unp_bind(unp, nam, td);
+}
+
+static int
+uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ return unp_connect(so, nam, curthread);
+}
+
+static int
+uipc_connect2(struct socket *so1, struct socket *so2)
+{
+ struct unpcb *unp = sotounpcb(so1);
+
+ if (unp == 0)
+ return EINVAL;
+
+ return unp_connect2(so1, so2);
+}
+
+/* control is EOPNOTSUPP */
+
+static int
+uipc_detach(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ unp_detach(unp);
+ return 0;
+}
+
+static int
+uipc_disconnect(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ unp_disconnect(unp);
+ return 0;
+}
+
+static int
+uipc_listen(struct socket *so, struct thread *td)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0 || unp->unp_vnode == 0)
+ return EINVAL;
+ return unp_listen(unp, td);
+}
+
+static int
+uipc_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ if (unp->unp_conn && unp->unp_conn->unp_addr)
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
+ 1);
+ return 0;
+}
+
+static int
+uipc_rcvd(struct socket *so, int flags)
+{
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+ u_long newhiwat;
+
+ if (unp == 0)
+ return EINVAL;
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ panic("uipc_rcvd DGRAM?");
+ /*NOTREACHED*/
+
+ case SOCK_STREAM:
+ if (unp->unp_conn == 0)
+ break;
+ so2 = unp->unp_conn->unp_socket;
+ /*
+ * Adjust backpressure on sender
+		 * and wake up anyone waiting to write.
+ */
+ so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
+ unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
+ newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
+ so->so_rcv.sb_cc;
+ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ unp->unp_cc = so->so_rcv.sb_cc;
+ sowwakeup(so2);
+ break;
+
+ default:
+ panic("uipc_rcvd unknown socktype");
+ }
+ return 0;
+}
+
+/* pru_rcvoob is EOPNOTSUPP */
+
+static int
+uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ int error = 0;
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+ u_long newhiwat;
+
+ if (unp == 0) {
+ error = EINVAL;
+ goto release;
+ }
+ if (flags & PRUS_OOB) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+
+ if (control && (error = unp_internalize(&control, td)))
+ goto release;
+
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ {
+ struct sockaddr *from;
+
+ if (nam) {
+ if (unp->unp_conn) {
+ error = EISCONN;
+ break;
+ }
+ error = unp_connect(so, nam, td);
+ if (error)
+ break;
+ } else {
+ if (unp->unp_conn == 0) {
+ error = ENOTCONN;
+ break;
+ }
+ }
+ so2 = unp->unp_conn->unp_socket;
+ if (unp->unp_addr)
+ from = (struct sockaddr *)unp->unp_addr;
+ else
+ from = &sun_noname;
+ if (sbappendaddr(&so2->so_rcv, from, m, control)) {
+ sorwakeup(so2);
+ m = 0;
+ control = 0;
+ } else
+ error = ENOBUFS;
+ if (nam)
+ unp_disconnect(unp);
+ break;
+ }
+
+ case SOCK_STREAM:
+ /* Connect if not connected yet. */
+ /*
+ * Note: A better implementation would complain
+ * if not equal to the peer's address.
+ */
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam) {
+ error = unp_connect(so, nam, td);
+ if (error)
+ break; /* XXX */
+ } else {
+ error = ENOTCONN;
+ break;
+ }
+ }
+
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+ if (unp->unp_conn == 0)
+ panic("uipc_send connected but no connection?");
+ so2 = unp->unp_conn->unp_socket;
+ /*
+ * Send to paired receive port, and then reduce
+ * send buffer hiwater marks to maintain backpressure.
+ * Wake up readers.
+ */
+ if (control) {
+ if (sbappendcontrol(&so2->so_rcv, m, control))
+ control = 0;
+ } else
+ sbappend(&so2->so_rcv, m);
+ so->so_snd.sb_mbmax -=
+ so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
+ unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+ newhiwat = so->so_snd.sb_hiwat -
+ (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
+ sorwakeup(so2);
+ m = 0;
+ break;
+
+ default:
+ panic("uipc_send unknown socktype");
+ }
+
+ /*
+ * SEND_EOF is equivalent to a SEND followed by
+ * a SHUTDOWN.
+ */
+ if (flags & PRUS_EOF) {
+ socantsendmore(so);
+ unp_shutdown(unp);
+ }
+
+ if (control && error != 0)
+ unp_dispose(control);
+
+release:
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ return error;
+}
+
+static int
+uipc_sense(struct socket *so, struct stat *sb)
+{
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+
+ if (unp == 0)
+ return EINVAL;
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
+ so2 = unp->unp_conn->unp_socket;
+ sb->st_blksize += so2->so_rcv.sb_cc;
+ }
+ sb->st_dev = NOUDEV;
+ if (unp->unp_ino == 0)
+ unp->unp_ino = unp_ino++;
+ sb->st_ino = unp->unp_ino;
+ return (0);
+}
+
+static int
+uipc_shutdown(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ socantsendmore(so);
+ unp_shutdown(unp);
+ return 0;
+}
+
+static int
+uipc_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ if (unp->unp_addr)
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1);
+ else
+ *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
+ return 0;
+}
+
+struct pr_usrreqs uipc_usrreqs = {
+ uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
+ uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
+ uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
+ uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
+ sosend, soreceive, sopoll
+};
+
+int
+uipc_ctloutput(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ struct unpcb *unp = sotounpcb(so);
+ int error;
+
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case LOCAL_PEERCRED:
+ if (unp->unp_flags & UNP_HAVEPC)
+ error = sooptcopyout(sopt, &unp->unp_peercred,
+ sizeof(unp->unp_peercred));
+ else {
+ if (so->so_type == SOCK_STREAM)
+ error = ENOTCONN;
+ else
+ error = EINVAL;
+ }
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ break;
+ case SOPT_SET:
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
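+
+/*
+ * Illustrative only (not part of this file's logic): a minimal userland
+ * sketch of consuming the LOCAL_PEERCRED option handled above.  It assumes
+ * the usual <sys/ucred.h> struct xucred layout and that "s" is a connected
+ * PF_LOCAL stream socket.
+ *
+ *	struct xucred xuc;
+ *	socklen_t len = sizeof(xuc);
+ *
+ *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xuc, &len) == 0 &&
+ *	    xuc.cr_version == XUCRED_VERSION)
+ *		printf("peer euid %d\n", (int)xuc.cr_uid);
+ */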
+
+/*
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering
+ * for stream sockets, although the total in flight between a sender and
+ * receiver is actually only PIPSIZ: stream data is appended directly to the
+ * peer's receive buffer, and the sender's high-water mark is reduced to match.
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace. Their recvspace should
+ * be large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
+
+static int unp_rights; /* file descriptors in flight */
+
+SYSCTL_DECL(_net_local_stream);
+SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "");
+SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "");
+SYSCTL_DECL(_net_local_dgram);
+SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "");
+SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "");
+SYSCTL_DECL(_net_local);
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
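+
+/*
+ * The knobs above appear as net.local.stream.{sendspace,recvspace},
+ * net.local.dgram.{maxdgram,recvspace} and the read-only net.local.inflight.
+ * A hypothetical tuning example (values are examples only) with sysctl(8):
+ *
+ *	sysctl net.local.stream.sendspace=16384
+ *	sysctl net.local.stream.recvspace=16384
+ */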
+
+static int
+unp_attach(so)
+ struct socket *so;
+{
+ register struct unpcb *unp;
+ int error;
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ switch (so->so_type) {
+
+ case SOCK_STREAM:
+ error = soreserve(so, unpst_sendspace, unpst_recvspace);
+ break;
+
+ case SOCK_DGRAM:
+ error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
+ break;
+
+ default:
+ panic("unp_attach");
+ }
+ if (error)
+ return (error);
+ }
+ unp = uma_zalloc(unp_zone, M_WAITOK);
+ if (unp == NULL)
+ return (ENOBUFS);
+ bzero(unp, sizeof *unp);
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count++;
+ LIST_INIT(&unp->unp_refs);
+ unp->unp_socket = so;
+ FILEDESC_LOCK(curproc->p_fd);
+ unp->unp_rvnode = curthread->td_proc->p_fd->fd_rdir;
+ FILEDESC_UNLOCK(curproc->p_fd);
+ LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
+ : &unp_shead, unp, unp_link);
+ so->so_pcb = unp;
+ return (0);
+}
+
+static void
+unp_detach(unp)
+ register struct unpcb *unp;
+{
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ --unp_count;
+ if (unp->unp_vnode) {
+ unp->unp_vnode->v_socket = 0;
+ vrele(unp->unp_vnode);
+ unp->unp_vnode = 0;
+ }
+ if (unp->unp_conn)
+ unp_disconnect(unp);
+ while (!LIST_EMPTY(&unp->unp_refs))
+ unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET);
+ soisdisconnected(unp->unp_socket);
+ unp->unp_socket->so_pcb = 0;
+ if (unp_rights) {
+ /*
+ * Normally the receive buffer is flushed later,
+ * in sofree, but if our receive buffer holds references
+ * to descriptors that are now garbage, we will dispose
+ * of those descriptor references after the garbage collector
+ * gets them (resulting in a "panic: closef: count < 0").
+ */
+ sorflush(unp->unp_socket);
+ unp_gc();
+ }
+ if (unp->unp_addr)
+ FREE(unp->unp_addr, M_SONAME);
+ uma_zfree(unp_zone, unp);
+}
+
+static int
+unp_bind(unp, nam, td)
+ struct unpcb *unp;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ int error, namelen;
+ struct nameidata nd;
+ char *buf;
+
+ if (unp->unp_vnode != NULL)
+ return (EINVAL);
+ namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+ if (namelen <= 0)
+ return EINVAL;
+ buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK);
+ strncpy(buf, soun->sun_path, namelen);
+ buf[namelen] = 0; /* null-terminate the string */
+restart:
+ NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
+ buf, td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+ error = namei(&nd);
+ if (error) {
+ free(buf, M_TEMP);
+ return (error);
+ }
+ vp = nd.ni_vp;
+ if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULL) {
+ vrele(vp);
+ free(buf, M_TEMP);
+ return (EADDRINUSE);
+ }
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error) {
+ free(buf, M_TEMP);
+ return (error);
+ }
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VSOCK;
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error) {
+ free(buf, M_TEMP);
+ return (error);
+ }
+ vp = nd.ni_vp;
+ vp->v_socket = unp->unp_socket;
+ unp->unp_vnode = vp;
+ unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ free(buf, M_TEMP);
+ return (0);
+}
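+
+/*
+ * Illustrative only: roughly how userland exercises the bind path above.
+ * The path name is made up; SUN_LEN() yields the sun_len/namelen that
+ * unp_bind() uses to recover the path length.
+ *
+ *	struct sockaddr_un sun;
+ *	int s = socket(PF_LOCAL, SOCK_STREAM, 0);
+ *
+ *	memset(&sun, 0, sizeof(sun));
+ *	sun.sun_family = AF_LOCAL;
+ *	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));
+ *	sun.sun_len = SUN_LEN(&sun);
+ *	if (bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
+ *		err(1, "bind");
+ */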
+
+static int
+unp_connect(so, nam, td)
+ struct socket *so;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ register struct vnode *vp;
+ register struct socket *so2, *so3;
+ struct unpcb *unp, *unp2, *unp3;
+ int error, len;
+ struct nameidata nd;
+ char buf[SOCK_MAXADDRLEN];
+
+ len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
+ if (len <= 0)
+ return EINVAL;
+ strncpy(buf, soun->sun_path, len);
+ buf[len] = 0;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VSOCK) {
+ error = ENOTSOCK;
+ goto bad;
+ }
+ error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
+ if (error)
+ goto bad;
+ so2 = vp->v_socket;
+ if (so2 == 0) {
+ error = ECONNREFUSED;
+ goto bad;
+ }
+ if (so->so_type != so2->so_type) {
+ error = EPROTOTYPE;
+ goto bad;
+ }
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+ if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
+ (so3 = sonewconn(so2, 0)) == 0) {
+ error = ECONNREFUSED;
+ goto bad;
+ }
+ unp = sotounpcb(so);
+ unp2 = sotounpcb(so2);
+ unp3 = sotounpcb(so3);
+ if (unp2->unp_addr)
+ unp3->unp_addr = (struct sockaddr_un *)
+ dup_sockaddr((struct sockaddr *)
+ unp2->unp_addr, 1);
+
+ /*
+ * unp_peercred management:
+ *
+ * The connecter's (client's) credentials are copied
+ * from its process structure at the time of connect()
+ * (which is now).
+ */
+ cru2x(td->td_ucred, &unp3->unp_peercred);
+ unp3->unp_flags |= UNP_HAVEPC;
+ /*
+ * The receiver's (server's) credentials are copied
+ * from the unp_peercred member of socket on which the
+ * former called listen(); unp_listen() cached that
+ * process's credentials at that time so we can use
+ * them now.
+ */
+ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
+ ("unp_connect: listener without cached peercred"));
+ memcpy(&unp->unp_peercred, &unp2->unp_peercred,
+ sizeof(unp->unp_peercred));
+ unp->unp_flags |= UNP_HAVEPC;
+
+ so2 = so3;
+ }
+ error = unp_connect2(so, so2);
+bad:
+ vput(vp);
+ return (error);
+}
+
+int
+unp_connect2(so, so2)
+ register struct socket *so;
+ register struct socket *so2;
+{
+ register struct unpcb *unp = sotounpcb(so);
+ register struct unpcb *unp2;
+
+ if (so2->so_type != so->so_type)
+ return (EPROTOTYPE);
+ unp2 = sotounpcb(so2);
+ unp->unp_conn = unp2;
+ switch (so->so_type) {
+
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+ soisconnected(so);
+ break;
+
+ case SOCK_STREAM:
+ unp2->unp_conn = unp;
+ soisconnected(so);
+ soisconnected(so2);
+ break;
+
+ default:
+ panic("unp_connect2");
+ }
+ return (0);
+}
+
+static void
+unp_disconnect(unp)
+ struct unpcb *unp;
+{
+ register struct unpcb *unp2 = unp->unp_conn;
+
+ if (unp2 == 0)
+ return;
+ unp->unp_conn = 0;
+ switch (unp->unp_socket->so_type) {
+
+ case SOCK_DGRAM:
+ LIST_REMOVE(unp, unp_reflink);
+ unp->unp_socket->so_state &= ~SS_ISCONNECTED;
+ break;
+
+ case SOCK_STREAM:
+ soisdisconnected(unp->unp_socket);
+ unp2->unp_conn = 0;
+ soisdisconnected(unp2->unp_socket);
+ break;
+ }
+}
+
+#ifdef notdef
+void
+unp_abort(unp)
+ struct unpcb *unp;
+{
+
+ unp_detach(unp);
+}
+#endif
+
+static int
+unp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ struct unpcb *unp, **unp_list;
+ unp_gen_t gencnt;
+ struct xunpgen *xug;
+ struct unp_head *head;
+ struct xunpcb *xu;
+
+ head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
+
+ /*
+	 * The process of preparing the PCB list is too time-consuming and
+	 * resource-intensive to repeat twice on every request (once to size
+	 * the output, once to fill it).
+ */
+ if (req->oldptr == 0) {
+ n = unp_count;
+ req->oldidx = 2 * (sizeof *xug)
+ + (n + n/8) * sizeof(struct xunpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
+ gencnt = unp_gencnt;
+ n = unp_count;
+
+ xug->xug_len = sizeof *xug;
+ xug->xug_count = n;
+ xug->xug_gen = gencnt;
+ xug->xug_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ if (error) {
+ free(xug, M_TEMP);
+ return error;
+ }
+
+ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
+
+ for (unp = LIST_FIRST(head), i = 0; unp && i < n;
+ unp = LIST_NEXT(unp, unp_link)) {
+ if (unp->unp_gencnt <= gencnt) {
+ if (cr_cansee(req->td->td_ucred,
+ unp->unp_socket->so_cred))
+ continue;
+ unp_list[i++] = unp;
+ }
+ }
+ n = i; /* in case we lost some during malloc */
+
+ error = 0;
+ xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
+ for (i = 0; i < n; i++) {
+ unp = unp_list[i];
+ if (unp->unp_gencnt <= gencnt) {
+ xu->xu_len = sizeof *xu;
+ xu->xu_unpp = unp;
+ /*
+ * XXX - need more locking here to protect against
+ * connect/disconnect races for SMP.
+ */
+ if (unp->unp_addr)
+ bcopy(unp->unp_addr, &xu->xu_addr,
+ unp->unp_addr->sun_len);
+ if (unp->unp_conn && unp->unp_conn->unp_addr)
+ bcopy(unp->unp_conn->unp_addr,
+ &xu->xu_caddr,
+ unp->unp_conn->unp_addr->sun_len);
+ bcopy(unp, &xu->xu_unp, sizeof *unp);
+ sotoxsocket(unp->unp_socket, &xu->xu_socket);
+ error = SYSCTL_OUT(req, xu, sizeof *xu);
+ }
+ }
+ free(xu, M_TEMP);
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ xug->xug_gen = unp_gencnt;
+ xug->xug_sogen = so_gencnt;
+ xug->xug_count = unp_count;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ }
+ free(unp_list, M_TEMP);
+ free(xug, M_TEMP);
+ return error;
+}
+
+SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
+ (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local datagram sockets");
+SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
+ (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local stream sockets");
+
+static void
+unp_shutdown(unp)
+ struct unpcb *unp;
+{
+ struct socket *so;
+
+ if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
+ (so = unp->unp_conn->unp_socket))
+ socantrcvmore(so);
+}
+
+static void
+unp_drop(unp, errno)
+ struct unpcb *unp;
+ int errno;
+{
+ struct socket *so = unp->unp_socket;
+
+ so->so_error = errno;
+ unp_disconnect(unp);
+}
+
+#ifdef notdef
+void
+unp_drain()
+{
+
+}
+#endif
+
+static void
+unp_freerights(rp, fdcount)
+ struct file **rp;
+ int fdcount;
+{
+ int i;
+ struct file *fp;
+
+ for (i = 0; i < fdcount; i++) {
+ fp = *rp;
+ /*
+		 * Zero the pointer before calling
+		 * unp_discard, since it may end up
+		 * in unp_gc().
+ */
+ *rp++ = 0;
+ unp_discard(fp);
+ }
+}
+
+int
+unp_externalize(control, controlp)
+ struct mbuf *control, **controlp;
+{
+ struct thread *td = curthread; /* XXX */
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ int i;
+ int *fdp;
+ struct file **rp;
+ struct file *fp;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, newfds;
+ int f;
+ u_int newlen;
+
+ error = 0;
+ if (controlp != NULL) /* controlp == NULL => free control messages */
+ *controlp = NULL;
+
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
+ error = EINVAL;
+ break;
+ }
+
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+
+ if (cm->cmsg_level == SOL_SOCKET
+ && cm->cmsg_type == SCM_RIGHTS) {
+ newfds = datalen / sizeof(struct file *);
+ rp = data;
+
+			/* If we're not outputting the descriptors, free them. */
+ if (error || controlp == NULL) {
+ unp_freerights(rp, newfds);
+ goto next;
+ }
+ FILEDESC_LOCK(td->td_proc->p_fd);
+			/* If the new FDs will not fit, free them. */
+ if (!fdavail(td, newfds)) {
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ error = EMSGSIZE;
+ unp_freerights(rp, newfds);
+ goto next;
+ }
+ /*
+			 * Now change each pointer to an fd in the global
+			 * file table into an integer that is the index of
+			 * the local fd table entry we set up to point to
+			 * the global one we are transferring.
+ */
+ newlen = newfds * sizeof(int);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ error = E2BIG;
+ unp_freerights(rp, newfds);
+ goto next;
+ }
+
+ fdp = (int *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ for (i = 0; i < newfds; i++) {
+ if (fdalloc(td, 0, &f))
+ panic("unp_externalize fdalloc failed");
+ fp = *rp++;
+ td->td_proc->p_fd->fd_ofiles[f] = fp;
+ FILE_LOCK(fp);
+ fp->f_msgcount--;
+ FILE_UNLOCK(fp);
+ unp_rights--;
+ *fdp++ = f;
+ }
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ } else { /* We can just copy anything else across */
+ if (error || controlp == NULL)
+ goto next;
+ *controlp = sbcreatecontrol(NULL, datalen,
+ cm->cmsg_type, cm->cmsg_level);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto next;
+ }
+ bcopy(data,
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
+ datalen);
+ }
+
+ controlp = &(*controlp)->m_next;
+
+next:
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+ m_freem(control);
+
+ return (error);
+}
+
+void
+unp_init(void)
+{
+ unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	if (unp_zone == NULL)
+		panic("unp_init");
+	uma_zone_set_max(unp_zone, nmbclusters);
+ LIST_INIT(&unp_dhead);
+ LIST_INIT(&unp_shead);
+}
+
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+
+static int
+unp_internalize(controlp, td)
+ struct mbuf **controlp;
+ struct thread *td;
+{
+ struct mbuf *control = *controlp;
+ struct proc *p = td->td_proc;
+ struct filedesc *fdescp = p->p_fd;
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ struct cmsgcred *cmcred;
+ struct file **rp;
+ struct file *fp;
+ struct timeval *tv;
+ int i, fd, *fdp;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, oldfds;
+ u_int newlen;
+
+ error = 0;
+ *controlp = NULL;
+
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
+ || cm->cmsg_len > clen) {
+ error = EINVAL;
+ goto out;
+ }
+
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+
+ switch (cm->cmsg_type) {
+ /*
+ * Fill in credential information.
+ */
+ case SCM_CREDS:
+ *controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
+ SCM_CREDS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+
+ cmcred = (struct cmsgcred *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ cmcred->cmcred_pid = p->p_pid;
+ cmcred->cmcred_uid = td->td_ucred->cr_ruid;
+ cmcred->cmcred_gid = td->td_ucred->cr_rgid;
+ cmcred->cmcred_euid = td->td_ucred->cr_uid;
+ cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
+ CMGROUP_MAX);
+ for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[i] =
+ td->td_ucred->cr_groups[i];
+ break;
+
+ case SCM_RIGHTS:
+ oldfds = datalen / sizeof (int);
+ /*
+			 * Check that all the FDs passed in refer to legal
+			 * files.  If not, reject the entire operation.
+ */
+ fdp = data;
+ FILEDESC_LOCK(fdescp);
+ for (i = 0; i < oldfds; i++) {
+ fd = *fdp++;
+ if ((unsigned)fd >= fdescp->fd_nfiles ||
+ fdescp->fd_ofiles[fd] == NULL) {
+ FILEDESC_UNLOCK(fdescp);
+ error = EBADF;
+ goto out;
+ }
+ }
+ /*
+ * Now replace the integer FDs with pointers to
+			 * the associated global file table entries.
+ */
+ newlen = oldfds * sizeof(struct file *);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_UNLOCK(fdescp);
+ error = E2BIG;
+ goto out;
+ }
+
+ fdp = data;
+ rp = (struct file **)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ for (i = 0; i < oldfds; i++) {
+ fp = fdescp->fd_ofiles[*fdp++];
+ *rp++ = fp;
+ FILE_LOCK(fp);
+ fp->f_count++;
+ fp->f_msgcount++;
+ FILE_UNLOCK(fp);
+ unp_rights++;
+ }
+ FILEDESC_UNLOCK(fdescp);
+ break;
+
+ case SCM_TIMESTAMP:
+ *controlp = sbcreatecontrol(NULL, sizeof(*tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ tv = (struct timeval *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ microtime(tv);
+ break;
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ controlp = &(*controlp)->m_next;
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+out:
+ m_freem(control);
+
+ return (error);
+}
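+
+/*
+ * Illustrative only: a minimal userland sketch of the SCM_RIGHTS message
+ * that unp_internalize() above converts into in-kernel file pointers.
+ * "s" and "fd_to_pass" are placeholders.
+ *
+ *	struct msghdr msg;
+ *	struct cmsghdr *cmsg;
+ *	struct iovec iov;
+ *	char byte = 0, buf[CMSG_SPACE(sizeof(int))];
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	iov.iov_base = &byte;
+ *	iov.iov_len = 1;
+ *	msg.msg_iov = &iov;
+ *	msg.msg_iovlen = 1;
+ *	msg.msg_control = buf;
+ *	msg.msg_controllen = sizeof(buf);
+ *	cmsg = CMSG_FIRSTHDR(&msg);
+ *	cmsg->cmsg_level = SOL_SOCKET;
+ *	cmsg->cmsg_type = SCM_RIGHTS;
+ *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
+ *	if (sendmsg(s, &msg, 0) == -1)
+ *		err(1, "sendmsg");
+ */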
+
+static int unp_defer, unp_gcing;
+
+static void
+unp_gc()
+{
+ register struct file *fp, *nextfp;
+ register struct socket *so;
+ struct file **extra_ref, **fpp;
+ int nunref, i;
+
+ if (unp_gcing)
+ return;
+ unp_gcing = 1;
+ unp_defer = 0;
+ /*
+	 * Before going through all this, set all FDs to
+	 * be NOT deferred and NOT externally accessible.
+ */
+ sx_slock(&filelist_lock);
+ LIST_FOREACH(fp, &filehead, f_list)
+ fp->f_gcflag &= ~(FMARK|FDEFER);
+ do {
+ LIST_FOREACH(fp, &filehead, f_list) {
+ FILE_LOCK(fp);
+ /*
+ * If the file is not open, skip it
+ */
+ if (fp->f_count == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If we already marked it as 'defer' in a
+			 * previous pass, then try to process it this time
+			 * and un-mark it.
+ */
+ if (fp->f_gcflag & FDEFER) {
+ fp->f_gcflag &= ~FDEFER;
+ unp_defer--;
+ } else {
+ /*
+				 * If it's not deferred, then check whether it's
+				 * already marked; if so, skip it.
+ */
+ if (fp->f_gcflag & FMARK) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If all references are from messages
+				 * in transit, then skip it; it's not
+ * externally accessible.
+ */
+ if (fp->f_count == fp->f_msgcount) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If it got this far then it must be
+ * externally accessible.
+ */
+ fp->f_gcflag |= FMARK;
+ }
+ /*
+			 * Either it was deferred, or it is externally
+			 * accessible and not already marked so.
+			 * Now check whether it is possibly one of OUR sockets.
+ */
+ if (fp->f_type != DTYPE_SOCKET ||
+ (so = (struct socket *)fp->f_data) == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ FILE_UNLOCK(fp);
+ if (so->so_proto->pr_domain != &localdomain ||
+ (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+ continue;
+#ifdef notdef
+ if (so->so_rcv.sb_flags & SB_LOCK) {
+ /*
+ * This is problematical; it's not clear
+ * we need to wait for the sockbuf to be
+ * unlocked (on a uniprocessor, at least),
+ * and it's also not clear what to do
+ * if sbwait returns an error due to receipt
+ * of a signal. If sbwait does return
+ * an error, we'll go into an infinite
+ * loop. Delete all of this for now.
+ */
+ (void) sbwait(&so->so_rcv);
+ goto restart;
+ }
+#endif
+ /*
+			 * So, OK, it's one of our sockets and it IS externally
+			 * accessible (or was deferred).  Now we look
+ * to see if we hold any file descriptors in its
+ * message buffers. Follow those links and mark them
+ * as accessible too.
+ */
+ unp_scan(so->so_rcv.sb_mb, unp_mark);
+ }
+ } while (unp_defer);
+ sx_sunlock(&filelist_lock);
+ /*
+ * We grab an extra reference to each of the file table entries
+ * that are not otherwise accessible and then free the rights
+ * that are stored in messages on them.
+ *
+	 * The bug in the original code is a little tricky, so I'll describe
+ * what's wrong with it here.
+ *
+	 * It is incorrect to simply unp_discard each entry f_msgcount
+ * times -- consider the case of sockets A and B that contain
+ * references to each other. On a last close of some other socket,
+ * we trigger a gc since the number of outstanding rights (unp_rights)
+	 * is non-zero.  If during the sweep phase the gc code unp_discards,
+ * we end up doing a (full) closef on the descriptor. A closef on A
+ * results in the following chain. Closef calls soo_close, which
+ * calls soclose. Soclose calls first (through the switch
+ * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
+ * returns because the previous instance had set unp_gcing, and
+ * we return all the way back to soclose, which marks the socket
+ * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
+ * to free up the rights that are queued in messages on the socket A,
+ * i.e., the reference on B. The sorflush calls via the dom_dispose
+ * switch unp_dispose, which unp_scans with unp_discard. This second
+ * instance of unp_discard just calls closef on B.
+ *
+ * Well, a similar chain occurs on B, resulting in a sorflush on B,
+ * which results in another closef on A. Unfortunately, A is already
+ * being closed, and the descriptor has already been marked with
+ * SS_NOFDREF, and soclose panics at this point.
+ *
+ * Here, we first take an extra reference to each inaccessible
+ * descriptor. Then, we call sorflush ourself, since we know
+ * it is a Unix domain socket anyhow. After we destroy all the
+ * rights carried in messages, we do a last closef to get rid
+ * of our extra reference. This is the last close, and the
+ * unp_detach etc will shut down the socket.
+ *
+ * 91/09/19, bsy@cs.cmu.edu
+ */
+ extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK);
+ sx_slock(&filelist_lock);
+ for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
+ fp = nextfp) {
+ nextfp = LIST_NEXT(fp, f_list);
+ FILE_LOCK(fp);
+ /*
+ * If it's not open, skip it
+ */
+ if (fp->f_count == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+		 * If all refs are from msgs, and it's not marked accessible,
+		 * then it must be referenced from some unreachable cycle
+		 * of (shut-down) FDs, so include it in our
+		 * list of FDs to remove.
+ */
+ if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
+ *fpp++ = fp;
+ nunref++;
+ fp->f_count++;
+ }
+ FILE_UNLOCK(fp);
+ }
+ sx_sunlock(&filelist_lock);
+ /*
+	 * For each FD on our hit list, do the following two things:
+ */
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
+ struct file *tfp = *fpp;
+ FILE_LOCK(tfp);
+ if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) {
+ FILE_UNLOCK(tfp);
+ sorflush((struct socket *)(tfp->f_data));
+ } else
+ FILE_UNLOCK(tfp);
+ }
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
+ closef(*fpp, (struct thread *) NULL);
+ free(extra_ref, M_TEMP);
+ unp_gcing = 0;
+}
+
+void
+unp_dispose(m)
+ struct mbuf *m;
+{
+
+ if (m)
+ unp_scan(m, unp_discard);
+}
+
+static int
+unp_listen(unp, td)
+ struct unpcb *unp;
+ struct thread *td;
+{
+
+ cru2x(td->td_ucred, &unp->unp_peercred);
+ unp->unp_flags |= UNP_HAVEPCCACHED;
+ return (0);
+}
+
+static void
+unp_scan(m0, op)
+ register struct mbuf *m0;
+ void (*op)(struct file *);
+{
+ struct mbuf *m;
+ struct file **rp;
+ struct cmsghdr *cm;
+ void *data;
+ int i;
+ socklen_t clen, datalen;
+ int qfds;
+
+ while (m0) {
+ for (m = m0; m; m = m->m_next) {
+ if (m->m_type != MT_CONTROL)
+ continue;
+
+ cm = mtod(m, struct cmsghdr *);
+ clen = m->m_len;
+
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen)
+ break;
+
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len
+ - (caddr_t)data;
+
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_RIGHTS) {
+ qfds = datalen / sizeof (struct file *);
+ rp = data;
+ for (i = 0; i < qfds; i++)
+ (*op)(*rp++);
+ }
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+ }
+ m0 = m0->m_act;
+ }
+}
+
+static void
+unp_mark(fp)
+ struct file *fp;
+{
+ if (fp->f_gcflag & FMARK)
+ return;
+ unp_defer++;
+ fp->f_gcflag |= (FMARK|FDEFER);
+}
+
+static void
+unp_discard(fp)
+ struct file *fp;
+{
+ FILE_LOCK(fp);
+ fp->f_msgcount--;
+ unp_rights--;
+ FILE_UNLOCK(fp);
+ (void) closef(fp, (struct thread *)NULL);
+}
diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c
new file mode 100644
index 0000000..70be0ec
--- /dev/null
+++ b/sys/kern/vfs_acl.c
@@ -0,0 +1,830 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ * Support for POSIX.1e access control lists.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+MALLOC_DEFINE(M_ACL, "acl", "access control list");
+
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics.
+ * Return 0 on success, else an errno value. Should be merged into
+ * vaccess() eventually.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ mode_t dac_granted;
+ mode_t cap_granted;
+ mode_t acl_mask_granted;
+ int group_matched, i;
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt
+ * to use privileges granted via cap_granted. In some cases,
+ * which privileges to use may be ambiguous due to "best match",
+ * in which case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply until we've found
+ * a DAC entry that matches but has failed to allow access.
+ */
+#ifndef CAPABILITIES
+ if (suser_cred(cred, PRISON_ROOT) == 0)
+ cap_granted = (VEXEC | VREAD | VWRITE | VADMIN);
+ else
+ cap_granted = 0;
+#else
+ cap_granted = 0;
+
+ if (type == VDIR) {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_READ_SEARCH, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ } else {
+ if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
+ CAP_DAC_EXECUTE, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ }
+
+ if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH,
+ PRISON_ROOT))
+ cap_granted |= VREAD;
+
+ if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE,
+ PRISON_ROOT))
+ cap_granted |= VWRITE;
+
+ if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER,
+ PRISON_ROOT))
+ cap_granted |= VADMIN;
+#endif /* CAPABILITIES */
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK
+ * and ACL_OTHER entries, preventing some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) ==
+ acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access
+ * ACL. If it doesn't, then generate a serious failure. For now,
+ * this means a debugging message and EPERM, but in the future
+ * should probably be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields
+ * are masked by an ACL_MASK entry, if any. As such, first identify
+ * the ACL_MASK field, then iterate through identifying potential
+ * user matches, then group matches. If there is no ACL_MASK,
+ * assume that the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= VWRITE;
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE;
+
+ /*
+ * Iterate through user ACL entries. Do checks twice, first
+ * without privilege, and then if a match is found but failed,
+ * a second time with privilege.
+ */
+
+ /*
+ * Check ACL_USER ACL entries.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a
+ * "best" match. Iterate across, testing each potential group
+ * match. Make sure we keep track of whether we found a match
+ * or not, so that we know if we should try again with any
+ * available privilege, or if we should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via
+ * pure DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+ dac_granted &= acl_mask_granted;
+
+ if ((acc_mode & (dac_granted | cap_granted)) !=
+ acc_mode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= VWRITE;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+ if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an
+ * inode with a mode_t field, this routine converts a mode_t entry
+ * to an acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
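+
+/*
+ * For example (illustrative), ACL_USER_OBJ rw-, ACL_GROUP_OBJ r-- and
+ * ACL_OTHER r-- combine to mode 0644 here, mirroring what
+ * acl_posix1e_mode_to_perm() produces in the other direction.
+ */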
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an
+ * implementing filesystem to determine if it should accept this and
+ * rely on the POSIX.1e ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ * Verify all ae_tag entries are understood by this implementation.
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
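+
+/*
+ * For illustration, an access ACL that passes the checks above, in the
+ * usual long text form (the uid is made up):
+ *
+ *	user::rw-		ACL_USER_OBJ
+ *	user:1001:rw-		ACL_USER
+ *	group::r--		ACL_GROUP_OBJ
+ *	mask::rw-		ACL_MASK (required once ACL_USER/ACL_GROUP appear)
+ *	other::r--		ACL_OTHER
+ */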
+
+/*
+ * These calls wrap the real vnode operations, and are called by the
+ * syscall code once the syscall has converted the path or file
+ * descriptor to a vnode (unlocked). The aclp pointer is assumed
+ * still to point to userland, so this should not be consumed within
+ * the kernel except by syscall code. Other code should directly
+ * invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernacl;
+ struct mount *mp;
+ int error;
+
+ error = copyin(aclp, &inkernacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return(error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error == 0)
+ error = copyout(&inkernelacl, aclp, sizeof(struct acl));
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETACL(vp, type, NULL, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl inkernelacl;
+ int error;
+
+ error = copyin(aclp, &inkernelacl, sizeof(struct acl));
+ if (error)
+ return(error);
+ error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
+ return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.
+ * Don't need to lock, as the vacl_ code will get/release any locks
+ * required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, SCARG(uap, type));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, delete an ACL from it.
+ *
+ * MPSAFE
+ */
+int
+__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ mtx_lock(&Giant);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type),
+ SCARG(uap, aclp));
+ NDFREE(&nd, 0);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it
+ *
+ * MPSAFE
+ */
+int
+__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ mtx_lock(&Giant);
+ error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, (struct vnode *)fp->f_data,
+ SCARG(uap, type), SCARG(uap, aclp));
+ fdrop(fp, td);
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
new file mode 100644
index 0000000..891f272
--- /dev/null
+++ b/sys/kern/vfs_aio.c
@@ -0,0 +1,2307 @@
+/*
+ * Copyright (c) 1997 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. John S. Dyson's name may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * DISCLAIMER: This code isn't warranted to do anything useful. Anything
+ * bad that happens because of using this software isn't the responsibility
+ * of the author. This software is distributed AS-IS.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This file contains support for the POSIX 1003.1B AIO/LIO facility.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/unistd.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/protosw.h>
+#include <sys/socketvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <sys/aio.h>
+
+#include <machine/limits.h>
+
+#include "opt_vfs_aio.h"
+
+/*
+ * Counter for allocating reference ids to new jobs. Wrapped to 1 on
+ * overflow.
+ */
+static long jobrefid;
+
+#define JOBST_NULL 0x0
+#define JOBST_JOBQGLOBAL 0x2
+#define JOBST_JOBRUNNING 0x3
+#define JOBST_JOBFINISHED 0x4
+#define JOBST_JOBQBUF 0x5
+#define JOBST_JOBBFINISHED 0x6
+
+#ifndef MAX_AIO_PER_PROC
+#define MAX_AIO_PER_PROC 32
+#endif
+
+#ifndef MAX_AIO_QUEUE_PER_PROC
+#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef MAX_AIO_PROCS
+#define MAX_AIO_PROCS 32
+#endif
+
+#ifndef MAX_AIO_QUEUE
+#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef TARGET_AIO_PROCS
+#define TARGET_AIO_PROCS 4
+#endif
+
+#ifndef MAX_BUF_AIO
+#define MAX_BUF_AIO 16
+#endif
+
+#ifndef AIOD_TIMEOUT_DEFAULT
+#define AIOD_TIMEOUT_DEFAULT (10 * hz)
+#endif
+
+#ifndef AIOD_LIFETIME_DEFAULT
+#define AIOD_LIFETIME_DEFAULT (30 * hz)
+#endif
+
+SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
+
+static int max_aio_procs = MAX_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
+ CTLFLAG_RW, &max_aio_procs, 0,
+ "Maximum number of kernel threads to use for handling async IO ");
+
+static int num_aio_procs = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
+ CTLFLAG_RD, &num_aio_procs, 0,
+ "Number of presently active kernel threads for async IO");
+
+/*
+ * The code will adjust the actual number of AIO processes towards this
+ * number when it gets a chance.
+ */
+static int target_aio_procs = TARGET_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
+ 0, "Preferred number of ready kernel threads for async IO");
+
+static int max_queue_count = MAX_AIO_QUEUE;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
+ "Maximum number of aio requests to queue, globally");
+
+static int num_queue_count = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
+ "Number of queued aio requests");
+
+static int num_buf_aio = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
+ "Number of aio requests presently handled by the buf subsystem");
+
+/* Number of async I/O threads in the process of being started */
+/* XXX This should be local to _aio_aqueue() */
+static int num_aio_resv_start = 0;
+
+static int aiod_timeout;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
+ "Timeout value for synchronous aio operations");
+
+static int aiod_lifetime;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
+ "Maximum lifetime for idle aiod");
+
+static int unloadable = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of aio (not recommended)");
+
+
+static int max_aio_per_proc = MAX_AIO_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
+ 0, "Maximum active aio requests per process (stored in the process)");
+
+static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
+ &max_aio_queue_per_proc, 0,
+ "Maximum queued aio requests per process (stored in the process)");
+
+static int max_buf_aio = MAX_BUF_AIO;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
+ "Maximum buf aio requests per process (stored in the process)");
+
+struct aiocblist {
+ TAILQ_ENTRY(aiocblist) list; /* List of jobs */
+ TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */
+ int jobflags;
+ int jobstate;
+ int inputcharge;
+ int outputcharge;
+ struct callout_handle timeouthandle;
+ struct buf *bp; /* Buffer pointer */
+ struct proc *userproc; /* User process */ /* Not td! */
+ struct file *fd_file; /* Pointer to file structure */
+ struct aio_liojob *lio; /* Optional lio job */
+ struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */
+ struct klist klist; /* list of knotes */
+ struct aiocb uaiocb; /* Kernel I/O control block */
+};
+
+/* jobflags */
+#define AIOCBLIST_RUNDOWN 0x4
+#define AIOCBLIST_ASYNCFREE 0x8
+#define AIOCBLIST_DONE 0x10
+
+/*
+ * AIO process info
+ */
+#define AIOP_FREE 0x1 /* proc on free queue */
+#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
+
+struct aiothreadlist {
+ int aiothreadflags; /* AIO proc flags */
+ TAILQ_ENTRY(aiothreadlist) list; /* List of processes */
+ struct thread *aiothread; /* The AIO thread */
+};
+
+/*
+ * data-structure for lio signal management
+ */
+struct aio_liojob {
+ int lioj_flags;
+ int lioj_buffer_count;
+ int lioj_buffer_finished_count;
+ int lioj_queue_count;
+ int lioj_queue_finished_count;
+ struct sigevent lioj_signal; /* signal on all I/O done */
+ TAILQ_ENTRY(aio_liojob) lioj_list;
+ struct kaioinfo *lioj_ki;
+};
+#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
+#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+
+/*
+ * per process aio data structure
+ */
+struct kaioinfo {
+ int kaio_flags; /* per process kaio flags */
+ int kaio_maxactive_count; /* maximum number of AIOs */
+ int kaio_active_count; /* number of currently used AIOs */
+ int kaio_qallowed_count; /* maximum size of AIO queue */
+ int kaio_queue_count; /* size of AIO queue */
+ int kaio_ballowed_count; /* maximum number of buffers */
+ int kaio_queue_finished_count; /* number of daemon jobs finished */
+ int kaio_buffer_count; /* number of physio buffers */
+ int kaio_buffer_finished_count; /* count of I/O done */
+ struct proc *kaio_p; /* process that uses this kaio block */
+ TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
+ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
+};
+
+#define KAIO_RUNDOWN 0x1 /* process is being run down */
+#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
+
+static TAILQ_HEAD(,aiothreadlist) aio_activeproc; /* Active daemons */
+static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* Idle daemons */
+static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
+static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
+
+static void aio_init_aioinfo(struct proc *p);
+static void aio_onceonly(void);
+static int aio_free_entry(struct aiocblist *aiocbe);
+static void aio_process(struct aiocblist *aiocbe);
+static int aio_newproc(void);
+static int aio_aqueue(struct thread *td, struct aiocb *job, int type);
+static void aio_physwakeup(struct buf *bp);
+static void aio_proc_rundown(struct proc *p);
+static int aio_fphysio(struct aiocblist *aiocbe);
+static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
+static void aio_daemon(void *uproc);
+static void aio_swake_cb(struct socket *, struct sockbuf *);
+static int aio_unload(void);
+static void process_signal(void *aioj);
+static int filt_aioattach(struct knote *kn);
+static void filt_aiodetach(struct knote *kn);
+static int filt_aio(struct knote *kn, long hint);
+
+/*
+ * Zones for:
+ * kaio Per process async io info
+ * aiop async io thread data
+ * aiocb async io jobs
+ * aiol list io job pointer - internal to aio_suspend XXX
+ * aiolio list io jobs
+ */
+static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
+
+/* kqueue filters for aio */
+static struct filterops aio_filtops =
+ { 0, filt_aioattach, filt_aiodetach, filt_aio };
+
+/*
+ * Main operations function for use as a kernel module.
+ */
+static int
+aio_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ aio_onceonly();
+ break;
+ case MOD_UNLOAD:
+ error = aio_unload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t aio_mod = {
+ "aio",
+ &aio_modload,
+ NULL
+};
+
+SYSCALL_MODULE_HELPER(aio_return);
+SYSCALL_MODULE_HELPER(aio_suspend);
+SYSCALL_MODULE_HELPER(aio_cancel);
+SYSCALL_MODULE_HELPER(aio_error);
+SYSCALL_MODULE_HELPER(aio_read);
+SYSCALL_MODULE_HELPER(aio_write);
+SYSCALL_MODULE_HELPER(aio_waitcomplete);
+SYSCALL_MODULE_HELPER(lio_listio);
+
+DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(aio, 1);
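+
+/*
+ * Illustrative usage (assuming the standard kld tools): with the module
+ * glue above, the facility can be loaded at run time with
+ *
+ *	kldload aio
+ *
+ * and, if the vfs.aio.unloadable sysctl has been set, unloaded again with
+ * kldunload aio.
+ */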
+
+/*
+ * Startup initialization
+ */
+static void
+aio_onceonly(void)
+{
+
+ /* XXX: should probably just use so->callback */
+ aio_swake = &aio_swake_cb;
+ at_exit(aio_proc_rundown);
+ at_exec(aio_proc_rundown);
+ kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+ TAILQ_INIT(&aio_freeproc);
+ TAILQ_INIT(&aio_activeproc);
+ TAILQ_INIT(&aio_jobs);
+ TAILQ_INIT(&aio_bufjobs);
+ kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiod_timeout = AIOD_TIMEOUT_DEFAULT;
+ aiod_lifetime = AIOD_LIFETIME_DEFAULT;
+ jobrefid = 1;
+}
+
+/*
+ * Callback for unload of AIO when used as a module.
+ */
+static int
+aio_unload(void)
+{
+
+ /*
+ * XXX: no unloads by default; it's too dangerous.
+ * Perhaps we could do it if we locked out callers and then
+ * did an aio_proc_rundown() on each process.
+ */
+ if (!unloadable)
+ return (EOPNOTSUPP);
+
+ aio_swake = NULL;
+ rm_at_exit(aio_proc_rundown);
+ rm_at_exec(aio_proc_rundown);
+ kqueue_del_filteropts(EVFILT_AIO);
+ return (0);
+}
+
+/*
+ * Init the per-process aioinfo structure. The aioinfo limits are set
+ * per-process for user limit (resource) management.
+ */
+static void
+aio_init_aioinfo(struct proc *p)
+{
+ struct kaioinfo *ki;
+ if (p->p_aioinfo == NULL) {
+ ki = uma_zalloc(kaio_zone, M_WAITOK);
+ p->p_aioinfo = ki;
+ ki->kaio_flags = 0;
+ ki->kaio_maxactive_count = max_aio_per_proc;
+ ki->kaio_active_count = 0;
+ ki->kaio_qallowed_count = max_aio_queue_per_proc;
+ ki->kaio_queue_count = 0;
+ ki->kaio_ballowed_count = max_buf_aio;
+ ki->kaio_buffer_count = 0;
+ ki->kaio_buffer_finished_count = 0;
+ ki->kaio_p = p;
+ TAILQ_INIT(&ki->kaio_jobdone);
+ TAILQ_INIT(&ki->kaio_jobqueue);
+ TAILQ_INIT(&ki->kaio_bufdone);
+ TAILQ_INIT(&ki->kaio_bufqueue);
+ TAILQ_INIT(&ki->kaio_liojoblist);
+ TAILQ_INIT(&ki->kaio_sockqueue);
+ }
+
+ while (num_aio_procs < target_aio_procs)
+ aio_newproc();
+}
+
+/*
+ * Free a job entry. Wait for completion if it is currently active, but don't
+ * delay forever. If we delay, we return a flag that says that we have to
+ * restart the queue scan.
+ */
+static int
+aio_free_entry(struct aiocblist *aiocbe)
+{
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+ struct proc *p;
+ int error;
+ int s;
+
+ if (aiocbe->jobstate == JOBST_NULL)
+ panic("aio_free_entry: freeing already free job");
+
+ p = aiocbe->userproc;
+ ki = p->p_aioinfo;
+ lj = aiocbe->lio;
+ if (ki == NULL)
+ panic("aio_free_entry: missing p->p_aioinfo");
+
+ while (aiocbe->jobstate == JOBST_JOBRUNNING) {
+ if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
+ return 0;
+ aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
+ tsleep(aiocbe, PRIBIO, "jobwai", 0);
+ }
+ aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
+
+ if (aiocbe->bp == NULL) {
+ if (ki->kaio_queue_count <= 0)
+ panic("aio_free_entry: process queue size <= 0");
+ if (num_queue_count <= 0)
+ panic("aio_free_entry: system wide queue size <= 0");
+
+ if (lj) {
+ lj->lioj_queue_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ lj->lioj_queue_finished_count--;
+ }
+ ki->kaio_queue_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ ki->kaio_queue_finished_count--;
+ num_queue_count--;
+ } else {
+ if (lj) {
+ lj->lioj_buffer_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ lj->lioj_buffer_finished_count--;
+ }
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ ki->kaio_buffer_finished_count--;
+ ki->kaio_buffer_count--;
+ num_buf_aio--;
+ }
+
+ /* aiocbe is going away; we need to destroy any knotes. */
+ /*
+ * XXXKSE The thread here is used to eventually find the
+ * owning process again, but it is also used to do an fo_close,
+ * and that requires a thread. (Does it require the OWNING
+ * thread, or maybe the running thread?)
+ * There is a semantic problem here...
+ */
+ knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */
+
+ if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
+ && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(p);
+ }
+
+ if (aiocbe->jobstate == JOBST_JOBQBUF) {
+ if ((error = aio_fphysio(aiocbe)) != 0)
+ return error;
+ if (aiocbe->jobstate != JOBST_JOBBFINISHED)
+ panic("aio_free_entry: invalid physio finish-up state");
+ s = splbio();
+ TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
+ splx(s);
+ } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
+ s = splnet();
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
+ splx(s);
+ } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
+ TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
+ else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
+ s = splbio();
+ TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
+ splx(s);
+ if (aiocbe->bp) {
+ vunmapbuf(aiocbe->bp);
+ relpbuf(aiocbe->bp, NULL);
+ aiocbe->bp = NULL;
+ }
+ }
+ if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ uma_zfree(aiolio_zone, lj);
+ }
+ aiocbe->jobstate = JOBST_NULL;
+ untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
+ fdrop(aiocbe->fd_file, curthread);
+ uma_zfree(aiocb_zone, aiocbe);
+ return 0;
+}
+
+/*
+ * Rundown the jobs for a given process.
+ */
+static void
+aio_proc_rundown(struct proc *p)
+{
+ int s;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj, *ljn;
+ struct aiocblist *aiocbe, *aiocbn;
+ struct file *fp;
+ struct socket *so;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return;
+
+ ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
+ while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
+ ki->kaio_buffer_finished_count)) {
+ ki->kaio_flags |= KAIO_RUNDOWN;
+ if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
+ break;
+ }
+
+ /*
+ * Move any aio ops that are waiting on socket I/O to the normal job
+ * queues so they are cleaned up with any others.
+ */
+ s = splnet();
+ for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
+ aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ fp = aiocbe->fd_file;
+ if (fp != NULL) {
+ so = (struct socket *)fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
+ if (TAILQ_EMPTY(&so->so_aiojobq)) {
+ so->so_snd.sb_flags &= ~SB_AIO;
+ so->so_rcv.sb_flags &= ~SB_AIO;
+ }
+ }
+ TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
+ TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
+ TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
+ }
+ splx(s);
+
+restart1:
+ for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe))
+ goto restart1;
+ }
+
+restart2:
+ for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
+ aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe))
+ goto restart2;
+ }
+
+/*
+ * Note the use of lots of short splbio sections here, to avoid holding splbio
+ * across long chains of I/O. Probably unnecessary.
+ */
+restart3:
+ s = splbio();
+ while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ tsleep(p, PRIBIO, "aioprn", 0);
+ splx(s);
+ goto restart3;
+ }
+ splx(s);
+
+restart4:
+ s = splbio();
+ for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe)) {
+ splx(s);
+ goto restart4;
+ }
+ }
+ splx(s);
+
+ /*
+ * If we've slept, jobs might have moved from one queue to another.
+ * Retry rundown if we didn't manage to empty the queues.
+ */
+ if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
+ TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
+ TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
+ TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
+ goto restart1;
+
+ for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
+ ljn = TAILQ_NEXT(lj, lioj_list);
+ if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
+ 0)) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ uma_zfree(aiolio_zone, lj);
+ } else {
+#ifdef DIAGNOSTIC
+ printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
+ "QF:%d\n", lj->lioj_buffer_count,
+ lj->lioj_buffer_finished_count,
+ lj->lioj_queue_count,
+ lj->lioj_queue_finished_count);
+#endif
+ }
+ }
+
+ uma_zfree(kaio_zone, ki);
+ p->p_aioinfo = NULL;
+}
+
+/*
+ * Select a job to run (called by an AIO daemon).
+ */
+static struct aiocblist *
+aio_selectjob(struct aiothreadlist *aiop)
+{
+ int s;
+ struct aiocblist *aiocbe;
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ s = splnet();
+ for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
+ TAILQ_NEXT(aiocbe, list)) {
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+
+ if (ki->kaio_active_count < ki->kaio_maxactive_count) {
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ splx(s);
+ return aiocbe;
+ }
+ }
+ splx(s);
+
+ return NULL;
+}
+
+/*
+ * The AIO processing activity. This is the code that does the I/O request for
+ * the non-physio version of the operations. The normal vn operations are used,
+ * and this code should work in all instances for every type of file, including
+ * pipes, sockets, fifos, and regular files.
+ */
+static void
+aio_process(struct aiocblist *aiocbe)
+{
+ struct thread *td;
+ struct proc *mycp;
+ struct aiocb *cb;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ int cnt;
+ int error;
+ int oublock_st, oublock_end;
+ int inblock_st, inblock_end;
+
+ td = curthread;
+ mycp = td->td_proc;
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
+ aiov.iov_len = cb->aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = cb->aio_offset;
+ auio.uio_resid = cb->aio_nbytes;
+ cnt = cb->aio_nbytes;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+
+ inblock_st = mycp->p_stats->p_ru.ru_inblock;
+ oublock_st = mycp->p_stats->p_ru.ru_oublock;
+ /*
+ * _aio_aqueue() acquires a reference to the file that is
+ * released in aio_free_entry().
+ */
+ if (cb->aio_lio_opcode == LIO_READ) {
+ auio.uio_rw = UIO_READ;
+ error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ } else {
+ auio.uio_rw = UIO_WRITE;
+ error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ }
+ inblock_end = mycp->p_stats->p_ru.ru_inblock;
+ oublock_end = mycp->p_stats->p_ru.ru_oublock;
+
+ aiocbe->inputcharge = inblock_end - inblock_st;
+ aiocbe->outputcharge = oublock_end - oublock_st;
+
+ if ((error) && (auio.uio_resid != cnt)) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
+ PROC_LOCK(aiocbe->userproc);
+ psignal(aiocbe->userproc, SIGPIPE);
+ PROC_UNLOCK(aiocbe->userproc);
+ }
+ }
+
+ cnt -= auio.uio_resid;
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = cnt;
+}
+
+/*
+ * The AIO daemon. Most of the actual work is done in aio_process(),
+ * but the setup (and address space management) is done in this routine.
+ */
+static void
+aio_daemon(void *uproc)
+{
+ int s;
+ struct aio_liojob *lj;
+ struct aiocb *cb;
+ struct aiocblist *aiocbe;
+ struct aiothreadlist *aiop;
+ struct kaioinfo *ki;
+ struct proc *curcp, *mycp, *userp;
+ struct vmspace *myvm, *tmpvm;
+ struct thread *td = curthread;
+ struct pgrp *newpgrp;
+ struct session *newsess;
+
+ mtx_lock(&Giant);
+ /*
+ * Local copies of curproc (mycp) and vmspace (myvm).
+ */
+ mycp = td->td_proc;
+ myvm = mycp->p_vmspace;
+
+ if (mycp->p_textvp) {
+ vrele(mycp->p_textvp);
+ mycp->p_textvp = NULL;
+ }
+
+ /*
+ * Allocate and ready the aio control info. There is one aiop structure
+ * per daemon.
+ */
+ aiop = uma_zalloc(aiop_zone, M_WAITOK);
+ aiop->aiothread = td;
+ aiop->aiothreadflags |= AIOP_FREE;
+
+ s = splnet();
+
+ /*
+ * Place thread (lightweight process) onto the AIO free thread list.
+ */
+ if (TAILQ_EMPTY(&aio_freeproc))
+ wakeup(&aio_freeproc);
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+
+ splx(s);
+
+ /*
+ * Get rid of our current file descriptors. AIO daemons don't need any
+ * file descriptors, except as temporarily inherited from the client.
+ */
+ fdfree(td);
+ mycp->p_fd = NULL;
+
+ mtx_unlock(&Giant);
+ /* The daemon resides in its own pgrp. */
+ MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
+ M_WAITOK | M_ZERO);
+ MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
+ M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+ enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
+ sx_xunlock(&proctree_lock);
+ mtx_lock(&Giant);
+
+ /* Mark special process type. */
+ mycp->p_flag |= P_SYSTEM;
+
+ /*
+ * Wakeup parent process. (Parent sleeps to keep from blasting away
+ * and creating too many daemons.)
+ */
+ wakeup(mycp);
+
+ for (;;) {
+ /*
+ * curcp is the current daemon process context.
+ * userp is the current user process context.
+ */
+ curcp = mycp;
+
+ /*
+ * Take daemon off of free queue
+ */
+ if (aiop->aiothreadflags & AIOP_FREE) {
+ s = splnet();
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ splx(s);
+ }
+ aiop->aiothreadflags &= ~AIOP_SCHED;
+
+ /*
+ * Check for jobs.
+ */
+ while ((aiocbe = aio_selectjob(aiop)) != NULL) {
+ cb = &aiocbe->uaiocb;
+ userp = aiocbe->userproc;
+
+ aiocbe->jobstate = JOBST_JOBRUNNING;
+
+ /*
+ * Connect to process address space for user program.
+ */
+ if (userp != curcp) {
+ /*
+ * Save the current address space that we are
+ * connected to.
+ */
+ tmpvm = mycp->p_vmspace;
+
+ /*
+ * Point to the new user address space, and
+ * refer to it.
+ */
+ mycp->p_vmspace = userp->p_vmspace;
+ mycp->p_vmspace->vm_refcnt++;
+
+ /* Activate the new mapping. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+
+ /*
+ * If the old address space wasn't the daemon's
+ * own address space, then we need to remove the
+ * daemon's reference from the other process
+ * that it was acting on behalf of.
+ */
+ if (tmpvm != myvm) {
+ vmspace_free(tmpvm);
+ }
+ curcp = userp;
+ }
+
+ ki = userp->p_aioinfo;
+ lj = aiocbe->lio;
+
+ /* Account for currently active jobs. */
+ ki->kaio_active_count++;
+
+ /* Do the I/O function. */
+ aio_process(aiocbe);
+
+ /* Decrement the active job count. */
+ ki->kaio_active_count--;
+
+ /*
+ * Increment the completion count for wakeup/signal
+ * comparisons.
+ */
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ ki->kaio_queue_finished_count++;
+ if (lj)
+ lj->lioj_queue_finished_count++;
+ if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
+ & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(userp);
+ }
+
+ s = splbio();
+ if (lj && (lj->lioj_flags &
+ (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
+ if ((lj->lioj_queue_finished_count ==
+ lj->lioj_queue_count) &&
+ (lj->lioj_buffer_finished_count ==
+ lj->lioj_buffer_count)) {
+ PROC_LOCK(userp);
+ psignal(userp,
+ lj->lioj_signal.sigev_signo);
+ PROC_UNLOCK(userp);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+ splx(s);
+
+ aiocbe->jobstate = JOBST_JOBFINISHED;
+
+ /*
+ * If the I/O request should be automatically rundown,
+ * do the needed cleanup. Otherwise, place the queue
+ * entry for the just finished I/O request into the done
+ * queue for the associated client.
+ */
+ s = splnet();
+ if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
+ aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
+ uma_zfree(aiocb_zone, aiocbe);
+ } else {
+ TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
+ plist);
+ }
+ splx(s);
+ KNOTE(&aiocbe->klist, 0);
+
+ if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
+ wakeup(aiocbe);
+ aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
+ }
+
+ if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ PROC_LOCK(userp);
+ psignal(userp, cb->aio_sigevent.sigev_signo);
+ PROC_UNLOCK(userp);
+ }
+ }
+
+ /*
+ * Disconnect from user address space.
+ */
+ if (curcp != mycp) {
+ /* Get the user address space to disconnect from. */
+ tmpvm = mycp->p_vmspace;
+
+ /* Get original address space for daemon. */
+ mycp->p_vmspace = myvm;
+
+ /* Activate the daemon's address space. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+#ifdef DIAGNOSTIC
+ if (tmpvm == myvm) {
+ printf("AIOD: vmspace problem -- %d\n",
+ mycp->p_pid);
+ }
+#endif
+ /* Remove our vmspace reference. */
+ vmspace_free(tmpvm);
+
+ curcp = mycp;
+ }
+
+ /*
+ * If we are the first to be put onto the free queue, wakeup
+ * anyone waiting for a daemon.
+ */
+ s = splnet();
+ TAILQ_REMOVE(&aio_activeproc, aiop, list);
+ if (TAILQ_EMPTY(&aio_freeproc))
+ wakeup(&aio_freeproc);
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags |= AIOP_FREE;
+ splx(s);
+
+ /*
+ * If daemon is inactive for a long time, allow it to exit,
+ * thereby freeing resources.
+ */
+ if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
+ tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
+ s = splnet();
+ if (TAILQ_EMPTY(&aio_jobs)) {
+ if ((aiop->aiothreadflags & AIOP_FREE) &&
+ (num_aio_procs > target_aio_procs)) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ splx(s);
+ uma_zfree(aiop_zone, aiop);
+ num_aio_procs--;
+#ifdef DIAGNOSTIC
+ if (mycp->p_vmspace->vm_refcnt <= 1) {
+ printf("AIOD: bad vm refcnt for"
+ " exiting daemon: %d\n",
+ mycp->p_vmspace->vm_refcnt);
+ }
+#endif
+ kthread_exit(0);
+ }
+ }
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
+ * AIO daemon modifies its environment itself.
+ */
+static int
+aio_newproc(void)
+{
+ int error;
+ struct proc *p;
+
+ error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
+ num_aio_procs);
+ if (error)
+ return error;
+
+ /*
+ * Wait until the daemon has started, but continue on anyway so we
+ * can handle error conditions.
+ */
+ error = tsleep(p, PZERO, "aiosta", aiod_timeout);
+
+ num_aio_procs++;
+
+ return error;
+}
+
+/*
+ * Try the high-performance, low-overhead physio method for eligible
+ * VCHR devices. This method doesn't use an aio helper thread, and
+ * thus has very low overhead.
+ *
+ * Assumes that the caller, _aio_aqueue(), has incremented the file
+ * structure's reference count, preventing its deallocation for the
+ * duration of this call.
+ */
+static int
+aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
+{
+ int error;
+ struct aiocb *cb;
+ struct file *fp;
+ struct buf *bp;
+ struct vnode *vp;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+ int s;
+ int notify;
+
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ if (fp->f_type != DTYPE_VNODE)
+ return (-1);
+
+ vp = (struct vnode *)fp->f_data;
+
+ /*
+ * If it's not a disk, we don't want to return a positive error.
+ * That would keep the aio code from falling through to try the
+ * threaded path when talking to a regular file.
+ */
+ if (!vn_isdisk(vp, &error)) {
+ if (error == ENOTBLK)
+ return (-1);
+ else
+ return (error);
+ }
+
+ if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
+ return (-1);
+
+ if (cb->aio_nbytes >
+ MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+ return (-1);
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+ return (-1);
+
+ ki->kaio_buffer_count++;
+
+ lj = aiocbe->lio;
+ if (lj)
+ lj->lioj_buffer_count++;
+
+ /* Create and build a buffer header for a transfer. */
+ bp = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(bp);
+
+ /*
+ * Get a copy of the kva from the physical buffer.
+ */
+ bp->b_caller1 = p;
+ bp->b_dev = vp->v_rdev;
+ error = bp->b_error = 0;
+
+ bp->b_bcount = cb->aio_nbytes;
+ bp->b_bufsize = cb->aio_nbytes;
+ bp->b_flags = B_PHYS;
+ bp->b_iodone = aio_physwakeup;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->b_blkno = btodb(cb->aio_offset);
+
+ if (cb->aio_lio_opcode == LIO_WRITE) {
+ bp->b_iocmd = BIO_WRITE;
+ if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ } else {
+ bp->b_iocmd = BIO_READ;
+ if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ }
+
+ /* Bring buffer into kernel space. */
+ vmapbuf(bp);
+
+ s = splbio();
+ aiocbe->bp = bp;
+ bp->b_spc = (void *)aiocbe;
+ TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBQBUF;
+ cb->_aiocb_private.status = cb->aio_nbytes;
+ num_buf_aio++;
+ bp->b_error = 0;
+
+ splx(s);
+
+ /* Perform transfer. */
+ DEV_STRATEGY(bp, 0);
+
+ notify = 0;
+ s = splbio();
+
+ /*
+ * If we had an error invoking the request, or an error in processing
+ * the request before we have returned, we process it as an error in
+ * transfer. Note that such an I/O error is not indicated immediately,
+ * but is returned using the aio_error mechanism. In this case,
+ * aio_suspend will return immediately.
+ */
+ if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
+ struct aiocb *job = aiocbe->uuaiocb;
+
+ aiocbe->uaiocb._aiocb_private.status = 0;
+ suword(&job->_aiocb_private.status, 0);
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ suword(&job->_aiocb_private.error, bp->b_error);
+
+ ki->kaio_buffer_finished_count++;
+
+ if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
+ aiocbe->jobstate = JOBST_JOBBFINISHED;
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
+ notify = 1;
+ }
+ }
+ splx(s);
+ if (notify)
+ KNOTE(&aiocbe->klist, 0);
+ return 0;
+
+doerror:
+ ki->kaio_buffer_count--;
+ if (lj)
+ lj->lioj_buffer_count--;
+ aiocbe->bp = NULL;
+ relpbuf(bp, NULL);
+ return error;
+}
+
+/*
+ * This waits/tests physio completion.
+ */
+static int
+aio_fphysio(struct aiocblist *iocb)
+{
+ int s;
+ struct buf *bp;
+ int error;
+
+ bp = iocb->bp;
+
+ s = splbio();
+ while ((bp->b_flags & B_DONE) == 0) {
+ if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
+ if ((bp->b_flags & B_DONE) == 0) {
+ splx(s);
+ return EINPROGRESS;
+ } else
+ break;
+ }
+ }
+ splx(s);
+
+ /* Release mapping into kernel space. */
+ vunmapbuf(bp);
+ iocb->bp = 0;
+
+ error = 0;
+
+ /* Check for an error. */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+
+ relpbuf(bp, NULL);
+ return (error);
+}
+
+/*
+ * Wake up aio requests that may be serviceable now.
+ */
+static void
+aio_swake_cb(struct socket *so, struct sockbuf *sb)
+{
+ struct aiocblist *cb, *cbn;
+ struct proc *p;
+ struct kaioinfo *ki = NULL;
+ int opcode, wakecount = 0;
+ struct aiothreadlist *aiop;
+
+ if (sb == &so->so_snd) {
+ opcode = LIO_WRITE;
+ so->so_snd.sb_flags &= ~SB_AIO;
+ } else {
+ opcode = LIO_READ;
+ so->so_rcv.sb_flags &= ~SB_AIO;
+ }
+
+ for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
+ cbn = TAILQ_NEXT(cb, list);
+ if (opcode == cb->uaiocb.aio_lio_opcode) {
+ p = cb->userproc;
+ ki = p->p_aioinfo;
+ TAILQ_REMOVE(&so->so_aiojobq, cb, list);
+ TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
+ TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
+ wakecount++;
+ if (cb->jobstate != JOBST_JOBQGLOBAL)
+ panic("invalid queue value");
+ }
+ }
+
+ while (wakecount--) {
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ }
+ }
+}
+
+/*
+ * Queue a new AIO request. This code chooses between the threaded and the
+ * direct physio (VCHR) techniques.
+ */
+static int
+_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp;
+ struct file *fp;
+ unsigned int fd;
+ struct socket *so;
+ int s;
+ int error;
+ int opcode;
+ struct aiocblist *aiocbe;
+ struct aiothreadlist *aiop;
+ struct kaioinfo *ki;
+ struct kevent kev;
+ struct kqueue *kq;
+ struct file *kq_fp;
+
+ aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
+ aiocbe->inputcharge = 0;
+ aiocbe->outputcharge = 0;
+ callout_handle_init(&aiocbe->timeouthandle);
+ SLIST_INIT(&aiocbe->klist);
+
+ suword(&job->_aiocb_private.status, -1);
+ suword(&job->_aiocb_private.error, 0);
+ suword(&job->_aiocb_private.kernelinfo, -1);
+
+ error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
+ if (error) {
+ suword(&job->_aiocb_private.error, error);
+ uma_zfree(aiocb_zone, aiocbe);
+ return error;
+ }
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
+ !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return EINVAL;
+ }
+
+ /* Save userspace address of the job info. */
+ aiocbe->uuaiocb = job;
+
+ /* Get the opcode. */
+ if (type != LIO_NOP)
+ aiocbe->uaiocb.aio_lio_opcode = type;
+ opcode = aiocbe->uaiocb.aio_lio_opcode;
+
+ /* Get the fd info for process. */
+ fdp = p->p_fd;
+
+ /*
+ * Range check file descriptor.
+ */
+ fd = aiocbe->uaiocb.aio_fildes;
+ if (fd >= fdp->fd_nfiles) {
+ uma_zfree(aiocb_zone, aiocbe);
+ if (type == 0)
+ suword(&job->_aiocb_private.error, EBADF);
+ return EBADF;
+ }
+
+ fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
+ if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
+ 0))) {
+ uma_zfree(aiocb_zone, aiocbe);
+ if (type == 0)
+ suword(&job->_aiocb_private.error, EBADF);
+ return EBADF;
+ }
+ fhold(fp);
+
+ if (aiocbe->uaiocb.aio_offset == -1LL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
+ if (error) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
+ if (jobrefid == LONG_MAX)
+ jobrefid = 1;
+ else
+ jobrefid++;
+
+ if (opcode == LIO_NOP) {
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, 0);
+ suword(&job->_aiocb_private.status, 0);
+ suword(&job->_aiocb_private.kernelinfo, 0);
+ }
+ return 0;
+ }
+ if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
+ if (type == 0)
+ suword(&job->_aiocb_private.status, 0);
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
+ kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
+ kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
+ }
+ else {
+ /*
+ * This method for requesting kevent-based notification won't
+ * work on the alpha, since we're passing in a pointer
+ * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
+ * based method instead.
+ */
+ struct kevent *kevp;
+
+ kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
+ if (kevp == NULL)
+ goto no_kqueue;
+
+ error = copyin(kevp, &kev, sizeof(kev));
+ if (error)
+ goto aqueue_fail;
+ }
+ if ((u_int)kev.ident >= fdp->fd_nfiles ||
+ (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
+ (kq_fp->f_type != DTYPE_KQUEUE)) {
+ error = EBADF;
+ goto aqueue_fail;
+ }
+ kq = (struct kqueue *)kq_fp->f_data;
+ kev.ident = (uintptr_t)aiocbe;
+ kev.filter = EVFILT_AIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+ error = kqueue_register(kq, &kev, td);
+aqueue_fail:
+ if (error) {
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ if (type == 0)
+ suword(&job->_aiocb_private.error, error);
+ goto done;
+ }
+no_kqueue:
+
+ suword(&job->_aiocb_private.error, EINPROGRESS);
+ aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
+ aiocbe->userproc = p;
+ aiocbe->jobflags = 0;
+ aiocbe->lio = lj;
+ ki = p->p_aioinfo;
+
+ if (fp->f_type == DTYPE_SOCKET) {
+ /*
+ * Alternate queueing for socket ops: Reach down into the
+ * descriptor to get the socket data. Then check to see if the
+ * socket is ready to be read or written (based on the requested
+ * operation).
+ *
+ * If it is not ready for I/O, then queue the aiocbe on the
+ * socket, and set the flags so we get a call when sbnotify()
+ * happens.
+ */
+ so = (struct socket *)fp->f_data;
+ s = splnet();
+ if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
+ LIO_WRITE) && (!sowriteable(so)))) {
+ TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
+ TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
+ if (opcode == LIO_READ)
+ so->so_rcv.sb_flags |= SB_AIO;
+ else
+ so->so_snd.sb_flags |= SB_AIO;
+ aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
+ ki->kaio_queue_count++;
+ num_queue_count++;
+ splx(s);
+ error = 0;
+ goto done;
+ }
+ splx(s);
+ }
+
+ if ((error = aio_qphysio(p, aiocbe)) == 0)
+ goto done;
+ if (error > 0) {
+ suword(&job->_aiocb_private.status, 0);
+ aiocbe->uaiocb._aiocb_private.error = error;
+ suword(&job->_aiocb_private.error, error);
+ goto done;
+ }
+
+ /* No buffer for daemon I/O. */
+ aiocbe->bp = NULL;
+
+ ki->kaio_queue_count++;
+ if (lj)
+ lj->lioj_queue_count++;
+ s = splnet();
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
+ splx(s);
+ aiocbe->jobstate = JOBST_JOBQGLOBAL;
+
+ num_queue_count++;
+ error = 0;
+
+ /*
+ * If we don't have a free AIO process, and we are below our quota, then
+ * start one. Otherwise, depend on the subsequent I/O completions to
+ * pick up this job. If we don't successfully create the new process
+ * (thread) due to resource issues, we return an error for now (EAGAIN),
+ * which is likely not the correct thing to do.
+ */
+ s = splnet();
+retryproc:
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ num_aio_resv_start++;
+ if ((error = aio_newproc()) == 0) {
+ num_aio_resv_start--;
+ goto retryproc;
+ }
+ num_aio_resv_start--;
+ }
+ splx(s);
+done:
+ return error;
+}
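+
+/*
+ * Illustrative userland sketch (not normative) of the SIGEV_KEVENT
+ * notification path handled in _aio_aqueue() above.  The field names mirror
+ * the ones consulted by the kernel; fd, kq, buf and bufsize are assumed to
+ * have been set up by the caller:
+ *
+ *	struct aiocb cb;
+ *	struct kevent ev;
+ *
+ *	memset(&cb, 0, sizeof(cb));
+ *	cb.aio_fildes = fd;
+ *	cb.aio_buf = buf;
+ *	cb.aio_nbytes = bufsize;
+ *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+ *	cb.aio_sigevent.sigev_notify_kqueue = kq;
+ *	cb.aio_sigevent.sigev_value.sigval_ptr = &cb;
+ *	aio_read(&cb);
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);
+ *
+ * On completion the EVFILT_AIO event's udata carries the sigval_ptr
+ * registered above, so ev.udata == &cb.
+ */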
+
+/*
+ * This routine queues an AIO request, checking for quotas.
+ */
+static int
+aio_aqueue(struct thread *td, struct aiocb *job, int type)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ if (num_queue_count >= max_queue_count)
+ return EAGAIN;
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
+ return EAGAIN;
+
+ return _aio_aqueue(td, job, NULL, type);
+}
+
+/*
+ * Support the aio_return system call; as a side effect, kernel resources
+ * are released.
+ */
+int
+aio_return(struct thread *td, struct aio_return_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int s;
+ long jobref;
+ struct aiocblist *cb, *ncb;
+ struct aiocb *ujob;
+ struct kaioinfo *ki;
+
+ ujob = uap->aiocbp;
+ jobref = fuword(&ujob->_aiocb_private.kernelinfo);
+ if (jobref == -1 || jobref == 0)
+ return EINVAL;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EINVAL;
+ TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ p->p_stats->p_ru.ru_oublock +=
+ cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ p->p_stats->p_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ goto done;
+ }
+ }
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
+ ncb = TAILQ_NEXT(cb, plist);
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
+ == jobref) {
+ break;
+ }
+ }
+ splx(s);
+ done:
+ if (cb != NULL) {
+ if (ujob == cb->uuaiocb) {
+ td->td_retval[0] =
+ cb->uaiocb._aiocb_private.status;
+ } else
+ td->td_retval[0] = EFAULT;
+ aio_free_entry(cb);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+/*
+ * Allow a process to wake up when any of the I/O requests are completed.
+ */
+int
+aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct timespec ts;
+ struct aiocb *const *cbptr, *cbp;
+ struct kaioinfo *ki;
+ struct aiocblist *cb;
+ int i;
+ int njoblist;
+ int error, s, timo;
+ long *ijoblist;
+ struct aiocb **ujoblist;
+
+ if (uap->nent > AIO_LISTIO_MAX)
+ return EINVAL;
+
+ timo = 0;
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
+ return error;
+
+ if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, &ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EAGAIN;
+
+ njoblist = 0;
+ ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ cbptr = uap->aiocbp;
+
+ for (i = 0; i < uap->nent; i++) {
+ cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
+ if (cbp == 0)
+ continue;
+ ujoblist[njoblist] = cbp;
+ ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
+ njoblist++;
+ }
+
+ if (njoblist == 0) {
+ uma_zfree(aiol_zone, ijoblist);
+ uma_zfree(aiol_zone, ujoblist);
+ return 0;
+ }
+
+ error = 0;
+ for (;;) {
+ TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+ for (i = 0; i < njoblist; i++) {
+ if (((intptr_t)
+ cb->uaiocb._aiocb_private.kernelinfo) ==
+ ijoblist[i]) {
+ if (ujoblist[i] != cb->uuaiocb)
+ error = EINVAL;
+ uma_zfree(aiol_zone, ijoblist);
+ uma_zfree(aiol_zone, ujoblist);
+ return error;
+ }
+ }
+ }
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
+ TAILQ_NEXT(cb, plist)) {
+ for (i = 0; i < njoblist; i++) {
+ if (((intptr_t)
+ cb->uaiocb._aiocb_private.kernelinfo) ==
+ ijoblist[i]) {
+ splx(s);
+ if (ujoblist[i] != cb->uuaiocb)
+ error = EINVAL;
+ uma_zfree(aiol_zone, ijoblist);
+ uma_zfree(aiol_zone, ujoblist);
+ return error;
+ }
+ }
+ }
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
+ splx(s);
+
+ if (error == ERESTART || error == EINTR) {
+ uma_zfree(aiol_zone, ijoblist);
+ uma_zfree(aiol_zone, ujoblist);
+ return EINTR;
+ } else if (error == EWOULDBLOCK) {
+ uma_zfree(aiol_zone, ijoblist);
+ uma_zfree(aiol_zone, ujoblist);
+ return EAGAIN;
+ }
+ }
+
+/* NOTREACHED */
+ return EINVAL;
+}
+
+/*
+ * aio_cancel cancels any non-physio aio operations not currently in
+ * progress.
+ */
+int
+aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+ struct aiocblist *cbe, *cbn;
+ struct file *fp;
+ struct filedesc *fdp;
+ struct socket *so;
+ struct proc *po;
+ int s, error;
+ int cancelled = 0;
+ int notcancelled = 0;
+ struct vnode *vp;
+
+ fdp = p->p_fd;
+ if ((u_int)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = (struct vnode *)fp->f_data;
+
+ if (vn_isdisk(vp,&error)) {
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return 0;
+ }
+ } else if (fp->f_type == DTYPE_SOCKET) {
+ so = (struct socket *)fp->f_data;
+
+ s = splnet();
+
+ for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
+ cbn = TAILQ_NEXT(cbe, list);
+ if ((uap->aiocbp == NULL) ||
+ (uap->aiocbp == cbe->uuaiocb) ) {
+ po = cbe->userproc;
+ ki = po->p_aioinfo;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
+ if (ki->kaio_flags & KAIO_WAKEUP) {
+ wakeup(po);
+ }
+ cbe->jobstate = JOBST_JOBFINISHED;
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ cancelled++;
+/* XXX cancelled, knote? */
+ if (cbe->uaiocb.aio_sigevent.sigev_notify ==
+ SIGEV_SIGNAL) {
+ PROC_LOCK(cbe->userproc);
+ psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
+ PROC_UNLOCK(cbe->userproc);
+ }
+ if (uap->aiocbp)
+ break;
+ }
+ }
+ splx(s);
+
+ if ((cancelled) && (uap->aiocbp)) {
+ td->td_retval[0] = AIO_CANCELED;
+ return 0;
+ }
+ }
+ ki = p->p_aioinfo;
+ s = splnet();
+
+ for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
+ cbn = TAILQ_NEXT(cbe, plist);
+
+ if ((uap->fd == cbe->uaiocb.aio_fildes) &&
+ ((uap->aiocbp == NULL ) ||
+ (uap->aiocbp == cbe->uuaiocb))) {
+
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
+ plist);
+ cancelled++;
+ ki->kaio_queue_finished_count++;
+ cbe->jobstate = JOBST_JOBFINISHED;
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+/* XXX cancelled, knote? */
+ if (cbe->uaiocb.aio_sigevent.sigev_notify ==
+ SIGEV_SIGNAL) {
+ PROC_LOCK(cbe->userproc);
+ psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
+ PROC_UNLOCK(cbe->userproc);
+ }
+ } else {
+ notcancelled++;
+ }
+ }
+ }
+ splx(s);
+
+ if (notcancelled) {
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return 0;
+ }
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return 0;
+ }
+ td->td_retval[0] = AIO_ALLDONE;
+
+ return 0;
+}
+
+/*
+ * aio_error is implemented at the kernel level for compatibility purposes
+ * only. For a user-mode async implementation, it would be best to do it in
+ * a userland subroutine.
+ */
+int
+aio_error(struct thread *td, struct aio_error_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int s;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ long jobref;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EINVAL;
+
+ jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
+ if ((jobref == -1) || (jobref == 0))
+ return EINVAL;
+
+ TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ td->td_retval[0] = cb->uaiocb._aiocb_private.error;
+ return 0;
+ }
+ }
+
+ s = splnet();
+
+ for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
+ plist)) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ td->td_retval[0] = EINPROGRESS;
+ splx(s);
+ return 0;
+ }
+ }
+
+ for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
+ plist)) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ td->td_retval[0] = EINPROGRESS;
+ splx(s);
+ return 0;
+ }
+ }
+ splx(s);
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
+ plist)) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ td->td_retval[0] = cb->uaiocb._aiocb_private.error;
+ splx(s);
+ return 0;
+ }
+ }
+
+ for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
+ plist)) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ td->td_retval[0] = EINPROGRESS;
+ splx(s);
+ return 0;
+ }
+ }
+ splx(s);
+
+#if (0)
+ /*
+ * Hack for lio.
+ */
+ status = fuword(&uap->aiocbp->_aiocb_private.status);
+ if (status == -1)
+ return fuword(&uap->aiocbp->_aiocb_private.error);
+#endif
+ return EINVAL;
+}
+
+/* syscall - asynchronous read from a file (REALTIME) */
+int
+aio_read(struct thread *td, struct aio_read_args *uap)
+{
+
+ return aio_aqueue(td, uap->aiocbp, LIO_READ);
+}
+
+/* syscall - asynchronous write to a file (REALTIME) */
+int
+aio_write(struct thread *td, struct aio_write_args *uap)
+{
+
+ return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
+}
+
+/* syscall - XXX undocumented */
+int
+lio_listio(struct thread *td, struct lio_listio_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int nent, nentqueued;
+ struct aiocb *iocb, * const *cbptr;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+ int error, runningcode;
+ int nerror;
+ int i;
+ int s;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return EINVAL;
+
+ nent = uap->nent;
+ if (nent > AIO_LISTIO_MAX)
+ return EINVAL;
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ if ((nent + num_queue_count) > max_queue_count)
+ return EAGAIN;
+
+ ki = p->p_aioinfo;
+ if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
+ return EAGAIN;
+
+ lj = uma_zalloc(aiolio_zone, M_WAITOK);
+ if (!lj)
+ return EAGAIN;
+
+ lj->lioj_flags = 0;
+ lj->lioj_buffer_count = 0;
+ lj->lioj_buffer_finished_count = 0;
+ lj->lioj_queue_count = 0;
+ lj->lioj_queue_finished_count = 0;
+ lj->lioj_ki = ki;
+
+ /*
+ * Setup signal.
+ */
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &lj->lioj_signal,
+ sizeof(lj->lioj_signal));
+ if (error) {
+ uma_zfree(aiolio_zone, lj);
+ return error;
+ }
+ if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
+ } else
+ lj->lioj_flags &= ~LIOJ_SIGNAL;
+
+ TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
+ /*
+ * Get pointers to the list of I/O requests.
+ */
+ nerror = 0;
+ nentqueued = 0;
+ cbptr = uap->acb_list;
+ for (i = 0; i < uap->nent; i++) {
+ iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
+ if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
+ error = _aio_aqueue(td, iocb, lj, 0);
+ if (error == 0)
+ nentqueued++;
+ else
+ nerror++;
+ }
+ }
+
+ /*
+ * If we haven't queued any, then just return.
+ */
+ if (nentqueued == 0)
+ return 0;
+
+ /*
+ * Calculate the appropriate error return.
+ */
+ runningcode = 0;
+ if (nerror)
+ runningcode = EIO;
+
+ if (uap->mode == LIO_WAIT) {
+ int command, found, jobref;
+
+ for (;;) {
+ found = 0;
+ for (i = 0; i < uap->nent; i++) {
+ /*
+ * Fetch address of the control buf pointer in
+ * user space.
+ */
+ iocb = (struct aiocb *)
+ (intptr_t)fuword(&cbptr[i]);
+ if (((intptr_t)iocb == -1) || ((intptr_t)iocb
+ == 0))
+ continue;
+
+ /*
+ * Fetch the associated command from user space.
+ */
+ command = fuword(&iocb->aio_lio_opcode);
+ if (command == LIO_NOP) {
+ found++;
+ continue;
+ }
+
+ jobref = fuword(&iocb->_aiocb_private.kernelinfo);
+
+ TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
+ == jobref) {
+ if (cb->uaiocb.aio_lio_opcode
+ == LIO_WRITE) {
+ p->p_stats->p_ru.ru_oublock
+ +=
+ cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode
+ == LIO_READ) {
+ p->p_stats->p_ru.ru_inblock
+ += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ found++;
+ break;
+ }
+ }
+
+ s = splbio();
+ TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
+ if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
+ == jobref) {
+ found++;
+ break;
+ }
+ }
+ splx(s);
+ }
+
+ /*
+ * If all I/Os have been disposed of, then we can
+ * return.
+ */
+ if (found == nentqueued)
+ return runningcode;
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
+
+ if (error == EINTR)
+ return EINTR;
+ else if (error == EWOULDBLOCK)
+ return EAGAIN;
+ }
+ }
+
+ return runningcode;
+}
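+
+/*
+ * Illustrative userland sketch (not normative): submit a small batch and
+ * wait for all of it to complete via the LIO_WAIT path above.  rd and wr
+ * are assumed to be fully initialized aiocbs:
+ *
+ *	struct aiocb *list[2] = { &rd, &wr };
+ *
+ *	if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
+ *		warn("lio_listio");
+ */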
+
+/*
+ * This is a weird hack so that we can post a signal. It is safe to do so from
+ * a timeout routine, but *not* from an interrupt routine.
+ */
+static void
+process_signal(void *aioj)
+{
+ struct aiocblist *aiocbe = aioj;
+ struct aio_liojob *lj = aiocbe->lio;
+ struct aiocb *cb = &aiocbe->uaiocb;
+
+ if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
+ (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
+ PROC_LOCK(lj->lioj_ki->kaio_p);
+ psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
+ PROC_UNLOCK(lj->lioj_ki->kaio_p);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+
+ if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ PROC_LOCK(aiocbe->userproc);
+ psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
+ PROC_UNLOCK(aiocbe->userproc);
+ }
+}
+
+/*
+ * Interrupt handler for physio, performs the necessary process wakeups, and
+ * signals.
+ */
+static void
+aio_physwakeup(struct buf *bp)
+{
+ struct aiocblist *aiocbe;
+ struct proc *p;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+
+ wakeup(bp);
+
+ aiocbe = (struct aiocblist *)bp->b_spc;
+ if (aiocbe) {
+ p = bp->b_caller1;
+
+ aiocbe->jobstate = JOBST_JOBBFINISHED;
+ aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.error = 0;
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+
+ if (bp->b_ioflags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_buffer_finished_count++;
+
+ /*
+ * wakeup/signal if all of the interrupt jobs are done.
+ */
+ if (lj->lioj_buffer_finished_count ==
+ lj->lioj_buffer_count) {
+ /*
+ * Post a signal if it is called for.
+ */
+ if ((lj->lioj_flags &
+ (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
+ LIOJ_SIGNAL) {
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ aiocbe->timeouthandle =
+ timeout(process_signal,
+ aiocbe, 0);
+ }
+ }
+ }
+
+ ki = p->p_aioinfo;
+ if (ki) {
+ ki->kaio_buffer_finished_count++;
+ TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
+
+ KNOTE(&aiocbe->klist, 0);
+ /* Do the wakeup. */
+ if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(p);
+ }
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
+ aiocbe->timeouthandle =
+ timeout(process_signal, aiocbe, 0);
+ }
+}
+
+/* syscall - wait for the next completion of an aio request */
+int
+aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct timespec ts;
+ struct kaioinfo *ki;
+ struct aiocblist *cb = NULL;
+ int error, s, timo;
+
+ suword(uap->aiocbp, (int)NULL);
+
+ timo = 0;
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return error;
+
+ if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, &ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EAGAIN;
+
+ for (;;) {
+ if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
+ suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
+ td->td_retval[0] = cb->uaiocb._aiocb_private.status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ p->p_stats->p_ru.ru_oublock +=
+ cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ p->p_stats->p_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ return cb->uaiocb._aiocb_private.error;
+ }
+
+ s = splbio();
+ if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
+ splx(s);
+ suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
+ td->td_retval[0] = cb->uaiocb._aiocb_private.status;
+ aio_free_entry(cb);
+ return cb->uaiocb._aiocb_private.error;
+ }
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
+ splx(s);
+
+ if (error == ERESTART)
+ return EINTR;
+ else if (error < 0)
+ return error;
+ else if (error == EINTR)
+ return EINTR;
+ else if (error == EWOULDBLOCK)
+ return EAGAIN;
+ }
+}
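+
+/*
+ * Illustrative userland sketch (not normative): block until any outstanding
+ * request for the process completes and recover its aiocb; a NULL timeout
+ * waits indefinitely, matching the timo = 0 case above:
+ *
+ *	struct aiocb *done;
+ *
+ *	if (aio_waitcomplete(&done, NULL) >= 0) {
+ *		... done now points at the completed request's aiocb,
+ *		... and the return value is its completion status
+ *	}
+ */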
+
+/* kqueue attach function */
+static int
+filt_aioattach(struct knote *kn)
+{
+ struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
+
+ /*
+ * The aiocbe pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_flags &= ~EV_FLAG1;
+
+ SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_aiodetach(struct knote *kn)
+{
+ struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
+
+ SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_aio(struct knote *kn, long hint)
+{
+ struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
+
+ kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
+ if (aiocbe->jobstate != JOBST_JOBFINISHED &&
+ aiocbe->jobstate != JOBST_JOBBFINISHED)
+ return (0);
+ kn->kn_flags |= EV_EOF;
+ return (1);
+}
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
new file mode 100644
index 0000000..30dc753
--- /dev/null
+++ b/sys/kern/vfs_bio.c
@@ -0,0 +1,3395 @@
+/*
+ * Copyright (c) 1994,1997 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * this file contains a new buffer I/O scheme implementing a coherent
+ * VM object and buffer cache scheme. Pains have been taken to make
+ * sure that the performance degradation associated with schemes such
+ * as this is not realized.
+ *
+ * Author: John S. Dyson
+ * Significant help during the development and debugging phases
+ * had been provided by David Greenman, also of the FreeBSD core team.
+ *
+ * see man buf(9) for more info.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/stdint.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
+static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
+
+struct bio_ops bioops; /* I/O operation notification */
+
+struct buf_ops buf_ops_bio = {
+ "buf_ops_bio",
+ bwrite
+};
+
+/*
+ * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
+ * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
+ */
+struct buf *buf; /* buffer header pool */
+struct mtx buftimelock; /* Interlock on setting prio and timo */
+
+static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
+ int pageno, vm_page_t m);
+static void vfs_clean_pages(struct buf * bp);
+static void vfs_setdirty(struct buf *bp);
+static void vfs_vmio_release(struct buf *bp);
+static void vfs_backgroundwritedone(struct buf *bp);
+static int flushbufqueues(void);
+static void buf_daemon(void);
+
+int vmiodirenable = TRUE;
+SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
+ "Use the VM system for directory writes");
+int runningbufspace;
+SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
+ "Amount of presently outstanding async buffer io");
+static int bufspace;
+SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
+ "KVA memory used for bufs");
+static int maxbufspace;
+SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
+ "Maximum allowed value of bufspace (including buf_daemon)");
+static int bufmallocspace;
+SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
+ "Amount of malloced memory for buffers");
+static int maxbufmallocspace;
+SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
+ "Maximum amount of malloced memory for buffers");
+static int lobufspace;
+SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+ "Minimum amount of buffers we want to have");
+static int hibufspace;
+SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
+ "Maximum allowed value of bufspace (excluding buf_daemon)");
+static int bufreusecnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
+ "Number of times we have reused a buffer");
+static int buffreekvacnt;
+SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+ "Number of times we have freed the KVA space from some buffer");
+static int bufdefragcnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+ "Number of times we have had to repeat buffer allocation to defragment");
+static int lorunningspace;
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
+ "Minimum preferred space used for in-progress I/O");
+static int hirunningspace;
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
+ "Maximum amount of space to use for in-progress I/O");
+static int numdirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+    "Number of buffers that are dirty (have unwritten changes) at the moment");
+static int lodirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+ "How many buffers we want to have free before bufdaemon can sleep");
+static int hidirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+ "When the number of dirty buffers is considered severe");
+static int numfreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
+ "Number of free buffers");
+static int lofreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+ "XXX Unused");
+static int hifreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+ "XXX Complicatedly unused");
+static int getnewbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
+ "Number of calls to getnewbuf");
+static int getnewbufrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+    "Number of times getnewbuf has had to restart a buffer acquisition");
+static int dobkgrdwrite = 1;
+SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
+ "Do background writes (honoring the BX_BKGRDWRITE flag)?");
+
+/*
+ * Wakeup point for bufdaemon, as well as indicator of whether it is already
+ * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
+ * is idling.
+ */
+static int bd_request;
+
+/*
+ * bogus page -- for I/O to/from partially complete buffers
+ * this is a temporary solution to the problem, but it is not
+ * really that bad. it would be better to split the buffer
+ * for input in the case of buffers partially already in memory,
+ * but the code is intricate enough already.
+ */
+vm_page_t bogus_page;
+
+/*
+ * Offset for bogus_page.
+ * XXX bogus_offset should be local to bufinit
+ */
+static vm_offset_t bogus_offset;
+
+/*
+ * Synchronization (sleep/wakeup) variable for active buffer space requests.
+ * Set when wait starts, cleared prior to wakeup().
+ * Used in runningbufwakeup() and waitrunningbufspace().
+ */
+static int runningbufreq;
+
+/*
+ * Synchronization (sleep/wakeup) variable for buffer requests.
+ * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
+ * by and/or.
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * getnewbuf(), and getblk().
+ */
+static int needsbuffer;
+
+/*
+ * Mask for index into the buffer hash table, which needs to be power of 2 in
+ * size. Set in kern_vfs_bio_buffer_alloc.
+ */
+static int bufhashmask;
+
+/*
+ * Hash table for all buffers, with a linked list hanging from each table
+ * entry. Set in kern_vfs_bio_buffer_alloc, initialized in buf_init.
+ */
+static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;
+
+/*
+ * Somewhere to store buffers when they are not in another list, to always
+ * have them in a list (and thus being able to use the same set of operations
+ * on them.)
+ */
+static struct bufhashhdr invalhash;
+
+/*
+ * Definitions for the buffer free lists.
+ */
+#define BUFFER_QUEUES 6 /* number of free buffer queues */
+
+#define QUEUE_NONE 0 /* on no queue */
+#define QUEUE_LOCKED 1 /* locked buffers */
+#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */
+#define QUEUE_DIRTY 3 /* B_DELWRI buffers */
+#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY 5 /* empty buffer headers */
+
+/* Queues for free buffers with various properties */
+static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+/*
+ * Single global constant for BUF_WMESG, to avoid getting multiple references.
+ * buf_wmesg is referred from macros.
+ */
+const char *buf_wmesg = BUF_WMESG;
+
+#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
+#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
+#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
+#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
+
+/*
+ * Buffer hash table code. Note that the logical block scans linearly, which
+ * gives us some L1 cache locality.
+ */
+
+static __inline
+struct bufhashhdr *
+bufhash(struct vnode *vnp, daddr_t bn)
+{
+ return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
+}
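+
+/*
+ * Illustrative example (editor's note, not part of the original code): if
+ * ((uintptr_t)vnp >> 7) happened to be 1000, bn were 24 and bufhashmask
+ * were 255, the chain index would be (1000 + 24) & 255 = 0, i.e. the
+ * buffer would hang off bufhashtbl[0].
+ */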
+
+/*
+ * numdirtywakeup:
+ *
+ * If someone is blocked due to there being too many dirty buffers,
+ * and numdirtybuffers is now reasonable, wake them up.
+ */
+
+static __inline void
+numdirtywakeup(int level)
+{
+ if (numdirtybuffers <= level) {
+ if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
+ needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
+ wakeup(&needsbuffer);
+ }
+ }
+}
+
+/*
+ * bufspacewakeup:
+ *
+ * Called when buffer space is potentially available for recovery.
+ * getnewbuf() will block on this flag when it is unable to free
+ * sufficient buffer space. Buffer space becomes recoverable when
+ * bp's get placed back in the queues.
+ */
+
+static __inline void
+bufspacewakeup(void)
+{
+ /*
+ * If someone is waiting for BUF space, wake them up. Even
+ * though we haven't freed the kva space yet, the waiting
+ * process will be able to now.
+ */
+ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
+ needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
+ wakeup(&needsbuffer);
+ }
+}
+
+/*
+ * runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+ if (bp->b_runningbufspace) {
+ runningbufspace -= bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ if (runningbufreq && runningbufspace <= lorunningspace) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ }
+}
+
+/*
+ * bufcountwakeup:
+ *
+ * Called when a buffer has been added to one of the free queues to
+ * account for the buffer and to wakeup anyone waiting for free buffers.
+ * This typically occurs when large amounts of metadata are being handled
+ * by the buffer cache ( else buffer space runs out first, usually ).
+ */
+
+static __inline void
+bufcountwakeup(void)
+{
+ ++numfreebuffers;
+ if (needsbuffer) {
+ needsbuffer &= ~VFS_BIO_NEED_ANY;
+ if (numfreebuffers >= hifreebuffers)
+ needsbuffer &= ~VFS_BIO_NEED_FREE;
+ wakeup(&needsbuffer);
+ }
+}
+
+/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * Reads will adjust runningbufspace, but will not block based on it.
+ * The read load has a side effect of reducing the allowed write load.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+ /*
+ * XXX race against wakeup interrupt, currently
+ * protected by Giant. FIXME!
+ */
+ while (runningbufspace > hirunningspace) {
+ ++runningbufreq;
+ tsleep(&runningbufreq, PVM, "wdrain", 0);
+ }
+}
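+
+/*
+ * Usage note (editor's illustration): with the defaults set in bufinit()
+ * (lorunningspace = 512K, hirunningspace = 1MB), an async writer that
+ * pushes runningbufspace past 1MB sleeps in "wdrain" until completions
+ * via runningbufwakeup() bring it back down to 512K or below.
+ */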
+
+
+/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline__
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+ GIANT_REQUIRED;
+
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+/* Wake up the buffer daemon if necessary */
+static __inline__
+void
+bd_wakeup(int dirtybuflevel)
+{
+ if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+ bd_request = 1;
+ wakeup(&bd_request);
+ }
+}
+
+/*
+ * bd_speedup - speedup the buffer cache flushing code
+ */
+
+static __inline__
+void
+bd_speedup(void)
+{
+ bd_wakeup(1);
+}
+
+/*
+ * Calculate buffer cache scaling values and reserve space for buffer
+ * headers. This is called during low-level kernel initialization and
+ * may be called more than once. We CANNOT write to the memory area
+ * being reserved at this time.
+ */
+caddr_t
+kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
+{
+ /*
+ * physmem_est is in pages. Convert it to kilobytes (assumes
+ * PAGE_SIZE is >= 1K)
+ */
+ physmem_est = physmem_est * (PAGE_SIZE / 1024);
+
+ /*
+ * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
+ * For the first 64MB of ram nominally allocate sufficient buffers to
+ * cover 1/4 of our ram. Beyond the first 64MB allocate additional
+ * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
+ * the buffer cache we limit the eventual kva reservation to
+ * maxbcache bytes.
+ *
+ * factor represents the 1/4 x ram conversion.
+ */
+ if (nbuf == 0) {
+ int factor = 4 * BKVASIZE / 1024;
+
+ nbuf = 50;
+ if (physmem_est > 4096)
+ nbuf += min((physmem_est - 4096) / factor,
+ 65536 / factor);
+ if (physmem_est > 65536)
+ nbuf += (physmem_est - 65536) * 2 / (factor * 5);
+
+ if (maxbcache && nbuf > maxbcache / BKVASIZE)
+ nbuf = maxbcache / BKVASIZE;
+ }
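+
+	/*
+	 * Worked example (editor's illustration; assumes BKVASIZE is 16K,
+	 * so factor == 64): with 128MB of RAM, physmem_est is 131072KB,
+	 * giving nbuf = 50 + min(126976 / 64, 65536 / 64) +
+	 * (131072 - 65536) * 2 / 320 = 50 + 1024 + 409 = 1483 buffers.
+	 */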
+
+#if 0
+ /*
+	 * Do not allow the buffer_map to be more than 1/2 the size of the
+ * kernel_map.
+ */
+ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) /
+ (BKVASIZE * 2)) {
+ nbuf = (kernel_map->max_offset - kernel_map->min_offset) /
+ (BKVASIZE * 2);
+ printf("Warning: nbufs capped at %d\n", nbuf);
+ }
+#endif
+
+ /*
+ * swbufs are used as temporary holders for I/O, such as paging I/O.
+	 * We have no fewer than 16 and no more than 256.
+ */
+ nswbuf = max(min(nbuf/4, 256), 16);
+
+ /*
+ * Reserve space for the buffer cache buffers
+ */
+ swbuf = (void *)v;
+ v = (caddr_t)(swbuf + nswbuf);
+ buf = (void *)v;
+ v = (caddr_t)(buf + nbuf);
+
+ /*
+ * Calculate the hash table size and reserve space
+ */
+ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
+ ;
+ bufhashtbl = (void *)v;
+ v = (caddr_t)(bufhashtbl + bufhashmask);
+ --bufhashmask;
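+
+	/*
+	 * Example (editor's illustration): with nbuf = 1024, nbuf / 4 = 256,
+	 * so bufhashmask doubles 8, 16, ... up to 256 entries and the final
+	 * mask becomes 255.
+	 */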
+
+ return(v);
+}
+
+/* Initialize the buffer subsystem. Called before use of any buffers. */
+void
+bufinit(void)
+{
+ struct buf *bp;
+ int i;
+
+ GIANT_REQUIRED;
+
+ LIST_INIT(&invalhash);
+ mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);
+
+ for (i = 0; i <= bufhashmask; i++)
+ LIST_INIT(&bufhashtbl[i]);
+
+ /* next, make a null set of free lists */
+ for (i = 0; i < BUFFER_QUEUES; i++)
+ TAILQ_INIT(&bufqueues[i]);
+
+ /* finally, initialize each buffer header and stick on empty q */
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL; /* we're just an empty header */
+ bp->b_dev = NODEV;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_EMPTY;
+ bp->b_xflags = 0;
+ LIST_INIT(&bp->b_dep);
+ BUF_LOCKINIT(bp);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ }
+
+ /*
+ * maxbufspace is the absolute maximum amount of buffer space we are
+ * allowed to reserve in KVM and in real terms. The absolute maximum
+ * is nominally used by buf_daemon. hibufspace is the nominal maximum
+ * used by most other processes. The differential is required to
+ * ensure that buf_daemon is able to run when other processes might
+ * be blocked waiting for buffer space.
+ *
+	 * maxbufspace is based on BKVASIZE. Allocating buffers larger than
+ * this may result in KVM fragmentation which is not handled optimally
+ * by the system.
+ */
+ maxbufspace = nbuf * BKVASIZE;
+ hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
+ lobufspace = hibufspace - MAXBSIZE;
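+
+	/*
+	 * Worked example (editor's illustration; assumes BKVASIZE is 16K and
+	 * MAXBSIZE is 64K): with nbuf = 1024, maxbufspace = 16MB,
+	 * hibufspace = imax(12MB, 16MB - 640KB) ~= 15.4MB and
+	 * lobufspace ~= 15.3MB.
+	 */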
+
+ lorunningspace = 512 * 1024;
+ hirunningspace = 1024 * 1024;
+
+/*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly for average
+ * (small) directories.
+ */
+ maxbufmallocspace = hibufspace / 20;
+
+/*
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
+ hidirtybuffers = nbuf / 4 + 20;
+ numdirtybuffers = 0;
+/*
+ * To support extreme low-memory systems, make sure hidirtybuffers cannot
+ * eat up all available buffer space. This occurs when our minimum cannot
+ * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
+ * BKVASIZE'd (8K) buffers.
+ */
+ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
+ hidirtybuffers >>= 1;
+ }
+ lodirtybuffers = hidirtybuffers / 2;
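+
+	/*
+	 * Example (editor's illustration): with nbuf = 1024, hidirtybuffers
+	 * starts at 276; since 276 * 16K is well under 3/4 of hibufspace in
+	 * the example above, no halving occurs and lodirtybuffers = 138.
+	 */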
+
+/*
+ * Try to keep the number of free buffers in the specified range,
+ * and give special processes (e.g. like buf_daemon) access to an
+ * emergency reserve.
+ */
+ lofreebuffers = nbuf / 18 + 5;
+ hifreebuffers = 2 * lofreebuffers;
+ numfreebuffers = nbuf;
+
+/*
+ * Maximum number of async ops initiated per buf_daemon loop. This is
+ * somewhat of a hack at the moment, we really need to limit ourselves
+ * based on the number of bytes of I/O in-transit that were initiated
+ * from buf_daemon.
+ */
+
+ bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
+ bogus_page = vm_page_alloc(kernel_object,
+ ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_NORMAL);
+ cnt.v_wire_count++;
+}
+
+/*
+ * bfreekva() - free the kva allocation for a buffer.
+ *
+ * Must be called at splbio() or higher as this is the only locking for
+ * buffer_map.
+ *
+ * Since this call frees up buffer space, we call bufspacewakeup().
+ */
+static void
+bfreekva(struct buf * bp)
+{
+ GIANT_REQUIRED;
+
+ if (bp->b_kvasize) {
+ ++buffreekvacnt;
+ bufspace -= bp->b_kvasize;
+ vm_map_delete(buffer_map,
+ (vm_offset_t) bp->b_kvabase,
+ (vm_offset_t) bp->b_kvabase + bp->b_kvasize
+ );
+ bp->b_kvasize = 0;
+ bufspacewakeup();
+ }
+}
+
+/*
+ * bremfree:
+ *
+ * Remove the buffer from the appropriate free list.
+ */
+void
+bremfree(struct buf * bp)
+{
+ int s = splbio();
+ int old_qindex = bp->b_qindex;
+
+ GIANT_REQUIRED;
+
+ if (bp->b_qindex != QUEUE_NONE) {
+ KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+ bp->b_qindex = QUEUE_NONE;
+ } else {
+ if (BUF_REFCNT(bp) <= 1)
+ panic("bremfree: removing a buffer not on a queue");
+ }
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+	 * delayed-write, and it was on the DIRTY, CLEAN, EMPTY, or EMPTYKVA
+	 * queues, the buffer was free and we must decrement numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ switch(old_qindex) {
+ case QUEUE_DIRTY:
+ case QUEUE_CLEAN:
+ case QUEUE_EMPTY:
+ case QUEUE_EMPTYKVA:
+ --numfreebuffers;
+ break;
+ default:
+ break;
+ }
+ }
+ splx(s);
+}
+
+
+/*
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything ( see
+ * getblk() ). This is really just a special case of breadn().
+ */
+int
+bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
+ struct buf ** bpp)
+{
+
+ return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
+}
+
+/*
+ * Operates like bread, but also starts asynchronous I/O on
+ * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior
+ * to initiating I/O . If B_CACHE is set, the buffer is valid
+ * and we do not have to do anything.
+ */
+int
+breadn(struct vnode * vp, daddr_t blkno, int size,
+ daddr_t * rablkno, int *rabsize,
+ int cnt, struct ucred * cred, struct buf ** bpp)
+{
+ struct buf *bp, *rabp;
+ int i;
+ int rv = 0, readwait = 0;
+
+ *bpp = bp = getblk(vp, blkno, size, 0, 0);
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (curthread != PCPU_GET(idlethread))
+ curthread->td_proc->p_stats->p_ru.ru_inblock++;
+ bp->b_iocmd = BIO_READ;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if (bp->b_rcred == NOCRED && cred != NOCRED)
+ bp->b_rcred = crhold(cred);
+ vfs_busy_pages(bp, 0);
+ VOP_STRATEGY(vp, bp);
+ ++readwait;
+ }
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (curthread != PCPU_GET(idlethread))
+ curthread->td_proc->p_stats->p_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ VOP_STRATEGY(vp, rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+
+ if (readwait) {
+ rv = bufwait(bp);
+ }
+ return (rv);
+}
+
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
+ */
+
+int
+bwrite(struct buf * bp)
+{
+ int oldflags, s;
+ struct buf *newbp;
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+
+ oldflags = bp->b_flags;
+
+ if (BUF_REFCNT(bp) == 0)
+ panic("bwrite: buffer is not busy???");
+ s = splbio();
+ /*
+ * If a background write is already in progress, delay
+ * writing this block if it is asynchronous. Otherwise
+ * wait for the background write to complete.
+ */
+ if (bp->b_xflags & BX_BKGRDINPROG) {
+ if (bp->b_flags & B_ASYNC) {
+ splx(s);
+ bdwrite(bp);
+ return (0);
+ }
+ bp->b_xflags |= BX_BKGRDWAIT;
+ tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0);
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("bwrite: still writing");
+ }
+
+ /* Mark the buffer clean */
+ bundirty(bp);
+
+ /*
+ * If this buffer is marked for background writing and we
+ * do not have to wait for it, make a copy and write the
+ * copy so as to leave this buffer ready for further use.
+ *
+ * This optimization eats a lot of memory. If we have a page
+ * or buffer shortfall we can't do it.
+ */
+ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
+ (bp->b_flags & B_ASYNC) &&
+ !vm_page_count_severe() &&
+ !buf_dirty_count_severe()) {
+ if (bp->b_iodone != NULL) {
+ printf("bp->b_iodone = %p\n", bp->b_iodone);
+ panic("bwrite: need chained iodone");
+ }
+
+ /* get a new block */
+ newbp = geteblk(bp->b_bufsize);
+
+ /* set it to be identical to the old block */
+ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
+ bgetvp(bp->b_vp, newbp);
+ newbp->b_lblkno = bp->b_lblkno;
+ newbp->b_blkno = bp->b_blkno;
+ newbp->b_offset = bp->b_offset;
+ newbp->b_iodone = vfs_backgroundwritedone;
+ newbp->b_flags |= B_ASYNC;
+ newbp->b_flags &= ~B_INVAL;
+
+ /* move over the dependencies */
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_movedeps(bp, newbp);
+
+ /*
+ * Initiate write on the copy, release the original to
+ * the B_LOCKED queue so that it cannot go away until
+ * the background write completes. If not locked it could go
+ * away and then be reconstituted while it was being written.
+ * If the reconstituted buffer were written, we could end up
+ * with two background copies being written at the same time.
+ */
+ bp->b_xflags |= BX_BKGRDINPROG;
+ bp->b_flags |= B_LOCKED;
+ bqrelse(bp);
+ bp = newbp;
+ }
+
+ bp->b_flags &= ~B_DONE;
+ bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_flags |= B_WRITEINPROG | B_CACHE;
+ bp->b_iocmd = BIO_WRITE;
+
+ bp->b_vp->v_numoutput++;
+ vfs_busy_pages(bp, 1);
+
+ /*
+ * Normal bwrites pipeline writes
+ */
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+
+ if (curthread != PCPU_GET(idlethread))
+ curthread->td_proc->p_stats->p_ru.ru_oublock++;
+ splx(s);
+ if (oldflags & B_ASYNC)
+ BUF_KERNPROC(bp);
+ BUF_STRATEGY(bp);
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = bufwait(bp);
+ brelse(bp);
+ return (rtval);
+ } else if ((oldflags & B_NOWDRAIN) == 0) {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. Deadlocks can occur only if a device strategy
+ * routine (like in MD) turns around and issues another
+ * high-level write, in which case B_NOWDRAIN is expected
+ * to be set. Otherwise we will not deadlock here because
+ * we are blocking waiting for I/O that is already in-progress
+ * to complete.
+ */
+ waitrunningbufspace();
+ }
+
+ return (0);
+}
+
+/*
+ * Complete a background write started from bwrite.
+ */
+static void
+vfs_backgroundwritedone(bp)
+ struct buf *bp;
+{
+ struct buf *origbp;
+
+ /*
+ * Find the original buffer that we are writing.
+ */
+ if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
+ panic("backgroundwritedone: lost buffer");
+ /*
+ * Process dependencies then return any unfinished ones.
+ */
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_complete(bp);
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_movedeps(bp, origbp);
+ /*
+ * Clear the BX_BKGRDINPROG flag in the original buffer
+ * and awaken it if it is waiting for the write to complete.
+ * If BX_BKGRDINPROG is not set in the original buffer it must
+ * have been released and re-instantiated - which is not legal.
+ */
+ KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
+ ("backgroundwritedone: lost buffer2"));
+ origbp->b_xflags &= ~BX_BKGRDINPROG;
+ if (origbp->b_xflags & BX_BKGRDWAIT) {
+ origbp->b_xflags &= ~BX_BKGRDWAIT;
+ wakeup(&origbp->b_xflags);
+ }
+ /*
+ * Clear the B_LOCKED flag and remove it from the locked
+ * queue if it currently resides there.
+ */
+ origbp->b_flags &= ~B_LOCKED;
+ if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+ bremfree(origbp);
+ bqrelse(origbp);
+ }
+ /*
+ * This buffer is marked B_NOCACHE, so when it is released
+ * by biodone, it will be tossed. We mark it with BIO_READ
+ * to avoid biodone doing a second vwakeup.
+ */
+ bp->b_flags |= B_NOCACHE;
+ bp->b_iocmd = BIO_READ;
+ bp->b_flags &= ~(B_CACHE | B_DONE);
+ bp->b_iodone = 0;
+ bufdone(bp);
+}
+
+/*
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
+ */
+void
+bdwrite(struct buf * bp)
+{
+ GIANT_REQUIRED;
+
+ if (BUF_REFCNT(bp) == 0)
+ panic("bdwrite: buffer is not busy");
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+ bdirty(bp);
+
+ /*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
+ * This bmap keeps the system from needing to do the bmap later,
+ * perhaps when the system is attempting to do a sync. Since it
+ * is likely that the indirect block -- or whatever other datastructure
+ * that the filesystem needs is still in memory now, it is a good
+ * thing to do this. Note also, that if the pageout daemon is
+ * requesting a sync -- there might not be enough memory to do
+ * the bmap then... So, this is important to do.
+ */
+ if (bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
+
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty pages.
+ */
+ vfs_setdirty(bp);
+
+ /*
+ * We need to do this here to satisfy the vnode_pager and the
+ * pageout daemon, so that it thinks that the pages have been
+ * "cleaned". Note that since the pages are in a delayed write
+ * buffer -- the VFS layer "will" see that the pages get written
+ * out on the next sync, or perhaps the cluster will be completed.
+ */
+ vfs_clean_pages(bp);
+ bqrelse(bp);
+
+ /*
+ * Wakeup the buffer flushing daemon if we have a lot of dirty
+ * buffers (midpoint between our recovery point and our stall
+ * point).
+ */
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+
+ /*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+}
+
+/*
+ * bdirty:
+ *
+ * Turn buffer into delayed write request. We must clear BIO_READ and
+ * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
+ * itself to properly update it in the dirty/clean lists. We mark it
+ * B_DONE to ensure that any asynchronization of the buffer properly
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * Must be called at splbio().
+ * The buffer must be on QUEUE_NONE.
+ */
+void
+bdirty(bp)
+ struct buf *bp;
+{
+ KASSERT(bp->b_qindex == QUEUE_NONE,
+ ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ bp->b_flags &= ~(B_RELBUF);
+ bp->b_iocmd = BIO_WRITE;
+
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= B_DONE | B_DELWRI;
+ reassignbuf(bp, bp->b_vp);
+ ++numdirtybuffers;
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+ }
+}
+
+/*
+ * bundirty:
+ *
+ * Clear B_DELWRI for buffer.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * Must be called at splbio().
+ * The buffer must be on QUEUE_NONE.
+ */
+
+void
+bundirty(bp)
+ struct buf *bp;
+{
+ KASSERT(bp->b_qindex == QUEUE_NONE,
+ ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
+
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags &= ~B_DELWRI;
+ reassignbuf(bp, bp->b_vp);
+ --numdirtybuffers;
+ numdirtywakeup(lodirtybuffers);
+ }
+ /*
+ * Since it is now being written, we can clear its deferred write flag.
+ */
+ bp->b_flags &= ~B_DEFERRED;
+}
+
+/*
+ * bawrite:
+ *
+ * Asynchronous write. Start output on a buffer, but do not wait for
+ * it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
+ */
+void
+bawrite(struct buf * bp)
+{
+ bp->b_flags |= B_ASYNC;
+ (void) BUF_WRITE(bp);
+}
+
+/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+
+void
+bwillwrite(void)
+{
+ if (numdirtybuffers >= hidirtybuffers) {
+ int s;
+
+ mtx_lock(&Giant);
+ s = splbio();
+ while (numdirtybuffers >= hidirtybuffers) {
+ bd_wakeup(1);
+ needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
+ tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
+ }
+ splx(s);
+ mtx_unlock(&Giant);
+ }
+}
+
+/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+ return(numdirtybuffers >= hidirtybuffers);
+}
+
+/*
+ * brelse:
+ *
+ * Release a busy buffer and, if requested, free its resources. The
+ * buffer will be stashed in the appropriate bufqueue[] allowing it
+ * to be accessed later as a cache entity or reused for other purposes.
+ */
+void
+brelse(struct buf * bp)
+{
+ int s;
+
+ GIANT_REQUIRED;
+
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ s = splbio();
+
+ if (bp->b_flags & B_LOCKED)
+ bp->b_ioflags &= ~BIO_ERROR;
+
+ if (bp->b_iocmd == BIO_WRITE &&
+ (bp->b_ioflags & BIO_ERROR) &&
+ !(bp->b_flags & B_INVAL)) {
+ /*
+ * Failed write, redirty. Must clear BIO_ERROR to prevent
+ * pages from being scrapped. If B_INVAL is set then
+ * this case is not run and the next case is run to
+ * destroy the buffer. B_INVAL can occur if the buffer
+ * is outside the range supported by the underlying device.
+ */
+ bp->b_ioflags &= ~BIO_ERROR;
+ bdirty(bp);
+ } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
+ (bp->b_ioflags & BIO_ERROR) ||
+ bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
+ bp->b_flags |= B_INVAL;
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_deallocate(bp);
+ if (bp->b_flags & B_DELWRI) {
+ --numdirtybuffers;
+ numdirtywakeup(lodirtybuffers);
+ }
+ bp->b_flags &= ~(B_DELWRI | B_CACHE);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+ }
+
+ /*
+ * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
+ * is called with B_DELWRI set, the underlying pages may wind up
+ * getting freed causing a previous write (bdwrite()) to get 'lost'
+ * because pages associated with a B_DELWRI bp are marked clean.
+ *
+ * We still allow the B_INVAL case to call vfs_vmio_release(), even
+ * if B_DELWRI is set.
+ *
+ * If B_DELWRI is not set we may have to set B_RELBUF if we are low
+ * on pages to return pages to the VM page queues.
+ */
+ if (bp->b_flags & B_DELWRI)
+ bp->b_flags &= ~B_RELBUF;
+ else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
+ bp->b_flags |= B_RELBUF;
+
+ /*
+ * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
+	 * constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
+ *
+ * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. BIO_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
+ *
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer. If the
+ * buffer has a background write in progress, we need to keep it
+ * around to prevent it from being reconstituted and starting a second
+ * background write.
+ */
+ if ((bp->b_flags & B_VMIO)
+ && !(bp->b_vp->v_tag == VT_NFS &&
+ !vn_isdisk(bp->b_vp, NULL) &&
+ (bp->b_flags & B_DELWRI))
+ ) {
+
+ int i, j, resid;
+ vm_page_t m;
+ off_t foff;
+ vm_pindex_t poff;
+ vm_object_t obj;
+ struct vnode *vp;
+
+ vp = bp->b_vp;
+
+ /*
+ * Get the base offset and length of the buffer. Note that
+ * in the VMIO case if the buffer block size is not
+ * page-aligned then b_data pointer may not be page-aligned.
+ * But our b_pages[] array *IS* page aligned.
+ *
+		 * block sizes less than DEV_BSIZE (usually 512) are not
+ * supported due to the page granularity bits (m->valid,
+ * m->dirty, etc...).
+ *
+ * See man buf(9) for more information
+ */
+ resid = bp->b_bufsize;
+ foff = bp->b_offset;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ int had_bogus = 0;
+
+ m = bp->b_pages[i];
+ vm_page_flag_clear(m, PG_ZERO);
+
+ /*
+ * If we hit a bogus page, fixup *all* the bogus pages
+ * now.
+ */
+ if (m == bogus_page) {
+ VOP_GETVOBJECT(vp, &obj);
+ poff = OFF_TO_IDX(bp->b_offset);
+ had_bogus = 1;
+
+ for (j = i; j < bp->b_npages; j++) {
+ vm_page_t mtmp;
+ mtmp = bp->b_pages[j];
+ if (mtmp == bogus_page) {
+ mtmp = vm_page_lookup(obj, poff + j);
+ if (!mtmp) {
+ panic("brelse: page missing\n");
+ }
+ bp->b_pages[j] = mtmp;
+ }
+ }
+
+ if ((bp->b_flags & B_INVAL) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ m = bp->b_pages[i];
+ }
+ if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
+ int poffset = foff & PAGE_MASK;
+ int presid = resid > (PAGE_SIZE - poffset) ?
+ (PAGE_SIZE - poffset) : resid;
+
+ KASSERT(presid >= 0, ("brelse: extra page"));
+ vm_page_set_invalid(m, poffset, presid);
+ if (had_bogus)
+ printf("avoided corruption bug in bogus_page/brelse code\n");
+ }
+ resid -= PAGE_SIZE - (foff & PAGE_MASK);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+
+ } else if (bp->b_flags & B_VMIO) {
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF)) {
+ vfs_vmio_release(bp);
+ }
+
+ }
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("brelse: free buffer onto another queue???");
+ if (BUF_REFCNT(bp) > 1) {
+ /* do not release to free list */
+ BUF_UNLOCK(bp);
+ splx(s);
+ return;
+ }
+
+ /* enqueue */
+
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ bp->b_flags |= B_INVAL;
+ bp->b_xflags &= ~BX_BKGRDWRITE;
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_kvasize) {
+ bp->b_qindex = QUEUE_EMPTYKVA;
+ } else {
+ bp->b_qindex = QUEUE_EMPTY;
+ }
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+ /* buffers with junk contents */
+ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
+ (bp->b_ioflags & BIO_ERROR)) {
+ bp->b_flags |= B_INVAL;
+ bp->b_xflags &= ~BX_BKGRDWRITE;
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 2");
+ bp->b_qindex = QUEUE_CLEAN;
+ TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+
+ /* buffers that are locked */
+ } else if (bp->b_flags & B_LOCKED) {
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+
+ /* remaining buffers */
+ } else {
+ if (bp->b_flags & B_DELWRI)
+ bp->b_qindex = QUEUE_DIRTY;
+ else
+ bp->b_qindex = QUEUE_CLEAN;
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+ }
+
+ /*
+ * If B_INVAL, clear B_DELWRI. We've already placed the buffer
+ * on the correct queue.
+ */
+ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
+ bundirty(bp);
+
+ /*
+ * Fixup numfreebuffers count. The bp is on an appropriate queue
+ * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
+ * We've already handled the B_INVAL case ( B_DELWRI will be clear
+ * if B_INVAL is set ).
+ */
+
+ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
+ bufcountwakeup();
+
+ /*
+ * Something we can maybe free or reuse
+ */
+ if (bp->b_bufsize || bp->b_kvasize)
+ bufspacewakeup();
+
+ /* unlock */
+ BUF_UNLOCK(bp);
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
+ B_DIRECT | B_NOWDRAIN);
+ if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
+ panic("brelse: not dirty");
+ splx(s);
+}
+
+/*
+ * Release a buffer back to the appropriate queue but do not try to free
+ * it. The buffer is expected to be used again soon.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
+ */
+void
+bqrelse(struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("bqrelse: free buffer onto another queue???");
+ if (BUF_REFCNT(bp) > 1) {
+ /* do not release to free list */
+ BUF_UNLOCK(bp);
+ splx(s);
+ return;
+ }
+ if (bp->b_flags & B_LOCKED) {
+ bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+ /* buffers with stale but valid contents */
+ } else if (bp->b_flags & B_DELWRI) {
+ bp->b_qindex = QUEUE_DIRTY;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are too low on memory, we have to try to free the
+ * buffer (most importantly: the wired pages making up its
+ * backing store) *now*.
+ */
+ splx(s);
+ brelse(bp);
+ return;
+ } else {
+ bp->b_qindex = QUEUE_CLEAN;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+ }
+
+ if ((bp->b_flags & B_LOCKED) == 0 &&
+ ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
+ bufcountwakeup();
+ }
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ /* unlock */
+ BUF_UNLOCK(bp);
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
+ splx(s);
+}
+
+/* Give pages used by the bp back to the VM system (where possible) */
+static void
+vfs_vmio_release(bp)
+ struct buf *bp;
+{
+ int i;
+ vm_page_t m;
+
+ GIANT_REQUIRED;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ bp->b_pages[i] = NULL;
+ /*
+ * In order to keep page LRU ordering consistent, put
+ * everything on the inactive queue.
+ */
+ vm_page_unwire(m, 0);
+ /*
+ * We don't mess with busy pages, it is
+ * the responsibility of the process that
+ * busied the pages to deal with them.
+ */
+ if ((m->flags & PG_BUSY) || (m->busy != 0))
+ continue;
+
+ if (m->wire_count == 0) {
+ vm_page_flag_clear(m, PG_ZERO);
+ /*
+ * Might as well free the page if we can and it has
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O
+ */
+ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
+ m->hold_count == 0) {
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ } else if (bp->b_flags & B_DIRECT) {
+ vm_page_try_to_free(m);
+ } else if (vm_page_count_severe()) {
+ vm_page_try_to_cache(m);
+ }
+ }
+ }
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+
+ if (bp->b_bufsize) {
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_npages = 0;
+ bp->b_flags &= ~B_VMIO;
+ if (bp->b_vp)
+ brelvp(bp);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+gbincore(struct vnode * vp, daddr_t blkno)
+{
+ struct buf *bp;
+ struct bufhashhdr *bh;
+
+ bh = bufhash(vp, blkno);
+
+ /* Search hash chain */
+ LIST_FOREACH(bp, bh, b_hash) {
+ /* hit */
+ if (bp->b_vp == vp && bp->b_lblkno == blkno &&
+ (bp->b_flags & B_INVAL) == 0) {
+ break;
+ }
+ }
+ return (bp);
+}
+
+/*
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better than the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
+ */
+int
+vfs_bio_awrite(struct buf * bp)
+{
+ int i;
+ int j;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int s;
+ int ncl;
+ struct buf *bpa;
+ int nwritten;
+ int size;
+ int maxcl;
+
+ s = splbio();
+ /*
+ * right now we support clustered writing only to regular files. If
+	 * Right now we support clustered writing only to regular files. If
+	 * we find a clusterable block we could be in the middle of a cluster
+	 * rather than at the beginning.
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
+
+ for (i = 1; i < maxcl; i++) {
+ if ((bpa = gbincore(vp, lblkno + i)) &&
+ BUF_REFCNT(bpa) == 0 &&
+ ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno !=
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
+ if ((bpa = gbincore(vp, lblkno - j)) &&
+ BUF_REFCNT(bpa) == 0 &&
+ ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno !=
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ --j;
+ ncl = i + j;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
+ splx(s);
+ return nwritten;
+ }
+ }
+
+ BUF_LOCK(bp, LK_EXCLUSIVE);
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+
+ splx(s);
+ /*
+ * default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
+ */
+ nwritten = bp->b_bufsize;
+ (void) BUF_WRITE(bp);
+
+ return nwritten;
+}
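+
+/*
+ * Example (editor's illustration; assumes MAXPHYS is 128K): on a
+ * filesystem with an 8K block size, maxcl = 16, so up to 16 contiguous
+ * delayed-write blocks around bp can be gathered into a single
+ * cluster_wbuild() call instead of 16 separate writes.
+ */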
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_map is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
+ * Instead we ask the buf daemon to do it for us. We attempt to
+ * avoid piecemeal wakeups of the pageout daemon.
+ */
+
+static struct buf *
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+{
+ struct buf *bp;
+ struct buf *nbp;
+ int defrag = 0;
+ int nqindex;
+ static int flushingbufs;
+
+ GIANT_REQUIRED;
+
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+	 * async I/O rather than sync I/O.
+ */
+
+ ++getnewbufcalls;
+ --getnewbufrestarts;
+restart:
+ ++getnewbufrestarts;
+
+ /*
+ * Setup for scan. If we do not have enough free buffers,
+	 * we set up a degenerate case that immediately fails. Note
+	 * that if we are a specially marked process, we are allowed to
+ * dip into our reserves.
+ *
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ *
+ * We start with EMPTYKVA. If the list is empty we backup to EMPTY.
+ * However, there are a number of cases (defragging, reusing, ...)
+ * where we cannot backup.
+ */
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+
+ if (nbp == NULL) {
+ /*
+ * If no EMPTYKVA buffers and we are either
+ * defragging or reusing, locate a CLEAN buffer
+		 * to free or reuse. If bufspace usage is low
+ * skip this step so we can allocate a new buffer.
+ */
+ if (defrag || bufspace >= lobufspace) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * If we could not find or were not allowed to reuse a
+ * CLEAN buffer, check to see if it is ok to use an EMPTY
+ * buffer. We can only use an EMPTY buffer if allocating
+ * its KVA would not otherwise run us out of buffer space.
+ */
+ if (nbp == NULL && defrag == 0 &&
+ bufspace + maxsize < hibufspace) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+ }
+
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending.
+ */
+
+ while ((bp = nbp) != NULL) {
+ int qindex = nqindex;
+
+ /*
+ * Calculate next bp ( we can only use it if we do not block
+ * or do other fancy things ).
+ */
+ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
+ switch(qindex) {
+ case QUEUE_EMPTY:
+ nqindex = QUEUE_EMPTYKVA;
+ if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+ break;
+ /* fall through */
+ case QUEUE_EMPTYKVA:
+ nqindex = QUEUE_CLEAN;
+ if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+ break;
+ /* fall through */
+ case QUEUE_CLEAN:
+ /*
+ * nbp is NULL.
+ */
+ break;
+ }
+ }
+
+ /*
+ * Sanity Checks
+ */
+		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+
+ /*
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
+ */
+
+ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+
+ /*
+ * If we are defragging then we need a buffer with
+ * b_kvasize != 0. XXX this situation should no longer
+ * occur, if defrag is non-zero the buffer's b_kvasize
+ * should also be non-zero at this point. XXX
+ */
+ if (defrag && bp->b_kvasize == 0) {
+ printf("Warning: defrag empty buffer %p\n", bp);
+ continue;
+ }
+
+ /*
+ * Start freeing the bp. This is somewhat involved. nbp
+ * remains valid only for QUEUE_EMPTY[KVA] bp's.
+ */
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
+ panic("getnewbuf: locked buf");
+ bremfree(bp);
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ *
+ * Get the rest of the buffer freed up. b_kva* is still
+ * valid after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_deallocate(bp);
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 3");
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags = 0;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ bp->b_dev = NODEV;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_magic = B_MAGIC_BIO;
+ bp->b_op = &buf_ops_bio;
+
+ LIST_INIT(&bp->b_dep);
+
+ /*
+ * If we are defragging then free the buffer.
+ */
+ if (defrag) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ defrag = 0;
+ goto restart;
+ }
+
+ /*
+		 * If we are overcommitted then recover the buffer and its
+ * KVM space. This occurs in rare situations when multiple
+ * processes are blocked in getnewbuf() or allocbuf().
+ */
+ if (bufspace >= hibufspace)
+ flushingbufs = 1;
+ if (flushingbufs && bp->b_kvasize != 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+ if (bufspace < lobufspace)
+ flushingbufs = 0;
+ break;
+ }
+
+ /*
+ * If we exhausted our list, sleep as appropriate. We may have to
+ * wakeup various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
+ */
+
+ if (bp == NULL) {
+ int flags;
+ char *waitmsg;
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+
+ bd_speedup(); /* heeeelp */
+
+ needsbuffer |= flags;
+ while (needsbuffer & flags) {
+ if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
+ waitmsg, slptimeo))
+ return (NULL);
+ }
+ } else {
+ /*
+ * We finally have a valid bp. We aren't quite out of the
+ * woods, we still have to reserve kva space. In order
+ * to keep fragmentation sane we only allocate kva in
+ * BKVASIZE chunks.
+ */
+ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
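+
+		/*
+		 * Example (editor's illustration; assumes BKVASIZE is 16K):
+		 * a 9000 byte request rounds up to 16384 bytes of KVA, a
+		 * 20000 byte request to 32768.
+		 */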
+
+ if (maxsize != bp->b_kvasize) {
+ vm_offset_t addr = 0;
+
+ bfreekva(bp);
+
+ if (vm_map_findspace(buffer_map,
+ vm_map_min(buffer_map), maxsize, &addr)) {
+ /*
+				 * Uh oh. Buffer map is too fragmented. We
+ * must defragment the map.
+ */
+ ++bufdefragcnt;
+ defrag = 1;
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto restart;
+ }
+ if (addr) {
+ vm_map_insert(buffer_map, NULL, 0,
+ addr, addr + maxsize,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+
+ bp->b_kvabase = (caddr_t) addr;
+ bp->b_kvasize = maxsize;
+ bufspace += bp->b_kvasize;
+ ++bufreusecnt;
+ }
+ }
+ bp->b_data = bp->b_kvabase;
+ }
+ return(bp);
+}
+
+/*
+ * buf_daemon:
+ *
+ * buffer flushing daemon. Buffers are normally flushed by the
+ * update daemon but if it cannot keep up this process starts to
+ * take the load in an attempt to prevent getnewbuf() from blocking.
+ */
+
+static struct proc *bufdaemonproc;
+
+static struct kproc_desc buf_kp = {
+ "bufdaemon",
+ buf_daemon,
+ &bufdaemonproc
+};
+SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
+
+static void
+buf_daemon()
+{
+ int s;
+
+ mtx_lock(&Giant);
+
+ /*
+ * This process needs to be suspended prior to shutdown sync.
+ */
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
+ SHUTDOWN_PRI_LAST);
+
+ /*
+ * This process is allowed to take the buffer cache to the limit
+ */
+ s = splbio();
+
+ for (;;) {
+ kthread_suspend_check(bufdaemonproc);
+
+ bd_request = 0;
+
+ /*
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system. Wakeup any waiting processes before we
+ * normally would so they can run in parallel with our drain.
+ */
+ while (numdirtybuffers > lodirtybuffers) {
+ if (flushbufqueues() == 0)
+ break;
+ waitrunningbufspace();
+ numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
+ }
+
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 1 second and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep half a second.
+ * Otherwise we loop immediately.
+ */
+ if (numdirtybuffers <= lodirtybuffers) {
+ /*
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
+ */
+ bd_request = 0;
+ tsleep(&bd_request, PVM, "psleep", hz);
+ } else {
+ /*
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
+ */
+ tsleep(&bd_request, PVM, "qsleep", hz / 2);
+ }
+ }
+}
+
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of write them, which NFS is
+ * particularly sensitive to.
+ */
+
+static int
+flushbufqueues(void)
+{
+ struct buf *bp;
+ int r = 0;
+
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
+
+ while (bp) {
+ KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
+ if ((bp->b_flags & B_DELWRI) != 0 &&
+ (bp->b_xflags & BX_BKGRDINPROG) == 0) {
+ if (bp->b_flags & B_INVAL) {
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
+ panic("flushbufqueues: locked buf");
+ bremfree(bp);
+ brelse(bp);
+ ++r;
+ break;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL &&
+ (bp->b_flags & B_DEFERRED) == 0 &&
+ buf_countdeps(bp, 0)) {
+ TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
+ bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
+ bp, b_freelist);
+ bp->b_flags |= B_DEFERRED;
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
+ continue;
+ }
+ vfs_bio_awrite(bp);
+ ++r;
+ break;
+ }
+ bp = TAILQ_NEXT(bp, b_freelist);
+ }
+ return (r);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+incore(struct vnode * vp, daddr_t blkno)
+{
+ struct buf *bp;
+
+ int s = splbio();
+ bp = gbincore(vp, blkno);
+ splx(s);
+ return (bp);
+}
+
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
+ */
+
+int
+inmem(struct vnode * vp, daddr_t blkno)
+{
+ vm_object_t obj;
+ vm_offset_t toff, tinc, size;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ GIANT_REQUIRED;
+
+ if (incore(vp, blkno))
+ return 1;
+ if (vp->v_mount == NULL)
+ return 0;
+ if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
+ return 0;
+
+ size = PAGE_SIZE;
+ if (size > vp->v_mount->mnt_stat.f_iosize)
+ size = vp->v_mount->mnt_stat.f_iosize;
+ off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ goto notinmem;
+ tinc = size;
+ if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
+ tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
+ if (vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
+ goto notinmem;
+ }
+ return 1;
+
+notinmem:
+ return (0);
+}
+
+/*
+ * vfs_setdirty:
+ *
+ * Sets the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer.
+ *
+ * The range is limited to the size of the buffer.
+ *
+ * This routine is primarily used by NFS, but is generalized for the
+ * B_VMIO case.
+ */
+static void
+vfs_setdirty(struct buf *bp)
+{
+ int i;
+ vm_object_t object;
+
+ GIANT_REQUIRED;
+ /*
+ * Degenerate case - empty buffer
+ */
+
+ if (bp->b_bufsize == 0)
+ return;
+
+ /*
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet. The OBJ_WRITEABLE flag
+ * is not cleared simply by protecting pages off.
+ */
+
+ if ((bp->b_flags & B_VMIO) == 0)
+ return;
+
+ object = bp->b_pages[0]->object;
+
+ if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
+ printf("Warning: object %p writeable but not mightbedirty\n", object);
+ if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
+ printf("Warning: object %p mightbedirty but not writeable\n", object);
+
+ if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
+ vm_page_test_dirty(bp->b_pages[i]);
+ }
+
+ /*
+ * Calculate the encompassing dirty range, boffset and eoffset,
+ * (eoffset - boffset) bytes.
+ */
+
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty)
+ break;
+ }
+ boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
+ }
+}
+
+/*
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffer's B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
+ */
+struct buf *
+getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
+{
+ struct buf *bp;
+ int s;
+ struct bufhashhdr *bh;
+
+ if (size > MAXBSIZE)
+ panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+
+ s = splbio();
+loop:
+ /*
+ * Block if we are low on buffers. Certain processes are allowed
+ * to completely exhaust the buffer cache.
+ *
+ * If this check ever becomes a bottleneck it may be better to
+ * move it into the else, when gbincore() fails. At the moment
+ * it isn't a problem.
+ *
+ * XXX remove if 0 sections (clean this up after it's proven)
+ */
+ if (numfreebuffers == 0) {
+ if (curthread == PCPU_GET(idlethread))
+ return NULL;
+ needsbuffer |= VFS_BIO_NEED_ANY;
+ }
+
+ if ((bp = gbincore(vp, blkno))) {
+ /*
+ * Buffer is in-core. If the buffer is not busy, it must
+ * be on a queue.
+ */
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
+ "getblk", slpflag, slptimeo) == ENOLCK)
+ goto loop;
+ splx(s);
+ return (struct buf *) NULL;
+ }
+
+ /*
+ * The buffer is locked. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
+ bremfree(bp);
+
+ /*
+ * Check for size inconsistencies for the non-VMIO case.
+ */
+
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)) {
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ BUF_WRITE(bp);
+ } else {
+ if ((bp->b_flags & B_VMIO) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ BUF_WRITE(bp);
+ }
+ }
+ goto loop;
+ }
+ }
+
+ /*
+ * If the size is inconsistent in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
+ */
+
+ if (bp->b_bcount != size)
+ allocbuf(bp, size);
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("getblk: no buffer offset"));
+
+ /*
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
+ *
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ BUF_WRITE(bp);
+ goto loop;
+ }
+
+ splx(s);
+ bp->b_flags &= ~B_DONE;
+ } else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is locked. Note that the returned
+ * buffer is also considered valid (not marked B_INVAL).
+ */
+ int bsize, maxsize, vmio;
+ off_t offset;
+
+ if (vn_isdisk(vp, NULL))
+ bsize = DEV_BSIZE;
+ else if (vp->v_mountedhere)
+ bsize = vp->v_mountedhere->mnt_stat.f_iosize;
+ else if (vp->v_mount)
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ else
+ bsize = size;
+
+ offset = blkno * bsize;
+ vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
+ maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+ maxsize = imax(maxsize, bsize);
+
+ if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
+ if (slpflag || slptimeo) {
+ splx(s);
+ return NULL;
+ }
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created. There is no window
+ * race because we are safely running at splbio() from the
+ * point of the duplicate buffer creation through to here,
+ * and we've locked the buffer.
+ */
+ if (gbincore(vp, blkno)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bp->b_offset = offset;
+
+ bgetvp(vp, bp);
+ LIST_REMOVE(bp, b_hash);
+ bh = bufhash(vp, blkno);
+ LIST_INSERT_HEAD(bh, bp, b_hash);
+
+ /*
+ * Set the B_VMIO bit. Use allocbuf() to grow the buffer. Since the
+ * buffer size starts out as 0, B_CACHE will be set by
+ * allocbuf() for the VMIO case prior to it testing the
+ * backing store for validity.
+ */
+
+ if (vmio) {
+ bp->b_flags |= B_VMIO;
+#if defined(VFS_BIO_DEBUG)
+ if (vp->v_type != VREG)
+ printf("getblk: vmioing file type %d???\n", vp->v_type);
+#endif
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ }
+
+ allocbuf(bp, size);
+
+ splx(s);
+ bp->b_flags &= ~B_DONE;
+ }
+ return (bp);
+}
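
The B_CACHE protocol described above is easier to see from the caller's side. The fragment below is an illustrative sketch only, assumed to live inside a filesystem read path that already has vp, blkno, and bsize in scope; fs_start_read() is a hypothetical stand-in for the filesystem's real strategy call, and locking and error handling are reduced to the minimum:

	/*
	 * Illustrative sketch: obtain a fully valid block using the
	 * B_CACHE/B_INVAL protocol documented for getblk().
	 * fs_start_read() is a hypothetical routine that queues the
	 * actual read I/O for the filesystem.
	 */
	struct buf *bp;
	int error;

	bp = getblk(vp, blkno, bsize, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Data not known valid: clear B_INVAL/BIO_ERROR and read. */
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		bp->b_iocmd = BIO_READ;
		fs_start_read(bp);		/* hypothetical strategy call */
		error = bufwait(bp);		/* biodone() sets B_CACHE on success */
		if (error) {
			brelse(bp);
			return (error);
		}
	}
	/* bp now holds valid data; release it for reuse when done. */
	bqrelse(bp);
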
+
+/*
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
+ */
+struct buf *
+geteblk(int size)
+{
+ struct buf *bp;
+ int s;
+ int maxsize;
+
+ maxsize = (size + BKVAMASK) & ~BKVAMASK;
+
+ s = splbio();
+ while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
+ splx(s);
+ allocbuf(bp, size);
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ return (bp);
+}
+
+
+/*
+ * This code obtains the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
+ */
+
+int
+allocbuf(struct buf *bp, int size)
+{
+ int newbsize, mbsize;
+ int i;
+
+ GIANT_REQUIRED;
+
+ if (BUF_REFCNT(bp) == 0)
+ panic("allocbuf: buffer not busy");
+
+ if (bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+
+ if ((bp->b_flags & B_VMIO) == 0) {
+ caddr_t origbuf;
+ int origbufsize;
+ /*
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
+ */
+ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ if (bp->b_flags & B_MALLOC)
+ newbsize = mbsize;
+ else
+ newbsize = round_page(size);
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (bp->b_flags & B_MALLOC) {
+ if (newbsize) {
+ bp->b_bcount = size;
+ } else {
+ free(bp->b_data, M_BIOBUF);
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_data = bp->b_kvabase;
+ bp->b_bcount = 0;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return 1;
+ }
+ vm_hold_free_pages(
+ bp,
+ (vm_offset_t) bp->b_data + newbsize,
+ (vm_offset_t) bp->b_data + bp->b_bufsize);
+ } else if (newbsize > bp->b_bufsize) {
+ /*
+ * We only use malloced memory on the first allocation,
+ * and revert to page-allocated memory when the buffer
+ * grows.
+ */
+ if ( (bufmallocspace < maxbufmallocspace) &&
+ (bp->b_bufsize == 0) &&
+ (mbsize <= PAGE_SIZE/2)) {
+
+ bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
+ bp->b_bufsize = mbsize;
+ bp->b_bcount = size;
+ bp->b_flags |= B_MALLOC;
+ bufmallocspace += mbsize;
+ return 1;
+ }
+ origbuf = NULL;
+ origbufsize = 0;
+ /*
+ * If the buffer is growing on its other-than-first allocation,
+ * then we revert to the page-allocation scheme.
+ */
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+ vm_hold_load_pages(
+ bp,
+ (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+ if (origbuf) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+ }
+ } else {
+ vm_page_t m;
+ int desiredpages;
+
+ newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ desiredpages = (size == 0) ? 0 :
+ num_pages((bp->b_offset & PAGE_MASK) + newbsize);
+
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+ * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
+ if (desiredpages < bp->b_npages) {
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ /*
+ * the page is not freed here -- it
+ * is the responsibility of
+ * vnode_pager_setsize
+ */
+ m = bp->b_pages[i];
+ KASSERT(m != bogus_page,
+ ("allocbuf: bogus page found"));
+ while (vm_page_sleep_busy(m, TRUE, "biodep"))
+ ;
+
+ bp->b_pages[i] = NULL;
+ vm_page_unwire(m, 0);
+ }
+ pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
+ bp->b_npages = desiredpages;
+ }
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
+ struct vnode *vp;
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
+
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+
+ vp = bp->b_vp;
+ VOP_GETVOBJECT(vp, &obj);
+
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+ vm_pindex_t pi;
+
+ pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
+ if ((m = vm_page_lookup(obj, pi)) == NULL) {
+ /*
+ * note: must allocate system pages
+ * since blocking here could interfere
+ * with paging I/O, no matter which
+ * process we are.
+ */
+ m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
+ if (m == NULL) {
+ VM_WAIT;
+ vm_pageout_deficit += desiredpages - bp->b_npages;
+ } else {
+ vm_page_wire(m);
+ vm_page_wakeup(m);
+ bp->b_flags &= ~B_CACHE;
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
+ }
+ continue;
+ }
+
+ /*
+ * We found a page. If we have to sleep on it,
+ * retry because it might have gotten freed out
+ * from under us.
+ *
+ * We can only test PG_BUSY here. Blocking on
+ * m->busy might lead to a deadlock:
+ *
+ * vm_fault->getpages->cluster_read->allocbuf
+ *
+ */
+
+ if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
+ continue;
+
+ /*
+ * We have a good page. Should we wakeup the
+ * page daemon?
+ */
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))) {
+ pagedaemon_wakeup();
+ }
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
+ }
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+ * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+
+ /*
+ * Step 3, fixup the KVM pmap. Remember that
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+
+ bp->b_data = (caddr_t)
+ trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter(
+ (vm_offset_t)bp->b_data,
+ bp->b_pages,
+ bp->b_npages
+ );
+
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
+ }
+ }
+ if (newbsize < bp->b_bufsize)
+ bufspacewakeup();
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
+ return 1;
+}
+
+/*
+ * bufwait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left locked and B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
+ */
+int
+bufwait(register struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+ while ((bp->b_flags & B_DONE) == 0) {
+ if (bp->b_iocmd == BIO_READ)
+ tsleep(bp, PRIBIO, "biord", 0);
+ else
+ tsleep(bp, PRIBIO, "biowr", 0);
+ }
+ splx(s);
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_ioflags & BIO_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Callback function from struct bio back up to struct buf.
+ * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY().
+ */
+void
+bufdonebio(struct bio *bp)
+{
+ bufdone(bp->bio_caller2);
+}
+
+/*
+ * bufdone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ * read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
+ */
+void
+bufdone(struct buf *bp)
+{
+ int s, error;
+ void (*biodone)(struct buf *);
+
+ GIANT_REQUIRED;
+
+ s = splbio();
+
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+ KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
+
+ bp->b_flags |= B_DONE;
+ runningbufwakeup(bp);
+
+ if (bp->b_iocmd == BIO_DELETE) {
+ brelse(bp);
+ splx(s);
+ return;
+ }
+
+ if (bp->b_iocmd == BIO_WRITE) {
+ vwakeup(bp);
+ }
+
+ /* call optional completion function if requested */
+ if (bp->b_iodone != NULL) {
+ biodone = bp->b_iodone;
+ bp->b_iodone = NULL;
+ (*biodone) (bp);
+ splx(s);
+ return;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL)
+ buf_complete(bp);
+
+ if (bp->b_flags & B_VMIO) {
+ int i;
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ int iosize;
+ struct vnode *vp = bp->b_vp;
+
+ error = VOP_GETVOBJECT(vp, &obj);
+
+#if defined(VFS_BIO_DEBUG)
+ if (vp->v_usecount == 0) {
+ panic("biodone: zero vnode ref count");
+ }
+
+ if (error) {
+ panic("biodone: missing VM object");
+ }
+
+ if ((vp->v_flag & VOBJBUF) == 0) {
+ panic("biodone: vnode is not setup for merged cache");
+ }
+#endif
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("biodone: no buffer offset"));
+
+ if (error) {
+ panic("biodone: no object");
+ }
+#if defined(VFS_BIO_DEBUG)
+ if (obj->paging_in_progress < bp->b_npages) {
+ printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
+ obj->paging_in_progress, bp->b_npages);
+ }
+#endif
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+ * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount - bp->b_resid;
+ if (bp->b_iocmd == BIO_READ &&
+ !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
+ !(bp->b_ioflags & BIO_ERROR)) {
+ bp->b_flags |= B_CACHE;
+ }
+
+ for (i = 0; i < bp->b_npages; i++) {
+ int bogusflag = 0;
+ int resid;
+
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogusflag = 1;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (m == NULL)
+ panic("biodone: page disappeared!");
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+#if defined(VFS_BIO_DEBUG)
+ if (OFF_TO_IDX(foff) != m->pindex) {
+ printf(
+"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
+ (unsigned long)foff, m->pindex);
+ }
+#endif
+
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
+ */
+ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
+ vfs_page_set_valid(bp, foff, i, m);
+ }
+ vm_page_flag_clear(m, PG_ZERO);
+
+ /*
+ * When debugging new filesystems or buffer I/O methods, this
+ * is the most common error that pops up. If you see this, you
+ * have not set the page busy flag correctly!!!
+ */
+ if (m->busy == 0) {
+ printf("biodone: page busy < 0, "
+ "pindex: %d, foff: 0x(%x,%x), "
+ "resid: %d, index: %d\n",
+ (int) m->pindex, (int)(foff >> 32),
+ (int) foff & 0xffffffff, resid, i);
+ if (!vn_isdisk(vp, NULL))
+ printf(" iosize: %ld, lblkno: %jd, flags: 0x%lx, npages: %d\n",
+ bp->b_vp->v_mount->mnt_stat.f_iosize,
+ (intmax_t) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ else
+ printf(" VDEV, lblkno: %jd, flags: 0x%lx, npages: %d\n",
+ (intmax_t) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
+ m->valid, m->dirty, m->wire_count);
+ panic("biodone: page busy < 0\n");
+ }
+ vm_page_io_finish(m);
+ vm_object_pip_subtract(obj, 1);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ iosize -= resid;
+ }
+ if (obj)
+ vm_object_pip_wakeupn(obj, 0);
+ }
+
+ /*
+ * For asynchronous completions, release the buffer now. The brelse
+ * will do a wakeup there if necessary - so no need to do a wakeup
+ * here in the async case. The sync case always needs to do a wakeup.
+ */
+
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
+ brelse(bp);
+ else
+ bqrelse(bp);
+ } else {
+ wakeup(bp);
+ }
+ splx(s);
+}
+
+/*
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
+ */
+void
+vfs_unbusy_pages(struct buf * bp)
+{
+ int i;
+
+ GIANT_REQUIRED;
+
+ runningbufwakeup(bp);
+ if (bp->b_flags & B_VMIO) {
+ struct vnode *vp = bp->b_vp;
+ vm_object_t obj;
+
+ VOP_GETVOBJECT(vp, &obj);
+
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
+ if (!m) {
+ panic("vfs_unbusy_pages: page missing\n");
+ }
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ vm_object_pip_subtract(obj, 1);
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_io_finish(m);
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ }
+}
+
+/*
+ * vfs_page_set_valid:
+ *
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
+ *
+ * This routine is typically called after a read completes.
+ */
+static void
+vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
+{
+ vm_ooffset_t soff, eoff;
+
+ GIANT_REQUIRED;
+ /*
+ * Start and end offsets in buffer. eoff - soff may not cross a
+ * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
+ */
+ soff = off;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
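+ /*
+ * Worked example (illustrative): with 4K pages, off == 0x1200 yields
+ * eoff == 0x2000 (the next page boundary); if the buffer's data ends
+ * at b_offset + b_bcount == 0x1a00, the range is clamped to
+ * [0x1200, 0x1a00).
+ */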
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being PG_BUSY. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
+ */
+void
+vfs_busy_pages(struct buf * bp, int clear_modify)
+{
+ int i, bogus;
+
+ GIANT_REQUIRED;
+
+ if (bp->b_flags & B_VMIO) {
+ struct vnode *vp = bp->b_vp;
+ vm_object_t obj;
+ vm_ooffset_t foff;
+
+ VOP_GETVOBJECT(vp, &obj);
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_busy_pages: no buffer offset"));
+ vfs_setdirty(bp);
+
+retry:
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ if (vm_page_sleep_busy(m, FALSE, "vbpage"))
+ goto retry;
+ }
+
+ bogus = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+
+ vm_page_flag_clear(m, PG_ZERO);
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vm_object_pip_add(obj, 1);
+ vm_page_io_start(m);
+ }
+
+ /*
+ * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+
+ vm_page_protect(m, VM_PROT_NONE);
+ if (clear_modify)
+ vfs_page_set_valid(bp, foff, i, m);
+ else if (m->valid == VM_PAGE_BITS_ALL &&
+ (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ bogus++;
+ }
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ if (bogus)
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
+ */
+static void
+vfs_clean_pages(struct buf * bp)
+{
+ int i;
+
+ GIANT_REQUIRED;
+
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_clean_pages: no buffer offset"));
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ vm_ooffset_t eoff = noff;
+
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
+ vfs_page_set_valid(bp, foff, i, m);
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
+ }
+ }
+}
+
+/*
+ * vfs_bio_set_validclean:
+ *
+ * Set the range within the buffer to valid and clean. The range is
+ * relative to the beginning of the buffer, b_offset. Note that b_offset
+ * itself may be offset from the beginning of the first page.
+ *
+ */
+
+void
+vfs_bio_set_validclean(struct buf *bp, int base, int size)
+{
+ if (bp->b_flags & B_VMIO) {
+ int i;
+ int n;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (n > size)
+ n = size;
+
+ vm_page_set_validclean(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * Clear a buffer. This routine essentially fakes an I/O, so we need
+ * to clear BIO_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+
+void
+vfs_bio_clrbuf(struct buf *bp)
+{
+ int i, mask = 0;
+ caddr_t sa, ea;
+
+ GIANT_REQUIRED;
+
+ if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+ (bp->b_offset & PAGE_MASK) == 0) {
+ mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
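+ /*
+ * Example (illustrative): with DEV_BSIZE 512 and b_bufsize 2048,
+ * mask == 0x0f, i.e. the four 512-byte chunks making up the
+ * buffer within its single page.
+ */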
+ if ((bp->b_pages[0]->valid & mask) == mask) {
+ bp->b_resid = 0;
+ return;
+ }
+ if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
+ ((bp->b_pages[0]->valid & mask) == 0)) {
+ bzero(bp->b_data, bp->b_bufsize);
+ bp->b_pages[0]->valid |= mask;
+ bp->b_resid = 0;
+ return;
+ }
+ }
+ ea = sa = bp->b_data;
+ for (i = 0; i < bp->b_npages; i++, sa = ea) {
+ int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
+ ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
+ ea = (caddr_t)(vm_offset_t)ulmin(
+ (u_long)(vm_offset_t)ea,
+ (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
+ mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
+ if ((bp->b_pages[i]->valid & mask) == mask)
+ continue;
+ if ((bp->b_pages[i]->valid & mask) == 0) {
+ if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
+ bzero(sa, ea - sa);
+ }
+ } else {
+ for (; sa < ea; sa += DEV_BSIZE, j++) {
+ if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
+ (bp->b_pages[i]->valid & (1<<j)) == 0)
+ bzero(sa, DEV_BSIZE);
+ }
+ }
+ bp->b_pages[i]->valid |= mask;
+ vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
+ }
+ bp->b_resid = 0;
+ } else {
+ clrbuf(bp);
+ }
+}
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into
+ * a buffer's address space. The pages are anonymous and are
+ * not associated with a file object.
+ */
+static void
+vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ GIANT_REQUIRED;
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+tryagain:
+ /*
+ * note: must allocate system pages since blocking here
+ * could interfere with paging I/O, no matter which
+ * process we are.
+ */
+ p = vm_page_alloc(kernel_object,
+ ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_SYSTEM);
+ if (!p) {
+ vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
+ VM_WAIT;
+ goto tryagain;
+ }
+ vm_page_wire(p);
+ p->valid = VM_PAGE_BITS_ALL;
+ vm_page_flag_clear(p, PG_ZERO);
+ pmap_qenter(pg, &p, 1);
+ bp->b_pages[index] = p;
+ vm_page_wakeup(p);
+ }
+ bp->b_npages = index;
+}
+
+/* Return pages associated with this buf to the vm system */
+void
+vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index, newnpages;
+
+ GIANT_REQUIRED;
+
+ from = round_page(from);
+ to = round_page(to);
+ newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+ p = bp->b_pages[index];
+ if (p && (index < bp->b_npages)) {
+ if (p->busy) {
+ printf(
+ "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
+ (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno);
+ }
+ bp->b_pages[index] = NULL;
+ pmap_qremove(pg, 1);
+ vm_page_busy(p);
+ vm_page_unwire(p, 0);
+ vm_page_free(p);
+ }
+ }
+ bp->b_npages = newnpages;
+}
+
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+/* DDB command to show buffer data */
+DB_SHOW_COMMAND(buffer, db_show_buffer)
+{
+ /* get args */
+ struct buf *bp = (struct buf *)addr;
+
+ if (!have_addr) {
+ db_printf("usage: show buffer <addr>\n");
+ return;
+ }
+
+ db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
+ db_printf(
+ "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
+ "b_dev = (%d,%d), b_data = %p, b_blkno = %jd, b_pblkno = %jd\n",
+ bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
+ major(bp->b_dev), minor(bp->b_dev), bp->b_data,
+ (intmax_t)bp->b_blkno, (intmax_t)bp->b_pblkno);
+ if (bp->b_npages) {
+ int i;
+ db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m;
+ m = bp->b_pages[i];
+ db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
+ (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
+ if ((i + 1) < bp->b_npages)
+ db_printf(",");
+ }
+ db_printf("\n");
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
new file mode 100644
index 0000000..be79fc2
--- /dev/null
+++ b/sys/kern/vfs_cache.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Poul-Henning Kamp of the FreeBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+
+/*
+ * This structure describes the elements in the cache of recent
+ * names looked up by namei.
+ */
+
+struct namecache {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ struct vnode *nc_vp; /* vnode the name refers to */
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ char nc_name[0]; /* segment name */
+};
+
+/*
+ * Name caching works as follows:
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference. It is managed LRU, so frequently
+ * used names will hang around. Cache is indexed by hash value
+ * obtained from (vp, name) where vp refers to the directory
+ * containing name.
+ *
+ * If it is a "negative" entry, (i.e. for a name that is known NOT to
+ * exist) the vnode pointer will be NULL.
+ *
+ * Upon reaching the last segment of a path, if the reference
+ * is for DELETE, or NOCACHE is set (rewrite), and the
+ * name is located in the cache, it will be dropped.
+ */
+
+/*
+ * Structures associated with name caching.
+ */
+#define NCHHASH(hash) \
+ (&nchashtbl[(hash) & nchash])
+static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
+static TAILQ_HEAD(, namecache) ncneg; /* LRU list of negative entries */
+static u_long nchash; /* size of hash table */
+SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
+static u_long ncnegfactor = 16; /* ratio of negative entries */
+SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
+static u_long numneg; /* number of negative cache entries */
+SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
+static u_long numcache; /* number of cache entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
+static u_long numcachehv; /* number of cache entries with vnodes held */
+SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
+#if 0
+static u_long numcachepl; /* number of cache purges for leaf entries */
+SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
+#endif
+struct nchstats nchstats; /* cache effectiveness statistics */
+
+static int doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
+
+/* Export size information to userland */
+SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
+SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
+
+/*
+ * The new name cache statistics
+ */
+SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
+#define STATNODE(mode, name, var) \
+ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
+STATNODE(CTLFLAG_RD, numneg, &numneg);
+STATNODE(CTLFLAG_RD, numcache, &numcache);
+static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
+static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
+static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
+static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
+static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
+static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
+static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
+static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
+static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
+
+SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
+ sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
+
+
+
+static void cache_zap(struct namecache *ncp);
+
+static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE 1
+
+/*
+ * Grab an atomic snapshot of the name cache hash chain lengths
+ */
+SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
+
+static int
+sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count;
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ count = 0;
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ error = SYSCTL_OUT(req, &count, sizeof(count));
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
+ 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
+
+static int
+sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count, maxlength, used, pct;
+
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, 4 * sizeof(int));
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ used = 0;
+ maxlength = 0;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ count = 0;
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ if (count)
+ used++;
+ if (maxlength < count)
+ maxlength = count;
+ }
+ n_nchash = nchash + 1;
+ pct = (used * 100 * 100) / n_nchash;
+ error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &used, sizeof(used));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &pct, sizeof(pct));
+ if (error)
+ return (error);
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
+ 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
+
+/*
+ * Delete an entry from its hash list and move it to the front
+ * of the LRU list for immediate reuse.
+ */
+static void
+cache_zap(ncp)
+ struct namecache *ncp;
+{
+ LIST_REMOVE(ncp, nc_hash);
+ LIST_REMOVE(ncp, nc_src);
+ if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
+ vdrop(ncp->nc_dvp);
+ numcachehv--;
+ }
+ if (ncp->nc_vp) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+ } else {
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ numneg--;
+ }
+ numcache--;
+ free(ncp, M_VFSCACHE);
+}
+
+/*
+ * cache_leaf_test()
+ *
+ * Test whether this (directory) vnode's namei cache entry contains
+ * subdirectories or not. Used to determine whether the directory is
+ * a leaf in the namei cache or not. Note: the directory may still
+ * contain files in the namei cache.
+ *
+ * Returns 0 if the directory is a leaf, -1 if it isn't.
+ */
+int
+cache_leaf_test(struct vnode *vp)
+{
+ struct namecache *ncpc;
+
+ for (ncpc = LIST_FIRST(&vp->v_cache_src);
+ ncpc != NULL;
+ ncpc = LIST_NEXT(ncpc, nc_src)
+ ) {
+ if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
+ return(-1);
+ }
+ return(0);
+}
+
+/*
+ * Lookup an entry in the cache
+ *
+ * Lookup is called with dvp pointing to the directory to search,
+ * cnp pointing to the name of the entry being sought. If the lookup
+ * succeeds, the vnode is returned in *vpp, and a status of -1 is
+ * returned. If the lookup determines that the name does not exist
+ * (negative caching), a status of ENOENT is returned. If the lookup
+ * fails, a status of zero is returned.
+ */
+
+int
+cache_lookup(dvp, vpp, cnp)
+ struct vnode *dvp;
+ struct vnode **vpp;
+ struct componentname *cnp;
+{
+ struct namecache *ncp;
+ u_int32_t hash;
+
+ if (!doingcache) {
+ cnp->cn_flags &= ~MAKEENTRY;
+ return (0);
+ }
+
+ numcalls++;
+
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ *vpp = dvp;
+ dothits++;
+ return (-1);
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ dotdothits++;
+ if (dvp->v_dd->v_id != dvp->v_ddid ||
+ (cnp->cn_flags & MAKEENTRY) == 0) {
+ dvp->v_ddid = 0;
+ return (0);
+ }
+ *vpp = dvp->v_dd;
+ return (-1);
+ }
+ }
+
+ hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
+ hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
+ LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ numchecks++;
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == 0) {
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ nummisszap++;
+ } else {
+ nummiss++;
+ }
+ nchstats.ncs_miss++;
+ return (0);
+ }
+
+ /* We don't want to have an entry, so dump it */
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ numposzaps++;
+ nchstats.ncs_badhits++;
+ cache_zap(ncp);
+ return (0);
+ }
+
+ /* We found a "positive" match, return the vnode */
+ if (ncp->nc_vp) {
+ numposhits++;
+ nchstats.ncs_goodhits++;
+ *vpp = ncp->nc_vp;
+ return (-1);
+ }
+
+ /* We found a negative match, and want to create it, so purge */
+ if (cnp->cn_nameiop == CREATE) {
+ numnegzaps++;
+ nchstats.ncs_badhits++;
+ cache_zap(ncp);
+ return (0);
+ }
+
+ numneghits++;
+ /*
+ * We found a "negative" match, ENOENT notifies client of this match.
+ * The nc_vpid field records whether this is a whiteout.
+ */
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ nchstats.ncs_neghits++;
+ if (ncp->nc_flag & NCF_WHITE)
+ cnp->cn_flags |= ISWHITEOUT;
+ return (ENOENT);
+}
+
+/*
+ * Add an entry to the cache.
+ */
+void
+cache_enter(dvp, vp, cnp)
+ struct vnode *dvp;
+ struct vnode *vp;
+ struct componentname *cnp;
+{
+ struct namecache *ncp;
+ struct nchashhead *ncpp;
+ u_int32_t hash;
+ int len;
+
+ if (!doingcache)
+ return;
+
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ return;
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ if (vp) {
+ dvp->v_dd = vp;
+ dvp->v_ddid = vp->v_id;
+ } else {
+ dvp->v_dd = dvp;
+ dvp->v_ddid = 0;
+ }
+ return;
+ }
+ }
+
+ ncp = (struct namecache *)
+ malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
+ bzero((char *)ncp, sizeof *ncp);
+ numcache++;
+ if (!vp) {
+ numneg++;
+ ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
+ } else if (vp->v_type == VDIR) {
+ vp->v_dd = dvp;
+ vp->v_ddid = dvp->v_id;
+ }
+
+ /*
+ * Fill in cache info, if vp is NULL this is a "negative" cache entry.
+ * For negative entries, we have to record whether it is a whiteout.
+ * the whiteout flag is stored in the nc_vpid field which is
+ * otherwise unused.
+ */
+ ncp->nc_vp = vp;
+ ncp->nc_dvp = dvp;
+ len = ncp->nc_nlen = cnp->cn_namelen;
+ hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
+ bcopy(cnp->cn_nameptr, ncp->nc_name, len);
+ hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
+ ncpp = NCHHASH(hash);
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+ if (LIST_EMPTY(&dvp->v_cache_src)) {
+ vhold(dvp);
+ numcachehv++;
+ }
+ LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
+ if (vp) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+ } else {
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ }
+ if (numneg * ncnegfactor > numcache) {
+ ncp = TAILQ_FIRST(&ncneg);
+ cache_zap(ncp);
+ }
+}
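
Taken together, cache_lookup() and cache_enter() are normally driven from a filesystem's lookup path. The fragment below is an illustrative sketch only, showing how the three cache_lookup() return values documented above are usually interpreted; fs_scan_directory() is a hypothetical stand-in for the real directory scan, and the vnode locking performed by vfs_cache_lookup() further down is omitted:

	/* Illustrative sketch: consult the cache before scanning the directory. */
	error = cache_lookup(dvp, vpp, cnp);
	if (error == -1)
		return (0);		/* positive hit: *vpp holds the vnode */
	if (error == ENOENT)
		return (ENOENT);	/* negative hit: name known not to exist */

	/* Cache miss (error == 0): do the real lookup, then prime the cache. */
	error = fs_scan_directory(dvp, vpp, cnp);	/* hypothetical */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY))
		cache_enter(dvp, *vpp, cnp);
	return (error);
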
+
+/*
+ * Name cache initialization, from vfs_init() when we are booting
+ */
+static void
+nchinit(void *dummy __unused)
+{
+
+ TAILQ_INIT(&ncneg);
+ nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
+
+
+/*
+ * Invalidate all entries to a particular vnode.
+ *
+ * Remove all entries in the namecache relating to this vnode and
+ * change the v_id. We take the v_id from a global counter, since
+ * it becomes a handy sequence number in crash-dumps that way.
+ * No valid vnode will ever have (v_id == 0).
+ *
+ * XXX: Only time and the size of v_id prevents this from failing:
+ * XXX: In theory we should hunt down all (struct vnode*, v_id)
+ * XXX: soft references and nuke them, at least on the global
+ * XXX: v_id wraparound. The period of resistance can be extended
+ * XXX: by incrementing each vnodes v_id individually instead of
+ * XXX: using the global v_id.
+ */
+
+void
+cache_purge(vp)
+ struct vnode *vp;
+{
+ static u_long nextid;
+
+ while (!LIST_EMPTY(&vp->v_cache_src))
+ cache_zap(LIST_FIRST(&vp->v_cache_src));
+ while (!TAILQ_EMPTY(&vp->v_cache_dst))
+ cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+
+ do
+ nextid++;
+ while (nextid == vp->v_id || !nextid);
+ vp->v_id = nextid;
+ vp->v_dd = vp;
+ vp->v_ddid = 0;
+}
+
+/*
+ * Flush all entries referencing a particular filesystem.
+ *
+ * Since we need to check it anyway, we will flush all the invalid
+ * entries at the same time.
+ */
+void
+cache_purgevfs(mp)
+ struct mount *mp;
+{
+ struct nchashhead *ncpp;
+ struct namecache *ncp, *nnp;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
+ nnp = LIST_NEXT(ncp, nc_hash);
+ if (ncp->nc_dvp->v_mount == mp) {
+ cache_zap(ncp);
+ }
+ }
+ }
+}
+
+/*
+ * Perform canonical checks and cache lookup and pass on to the filesystem
+ * through VOP_CACHEDLOOKUP() only if needed.
+ */
+
+int
+vfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct vnode *dvp, *vp;
+ int lockparent;
+ int error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct ucred *cred = cnp->cn_cred;
+ int flags = cnp->cn_flags;
+ struct thread *td = cnp->cn_thread;
+ u_long vpid; /* capability number of vnode */
+
+ *vpp = NULL;
+ dvp = ap->a_dvp;
+ lockparent = flags & LOCKPARENT;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ error = VOP_ACCESS(dvp, VEXEC, cred, td);
+
+ if (error)
+ return (error);
+
+ error = cache_lookup(dvp, vpp, cnp);
+
+#ifdef LOOKUP_SHARED
+ if (!error) {
+ /* We do this because the rest of the system now expects to get
+ * a shared lock, which is later upgraded if LOCKSHARED is not
+ * set. We have so many cases here because of bugs that yield
+ * inconsistent lock states. This all badly needs to be fixed.
+ */
+ error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
+ if (!error) {
+ int flock;
+
+ flock = VOP_ISLOCKED(*vpp, td);
+ if (flock != LK_EXCLUSIVE) {
+ if (flock == 0) {
+ if ((flags & ISLASTCN) &&
+ (flags & LOCKSHARED))
+ VOP_LOCK(*vpp, LK_SHARED, td);
+ else
+ VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
+ }
+ } else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
+ VOP_LOCK(*vpp, LK_DOWNGRADE, td);
+ }
+ return (error);
+ }
+#else
+ if (!error)
+ return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+#endif
+
+ if (error == ENOENT)
+ return (error);
+
+ vp = *vpp;
+ vpid = vp->v_id;
+ cnp->cn_flags &= ~PDIRUNLOCK;
+ if (dvp == vp) { /* lookup on "." */
+ VREF(vp);
+ error = 0;
+ } else if (flags & ISDOTDOT) {
+ VOP_UNLOCK(dvp, 0, td);
+ cnp->cn_flags |= PDIRUNLOCK;
+#ifdef LOOKUP_SHARED
+ if ((flags & ISLASTCN) && (flags & LOCKSHARED))
+ error = vget(vp, LK_SHARED, td);
+ else
+ error = vget(vp, LK_EXCLUSIVE, td);
+#else
+ error = vget(vp, LK_EXCLUSIVE, td);
+#endif
+
+ if (!error && lockparent && (flags & ISLASTCN)) {
+ if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
+ cnp->cn_flags &= ~PDIRUNLOCK;
+ }
+ } else {
+#ifdef LOOKUP_SHARED
+ if ((flags & ISLASTCN) && (flags & LOCKSHARED))
+ error = vget(vp, LK_SHARED, td);
+ else
+ error = vget(vp, LK_EXCLUSIVE, td);
+#else
+ error = vget(vp, LK_EXCLUSIVE, td);
+#endif
+ if (!lockparent || error || !(flags & ISLASTCN)) {
+ VOP_UNLOCK(dvp, 0, td);
+ cnp->cn_flags |= PDIRUNLOCK;
+ }
+ }
+ /*
+ * Check that the capability number did not change
+ * while we were waiting for the lock.
+ */
+ if (!error) {
+ if (vpid == vp->v_id)
+ return (0);
+ vput(vp);
+ if (lockparent && dvp != vp && (flags & ISLASTCN)) {
+ VOP_UNLOCK(dvp, 0, td);
+ cnp->cn_flags |= PDIRUNLOCK;
+ }
+ }
+ if (cnp->cn_flags & PDIRUNLOCK) {
+ error = vn_lock(dvp, LK_EXCLUSIVE, td);
+ if (error)
+ return (error);
+ cnp->cn_flags &= ~PDIRUNLOCK;
+ }
+#ifdef LOOKUP_SHARED
+ error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
+
+ if (!error) {
+ int flock = 0;
+
+ flock = VOP_ISLOCKED(*vpp, td);
+ if (flock != LK_EXCLUSIVE) {
+ if (flock == 0) {
+ if ((flags & ISLASTCN) && (flags & LOCKSHARED))
+ VOP_LOCK(*vpp, LK_SHARED, td);
+ else
+ VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
+ }
+ } else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
+ VOP_LOCK(*vpp, LK_DOWNGRADE, td);
+ }
+
+ return (error);
+#else
+ return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+#endif
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getcwd_args {
+ u_char *buf;
+ u_int buflen;
+};
+#endif
+
+/*
+ * XXX All of these sysctls would probably be more productive dead.
+ */
+static int disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
+ "Disable the getcwd syscall");
+
+/* Various statistics for the getcwd syscall */
+static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
+static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
+static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
+static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
+static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
+static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
+
+/* Implementation of the getcwd syscall */
+int
+__getcwd(td, uap)
+ struct thread *td;
+ struct __getcwd_args *uap;
+{
+ char *bp, *buf;
+ int error, i, slash_prefixed;
+ struct filedesc *fdp;
+ struct namecache *ncp;
+ struct vnode *vp;
+
+ numcwdcalls++;
+ if (disablecwd)
+ return (ENODEV);
+ if (uap->buflen < 2)
+ return (EINVAL);
+ if (uap->buflen > MAXPATHLEN)
+ uap->buflen = MAXPATHLEN;
+ buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
+ bp += uap->buflen - 1;
+ *bp = '\0';
+ fdp = td->td_proc->p_fd;
+ slash_prefixed = 0;
+ FILEDESC_LOCK(fdp);
+ for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
+ if (vp->v_flag & VROOT) {
+ if (vp->v_mount == NULL) { /* forced unmount */
+ FILEDESC_UNLOCK(fdp);
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ vp = vp->v_mount->mnt_vnodecovered;
+ continue;
+ }
+ if (vp->v_dd->v_id != vp->v_ddid) {
+ FILEDESC_UNLOCK(fdp);
+ numcwdfail1++;
+ free(buf, M_TEMP);
+ return (ENOTDIR);
+ }
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (!ncp) {
+ FILEDESC_UNLOCK(fdp);
+ numcwdfail2++;
+ free(buf, M_TEMP);
+ return (ENOENT);
+ }
+ if (ncp->nc_dvp != vp->v_dd) {
+ FILEDESC_UNLOCK(fdp);
+ numcwdfail3++;
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ for (i = ncp->nc_nlen - 1; i >= 0; i--) {
+ if (bp == buf) {
+ FILEDESC_UNLOCK(fdp);
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = ncp->nc_name[i];
+ }
+ if (bp == buf) {
+ FILEDESC_UNLOCK(fdp);
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ slash_prefixed = 1;
+ vp = vp->v_dd;
+ }
+ FILEDESC_UNLOCK(fdp);
+ if (!slash_prefixed) {
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ }
+ numcwdfound++;
+ error = copyout(bp, uap->buf, strlen(bp) + 1);
+ free(buf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Thus begins the fullpath magic.
+ */
+
+#undef STATNODE
+#define STATNODE(name) \
+ static u_int name; \
+ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
+
+static int disablefullpath;
+SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
+ "Disable the vn_fullpath function");
+
+STATNODE(numfullpathcalls);
+STATNODE(numfullpathfail1);
+STATNODE(numfullpathfail2);
+STATNODE(numfullpathfail3);
+STATNODE(numfullpathfail4);
+STATNODE(numfullpathfound);
+
+/*
+ * Retrieve the full filesystem path that corresponds to a vnode from the name
+ * cache (if available)
+ */
+int
+vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
+{
+ char *bp, *buf;
+ int i, slash_prefixed;
+ struct filedesc *fdp;
+ struct namecache *ncp;
+ struct vnode *vp;
+
+ numfullpathcalls++;
+ if (disablefullpath)
+ return (ENODEV);
+ if (vn == NULL)
+ return (EINVAL);
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ bp = buf + MAXPATHLEN - 1;
+ *bp = '\0';
+ fdp = td->td_proc->p_fd;
+ slash_prefixed = 0;
+ FILEDESC_LOCK(fdp);
+ for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
+ if (vp->v_flag & VROOT) {
+ if (vp->v_mount == NULL) { /* forced unmount */
+ FILEDESC_UNLOCK(fdp);
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ vp = vp->v_mount->mnt_vnodecovered;
+ continue;
+ }
+ if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail1++;
+ free(buf, M_TEMP);
+ return (ENOTDIR);
+ }
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (!ncp) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail2++;
+ free(buf, M_TEMP);
+ return (ENOENT);
+ }
+ if (vp != vn && ncp->nc_dvp != vp->v_dd) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail3++;
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ for (i = ncp->nc_nlen - 1; i >= 0; i--) {
+ if (bp == buf) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = ncp->nc_name[i];
+ }
+ if (bp == buf) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ slash_prefixed = 1;
+ vp = ncp->nc_dvp;
+ }
+ if (!slash_prefixed) {
+ if (bp == buf) {
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ }
+ FILEDESC_UNLOCK(fdp);
+ numfullpathfound++;
+ *retbuf = bp;
+ *freebuf = buf;
+ return (0);
+}
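A minimal sketch of a hypothetical in-kernel caller (not part of this diff; the variable names are illustrative): on success, *retbuf points at the usable path inside the buffer returned via *freebuf, which the caller releases with the same M_TEMP malloc type used above.

    char *fullpath, *freepath;

    if (vn_fullpath(td, vp, &fullpath, &freepath) == 0) {
            printf("vnode path: %s\n", fullpath);
            free(freepath, M_TEMP);    /* free the whole buffer, not retbuf */
    }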
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
new file mode 100644
index 0000000..4c11952
--- /dev/null
+++ b/sys/kern/vfs_cluster.c
@@ -0,0 +1,1008 @@
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ * $FreeBSD$
+ */
+
+#include "opt_debug_cluster.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/stdint.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <sys/sysctl.h>
+
+#if defined(CLUSTERDEBUG)
+#include <sys/sysctl.h>
+static int rcluster= 0;
+SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
+ "Debug VFS clustering code");
+#endif
+
+static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");
+
+static struct cluster_save *
+ cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
+static struct buf *
+ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, struct buf *fbp);
+
+static int write_behind = 1;
+SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
+ "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
+
+/* Page expended to mark partially backed buffers */
+extern vm_page_t bogus_page;
+
+/*
+ * Number of physical bufs (pbufs) this subsystem is allowed.
+ * Manipulated by vm_pager.c
+ */
+extern int cluster_pbuf_freecnt;
+
+/*
+ * Maximum number of blocks for read-ahead.
+ */
+#define MAXRA 32
+
+/*
+ * Read data to a buf, including read-ahead if we find this to be beneficial.
+ * cluster_read replaces bread.
+ */
+int
+cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
+ struct vnode *vp;
+ u_quad_t filesize;
+ daddr_t lblkno;
+ long size;
+ struct ucred *cred;
+ long totread;
+ int seqcount;
+ struct buf **bpp;
+{
+ struct buf *bp, *rbp, *reqbp;
+ daddr_t blkno, origblkno;
+ int error, num_ra;
+ int i;
+ int maxra, racluster;
+ long origtotread;
+
+ error = 0;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+ racluster = vp->v_mount->mnt_iosize_max / size;
+ maxra = 2 * racluster + (totread / size);
+ if (maxra > MAXRA)
+ maxra = MAXRA;
+ if (maxra > nbuf/8)
+ maxra = nbuf/8;
+
+ /*
+ * get the requested block
+ */
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
+ origblkno = lblkno;
+ origtotread = totread;
+
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back-off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return 0;
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return 0;
+ } else {
+ int s;
+ struct buf *tbp;
+ bp->b_flags &= ~B_RAM;
+ /*
+ * We do the spl here so that there is no window
+ * between the incore and the b_usecount increment
+ * below. We opt to keep the spl out of the loop
+ * for efficiency.
+ */
+ s = splbio();
+ for (i = 1; i < maxra; i++) {
+
+ if (!(tbp = incore(vp, lblkno+i))) {
+ break;
+ }
+
+ /*
+ * Set another read-ahead mark so we know
+ * to check again.
+ */
+ if (((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ tbp->b_flags |= B_RAM;
+ }
+ splx(s);
+ if (i >= maxra) {
+ return 0;
+ }
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ } else {
+ off_t firstread = bp->b_offset;
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("cluster_read: no buffer offset"));
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ if (totread > size) {
+ int nblks = 0;
+ int ncontigafter;
+ while (totread > 0) {
+ nblks++;
+ totread -= size;
+ }
+ if (nblks == 1)
+ goto single_block_read;
+ if (nblks > racluster)
+ nblks = racluster;
+
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontigafter, NULL);
+ if (error)
+ goto single_block_read;
+ if (blkno == -1)
+ goto single_block_read;
+ if (ncontigafter == 0)
+ goto single_block_read;
+ if (ncontigafter + 1 < nblks)
+ nblks = ncontigafter + 1;
+
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, bp);
+ lblkno += (bp->b_bufsize / size);
+ } else {
+single_block_read:
+ /*
+ * if it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ bp->b_flags |= B_RAM;
+ bp->b_iocmd = BIO_READ;
+ lblkno += 1;
+ }
+ }
+
+ /*
+ * if we have been doing sequential I/O, then do some read-ahead
+ */
+ rbp = NULL;
+ if (seqcount && (lblkno < (origblkno + seqcount))) {
+ /*
+ * we now build the read-ahead buffer if it is desirable.
+ */
+ if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
+ !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
+ blkno != -1) {
+ int nblksread;
+ int ntoread = num_ra + 1;
+ nblksread = (origtotread + size - 1) / size;
+ if (seqcount < nblksread)
+ seqcount = nblksread;
+ if (seqcount < ntoread)
+ ntoread = seqcount;
+ if (num_ra) {
+ rbp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, ntoread, NULL);
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0);
+ rbp->b_flags |= B_ASYNC | B_RAM;
+ rbp->b_iocmd = BIO_READ;
+ rbp->b_blkno = blkno;
+ }
+ }
+ }
+
+ /*
+ * handle the synchronous read
+ */
+ if (bp) {
+#if defined(CLUSTERDEBUG)
+ if (rcluster)
+ printf("S(%ld,%ld,%d) ",
+ (long)bp->b_lblkno, bp->b_bcount, seqcount);
+#endif
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(bp, 0);
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
+ BUF_KERNPROC(bp);
+ error = VOP_STRATEGY(vp, bp);
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+
+ /*
+ * and if we have read-aheads, do them too
+ */
+ if (rbp) {
+ if (error) {
+ rbp->b_flags &= ~B_ASYNC;
+ brelse(rbp);
+ } else if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~B_ASYNC;
+ bqrelse(rbp);
+ } else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster) {
+ if (bp)
+ printf("A+");
+ else
+ printf("A");
+ printf("(%lld,%ld,%lld,%d) ",
+ (intmax_t)rbp->b_lblkno, rbp->b_bcount,
+ (intmax_t)(rbp->b_lblkno - origblkno),
+ seqcount);
+ }
+#endif
+
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(rbp, 0);
+ }
+ rbp->b_flags &= ~B_INVAL;
+ rbp->b_ioflags &= ~BIO_ERROR;
+ if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
+ BUF_KERNPROC(rbp);
+ (void) VOP_STRATEGY(vp, rbp);
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+ }
+ if (reqbp)
+ return (bufwait(reqbp));
+ else
+ return (error);
+}
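As a hedged sketch of how a filesystem read path might drive this interface (loosely modeled on FFS-style callers; ip->i_size and the surrounding context are illustrative, not taken from this diff), clustering is typically used unless the mount disables it:

    if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
            error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
                uio->uio_resid, seqcount, &bp);
    else
            error = bread(vp, lbn, size, NOCRED, &bp);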
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
+static struct buf *
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
+ struct vnode *vp;
+ u_quad_t filesize;
+ daddr_t lbn;
+ daddr_t blkno;
+ long size;
+ int run;
+ struct buf *fbp;
+{
+ struct buf *bp, *tbp;
+ daddr_t bn;
+ int i, inc, j;
+
+ GIANT_REQUIRED;
+
+ KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
+ ("cluster_rbuild: size %ld != filesize %ld\n",
+ size, vp->v_mount->mnt_stat.f_iosize));
+
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
+ --run;
+ }
+
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_iocmd = BIO_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0);
+ if (tbp->b_flags & B_CACHE)
+ return tbp;
+ tbp->b_flags |= B_ASYNC | B_RAM;
+ tbp->b_iocmd = BIO_READ;
+ }
+
+ tbp->b_blkno = blkno;
+ if( (tbp->b_flags & B_MALLOC) ||
+ ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
+ return tbp;
+
+ bp = trypbuf(&cluster_pbuf_freecnt);
+ if (bp == 0)
+ return tbp;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ bp->b_iocmd = BIO_READ;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ bp->b_offset = tbp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+
+ inc = btodb(size);
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i != 0) {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > vp->v_mount->mnt_iosize_max) {
+ break;
+ }
+
+ /*
+ * Shortcut some checks and try to avoid buffers that
+ * would block in the lock. The same checks have to
+ * be made again after we officially get the buffer.
+ */
+ if ((tbp = incore(vp, lbn + i)) != NULL) {
+ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
+ break;
+ BUF_UNLOCK(tbp);
+
+ for (j = 0; j < tbp->b_npages; j++) {
+ if (tbp->b_pages[j]->valid)
+ break;
+ }
+
+ if (j != tbp->b_npages)
+ break;
+
+ if (tbp->b_bcount != size)
+ break;
+ }
+
+ tbp = getblk(vp, lbn + i, size, 0, 0);
+
+ /*
+ * Stop scanning if the buffer is fully valid
+ * (marked B_CACHE), or locked (may be doing a
+ * background write), or if the buffer is not
+ * VMIO backed. The clustering code can only deal
+ * with VMIO-backed buffers.
+ */
+ if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+ /*
+ * The buffer must be completely invalid in order to
+ * take part in the cluster. If it is partially valid
+ * then we stop.
+ */
+ for (j = 0;j < tbp->b_npages; j++) {
+ if (tbp->b_pages[j]->valid)
+ break;
+ }
+ if (j != tbp->b_npages) {
+ bqrelse(tbp);
+ break;
+ }
+
+ /*
+ * Set a read-ahead mark as appropriate
+ */
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+
+ /*
+ * Set the buffer up for an async read (XXX should
+ * we do this only if we do not wind up brelse()ing?).
+ * Set the block number if it isn't set, otherwise
+ * if it is make sure it matches the block number we
+ * expect.
+ */
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_iocmd = BIO_READ;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ brelse(tbp);
+ break;
+ }
+ }
+ /*
+ * XXX fbp from caller may not be B_ASYNC, but we are going
+ * to biodone() it in cluster_callback() anyway
+ */
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ vm_page_io_start(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
+ }
+ /*
+ * XXX shouldn't this be += size for both, like in
+ * cluster_wbuild()?
+ *
+ * Don't inherit tbp->b_bufsize as it may be larger due to
+ * a non-page-aligned size. Instead just aggregate using
+ * 'size'.
+ */
+ if (tbp->b_bcount != size)
+ printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
+ if (tbp->b_bufsize != size)
+ printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ }
+
+ /*
+ * Fully valid pages in the cluster are already good and do not need
+ * to be re-read from disk. Replace the page with bogus_page
+ */
+ for (j = 0; j < bp->b_npages; j++) {
+ if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
+ VM_PAGE_BITS_ALL) {
+ bp->b_pages[j] = bogus_page;
+ }
+ }
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ return (bp);
+}
+
+/*
+ * Cleanup after a clustered read or write.
+ * This is complicated by the fact that any of the buffers might have
+ * extra memory (if there were no empty buffer headers at allocbuf time)
+ * that we will need to shift around.
+ */
+void
+cluster_callback(bp)
+ struct buf *bp;
+{
+ struct buf *nbp, *tbp;
+ int error = 0;
+
+ GIANT_REQUIRED;
+
+ /*
+ * Must propagate errors to all the components.
+ */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ /*
+ * Move memory from the large cluster buffer into the component
+ * buffers and mark IO as done on these.
+ */
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
+ if (error) {
+ tbp->b_ioflags |= BIO_ERROR;
+ tbp->b_error = error;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~B_INVAL;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ /*
+ * XXX the bdwrite()/bqrelse() issued during
+ * cluster building clears B_RELBUF (see bqrelse()
+ * comment). If direct I/O was specified, we have
+ * to restore it here to allow the buffer and VM
+ * to be freed.
+ */
+ if (tbp->b_flags & B_DIRECT)
+ tbp->b_flags |= B_RELBUF;
+ }
+ bufdone(tbp);
+ }
+ relpbuf(bp, &cluster_pbuf_freecnt);
+}
+
+/*
+ * cluster_wbuild_wb:
+ *
+ * Implement modified write build for cluster.
+ *
+ * write_behind = 0 write behind disabled
+ * write_behind = 1 write behind normal (default)
+ * write_behind = 2 write behind backed-off
+ */
+
+static __inline int
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+{
+ int r = 0;
+
+ switch(write_behind) {
+ case 2:
+ if (start_lbn < len)
+ break;
+ start_lbn -= len;
+ /* fall through */
+ case 1:
+ r = cluster_wbuild(vp, size, start_lbn, len);
+ /* fall through */
+ default:
+ /* fall through */
+ break;
+ }
+ return(r);
+}
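The policy described above is tunable at run time through the vfs.write_behind sysctl declared near the top of this file; for example, write-behind clustering can be disabled entirely with something like:

    sysctl vfs.write_behind=0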
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Three cases:
+ * 1. Write is not sequential (write asynchronously)
+ * Write is sequential:
+ * 2. beginning of cluster - begin cluster
+ * 3. middle of a cluster - add to cluster
+ * 4. end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(bp, filesize, seqcount)
+ struct buf *bp;
+ u_quad_t filesize;
+ int seqcount;
+{
+ struct vnode *vp;
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
+
+ vp = bp->b_vp;
+ if (vp->v_type == VREG) {
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ } else {
+ async = 0;
+ lblocksize = bp->b_bufsize;
+ }
+ lbn = bp->b_lblkno;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
+
+ /* Initialize vnode to beginning of file. */
+ if (lbn == 0)
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
+ if (vp->v_clen != 0) {
+ /*
+ * Next block is not sequential.
+ *
+ * If we are not writing at end of file, the process
+ * seeked to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
+ *
+ * Change to algorithm: only push previous cluster if
+ * it was sequential from the point of view of the
+ * seqcount heuristic, otherwise leave the buffer
+ * intact so we can potentially optimize the I/O
+ * later on in the buf_daemon or update daemon
+ * flush.
+ */
+ cursize = vp->v_lastw - vp->v_cstart + 1;
+ if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+ if (!async && seqcount > 0) {
+ cluster_wbuild_wb(vp, lblocksize,
+ vp->v_cstart, cursize);
+ }
+ } else {
+ struct buf **bpp, **endbp;
+ struct cluster_save *buflist;
+
+ buflist = cluster_collectbufs(vp, bp);
+ endbp = &buflist->bs_children
+ [buflist->bs_nchildren - 1];
+ if (VOP_REALLOCBLKS(vp, buflist)) {
+ /*
+ * Failed, push the previous cluster
+ * if *really* writing sequentially
+ * in the logical file (seqcount > 1),
+ * otherwise delay it in the hopes that
+ * the low level disk driver can
+ * optimize the write ordering.
+ */
+ for (bpp = buflist->bs_children;
+ bpp < endbp; bpp++)
+ brelse(*bpp);
+ free(buflist, M_SEGMENT);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp,
+ lblocksize, vp->v_cstart,
+ cursize);
+ }
+ } else {
+ /*
+ * Succeeded, keep building cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp <= endbp; bpp++)
+ bdwrite(*bpp);
+ free(buflist, M_SEGMENT);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+ return;
+ }
+ }
+ }
+ /*
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
+ */
+ if ((vp->v_type == VREG) &&
+ ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+ bp->b_blkno == -1)) {
+ bawrite(bp);
+ vp->v_clen = 0;
+ vp->v_lasta = bp->b_blkno;
+ vp->v_cstart = lbn + 1;
+ vp->v_lastw = lbn;
+ return;
+ }
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
+ vp->v_cstart = lbn + 1;
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
+ vp->v_cstart = lbn;
+ bdwrite(bp);
+ }
+ } else if (lbn == vp->v_cstart + vp->v_clen) {
+ /*
+ * At end of cluster, write it out if seqcount tells us we
+ * are operating sequentially, otherwise let the buf or
+ * update daemon handle it.
+ */
+ bdwrite(bp);
+ if (seqcount > 1)
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ vp->v_clen = 0;
+ vp->v_cstart = lbn + 1;
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are low on memory, get it going NOW
+ */
+ bawrite(bp);
+ } else {
+ /*
+ * In the middle of a cluster, so just delay the I/O for now.
+ */
+ bdwrite(bp);
+ }
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+}
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * The last lbn argument is the current block on which I/O is being
+ * performed. Check to see that it doesn't fall in the middle of
+ * the current block (if last_bp == NULL).
+ */
+int
+cluster_wbuild(vp, size, start_lbn, len)
+ struct vnode *vp;
+ long size;
+ daddr_t start_lbn;
+ int len;
+{
+ struct buf *bp, *tbp;
+ int i, j, s;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+
+ GIANT_REQUIRED;
+
+ while (len > 0) {
+ s = splbio();
+ /*
+ * If the buffer is not delayed-write (i.e. dirty), or it
+ * is delayed-write but either locked or inval, it cannot
+ * partake in the clustered write.
+ */
+ if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
+ ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) ||
+ BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ ++start_lbn;
+ --len;
+ splx(s);
+ continue;
+ }
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
+
+ /*
+ * Extra memory in the buffer, punt on this buffer.
+ * XXX we could handle this in most cases, but we would
+ * have to push the extra memory down to after our max
+ * possible cluster size and then potentially pull it back
+ * up if the cluster was terminated prematurely--too much
+ * hassle.
+ */
+ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
+ (B_CLUSTEROK | B_VMIO)) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ (len == 1) ||
+ ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+
+ /*
+ * We got a pbuf to make the cluster in,
+ * so initialise it.
+ */
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_magic = tbp->b_magic;
+ bp->b_op = tbp->b_op;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED)
+ bp->b_wcred = crhold(tbp->b_wcred);
+
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ bp->b_offset = tbp->b_offset;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ bp->b_flags |= B_CLUSTER |
+ (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+ /*
+ * From this location in the file, scan forward to see
+ * if there are buffers with adjacent data that need to
+ * be written as well.
+ */
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) { /* If not the first buffer */
+ s = splbio();
+ /*
+ * If the adjacent data is not even in core it
+ * can't need to be written.
+ */
+ if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+ splx(s);
+ break;
+ }
+
+ /*
+ * If it IS in core, but has different
+ * characteristics, or is locked (which
+ * means it could be undergoing a background
+ * I/O or be in a weird state), then don't
+ * cluster with it.
+ */
+ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
+ B_INVAL | B_DELWRI | B_NEEDCOMMIT))
+ != (B_DELWRI | B_CLUSTEROK |
+ (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
+ (tbp->b_flags & B_LOCKED) ||
+ tbp->b_wcred != bp->b_wcred ||
+ BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ splx(s);
+ break;
+ }
+
+ /*
+ * Check that the combined cluster
+ * would make sense with regard to pages
+ * and would not be too large
+ */
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + (dbsize * i)) !=
+ tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) >
+ (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
+ BUF_UNLOCK(tbp);
+ splx(s);
+ break;
+ }
+ /*
+ * Ok, it's passed all the tests,
+ * so remove it from the free list
+ * and mark it busy. We will use it.
+ */
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
+ } /* end of code for non-first buffers only */
+ /* check for latent dependencies to be handled */
+ if ((LIST_FIRST(&tbp->b_dep)) != NULL)
+ buf_start(tbp);
+ /*
+ * If the IO is via the VM then we do some
+ * special VM hackery (yuck). Since the buffer's
+ * block size may not be page-aligned it is possible
+ * for a page to be shared between two buffers. We
+ * have to get rid of the duplication when building
+ * the cluster.
+ */
+ if (tbp->b_flags & B_VMIO) {
+ vm_page_t m;
+
+ if (i != 0) { /* if not first buffer */
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ if (m->flags & PG_BUSY) {
+ bqrelse(tbp);
+ goto finishcluster;
+ }
+ }
+ }
+
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ vm_page_io_start(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+
+ s = splbio();
+ bundirty(tbp);
+ tbp->b_flags &= ~B_DONE;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_iocmd = BIO_WRITE;
+ reassignbuf(tbp, tbp->b_vp); /* put on clean list */
+ ++tbp->b_vp->v_numoutput;
+ splx(s);
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ finishcluster:
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *) bp->b_pages, bp->b_npages);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic(
+ "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
+ }
+ return totalwritten;
+}
+
+/*
+ * Collect together all the buffers in a cluster.
+ * Plus add one additional buffer.
+ */
+static struct cluster_save *
+cluster_collectbufs(vp, last_bp)
+ struct vnode *vp;
+ struct buf *last_bp;
+{
+ struct cluster_save *buflist;
+ struct buf *bp;
+ daddr_t lbn;
+ int i, len;
+
+ len = vp->v_lastw - vp->v_cstart + 1;
+ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+ M_SEGMENT, M_WAITOK);
+ buflist->bs_nchildren = 0;
+ buflist->bs_children = (struct buf **) (buflist + 1);
+ for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
+ (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+ buflist->bs_children[i] = bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ }
+ buflist->bs_children[i] = bp = last_bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ buflist->bs_nchildren = i + 1;
+ return (buflist);
+}
diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c
new file mode 100644
index 0000000..20d9b90
--- /dev/null
+++ b/sys/kern/vfs_conf.c
@@ -0,0 +1,396 @@
+/*-
+ * Copyright (c) 1999 Michael Smith
+ * All rights reserved.
+ * Copyright (c) 1999 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Locate and mount the root filesystem.
+ *
+ * The root filesystem is detailed in the kernel environment variable
+ * vfs.root.mountfrom, which is expected to be in the general format
+ *
+ * <vfsname>:[<path>]
+ * vfsname := the name of a VFS known to the kernel and capable
+ * of being mounted as root
+ * path := disk device name or other data used by the filesystem
+ * to locate its physical store
+ *
+ */
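As a concrete illustration of this format (the device name is only an example): a UFS root on the first slice of the first SCSI disk would be requested as

    vfs.root.mountfrom="ufs:da0s1a"

a value the loader normally derives from /etc/fstab, but which can also be set by hand at the loader prompt or in loader.conf.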
+
+#include "opt_rootdevname.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/reboot.h>
+#include <sys/diskslice.h>
+#include <sys/disklabel.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/proc.h>
+
+#include "opt_ddb.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <paths.h>
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
+
+#define ROOTNAME "root_device"
+
+/*
+ * The vnode of the system's root (/ in the filesystem, without chroot
+ * active).
+ */
+struct vnode *rootvnode;
+
+/*
+ * The root specifiers we will try if RB_CDROM is specified.
+ */
+static char *cdrom_rootdevnames[] = {
+ "cd9660:cd0a",
+ "cd9660:acd0a",
+ "cd9660:wcd0a",
+ NULL
+};
+
+static int vfs_mountroot_try(char *mountfrom);
+static int vfs_mountroot_ask(void);
+static void gets(char *cp);
+
+/* legacy find-root code */
+char *rootdevnames[2] = {NULL, NULL};
+static int setrootbyname(char *name);
+dev_t rootdev = NODEV;
+
+/*
+ * Find and mount the root filesystem
+ */
+void
+vfs_mountroot(void *foo __unused)
+{
+ char *cp;
+ int i, error;
+
+ /*
+ * The root filesystem information is compiled in, and we are
+ * booted with instructions to use it.
+ */
+#ifdef ROOTDEVNAME
+ if ((boothowto & RB_DFLTROOT) &&
+ !vfs_mountroot_try(ROOTDEVNAME))
+ return;
+#endif
+ /*
+ * We are booted with instructions to prompt for the root filesystem,
+ * or to use the compiled-in default when it doesn't exist.
+ */
+ if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) {
+ if (!vfs_mountroot_ask())
+ return;
+ }
+
+ /*
+ * We've been given the generic "use CDROM as root" flag. This is
+ * necessary because the same medium may be used in many different
+ * devices, so we need to search for them.
+ */
+ if (boothowto & RB_CDROM) {
+ for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
+ if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
+ return;
+ }
+ }
+
+ /*
+ * Try to use the value read by the loader from /etc/fstab, or
+ * supplied via some other means. This is the preferred
+ * mechanism.
+ */
+ if ((cp = getenv("vfs.root.mountfrom")) != NULL) {
+ error = vfs_mountroot_try(cp);
+ freeenv(cp);
+ if (!error)
+ return;
+ }
+
+ /*
+ * Try values that may have been computed by the machine-dependent
+ * legacy code.
+ */
+ if (!vfs_mountroot_try(rootdevnames[0]))
+ return;
+ if (!vfs_mountroot_try(rootdevnames[1]))
+ return;
+
+ /*
+ * If we have a compiled-in default, and haven't already tried it, try
+ * it now.
+ */
+#ifdef ROOTDEVNAME
+ if (!(boothowto & RB_DFLTROOT))
+ if (!vfs_mountroot_try(ROOTDEVNAME))
+ return;
+#endif
+
+ /*
+ * Everything so far has failed, prompt on the console if we haven't
+ * already tried that.
+ */
+ if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask())
+ return;
+ panic("Root mount failed, startup aborted.");
+}
+
+/*
+ * Mount (mountfrom) as the root filesystem.
+ */
+static int
+vfs_mountroot_try(char *mountfrom)
+{
+ struct mount *mp;
+ char *vfsname, *path;
+ int error;
+ char patt[32];
+ int s;
+
+ vfsname = NULL;
+ path = NULL;
+ mp = NULL;
+ error = EINVAL;
+
+ if (mountfrom == NULL)
+ return(error); /* don't complain */
+
+ s = splcam(); /* Overkill, but annoying without it */
+ printf("Mounting root from %s\n", mountfrom);
+ splx(s);
+
+ /* parse vfs name and path */
+ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
+ path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
+ vfsname[0] = path[0] = 0;
+ sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
+ if (sscanf(mountfrom, patt, vfsname, path) < 1)
+ goto done;
+
+ /* allocate a root mount */
+ error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME,
+ &mp);
+ if (error != 0) {
+ printf("Can't allocate root mount for filesystem '%s': %d\n",
+ vfsname, error);
+ goto done;
+ }
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /* do our best to set rootdev */
+ if ((path[0] != 0) && setrootbyname(path))
+ printf("setrootbyname failed\n");
+
+ /* If the root device is a type "memory disk", mount RW */
+ if (rootdev != NODEV && devsw(rootdev) &&
+ (devsw(rootdev)->d_flags & D_MEMDISK))
+ mp->mnt_flag &= ~MNT_RDONLY;
+
+ /*
+ * Set the mount path to something useful: the filesystem code
+ * is no longer responsible for initialising f_mntonname unless
+ * it wants to override this default (which is `path').
+ */
+ strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN);
+
+ error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread);
+
+done:
+ if (vfsname != NULL)
+ free(vfsname, M_MOUNT);
+ if (path != NULL)
+ free(path, M_MOUNT);
+ if (error != 0) {
+ if (mp != NULL) {
+ vfs_unbusy(mp, curthread);
+ free(mp, M_MOUNT);
+ }
+ printf("Root mount failed: %d\n", error);
+ } else {
+
+ /* register with list of mounted filesystems */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ /* sanity check system clock against root filesystem timestamp */
+ inittodr(mp->mnt_time);
+ vfs_unbusy(mp, curthread);
+ }
+ return(error);
+}
+
+/*
+ * Spin prompting on the console for a suitable root filesystem
+ */
+static int
+vfs_mountroot_ask(void)
+{
+ char name[128];
+ int i;
+ dev_t dev;
+
+ for(;;) {
+ printf("\nManual root filesystem specification:\n");
+ printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n");
+#if defined(__i386__) || defined(__ia64__)
+ printf(" eg. ufs:da0s1a\n");
+#else
+ printf(" eg. ufs:da0a\n");
+#endif
+ printf(" ? List valid disk boot devices\n");
+ printf(" <empty line> Abort manual input\n");
+ printf("\nmountroot> ");
+ gets(name);
+ if (name[0] == 0)
+ return(1);
+ if (name[0] == '?') {
+ printf("Possibly valid devices for 'ufs' root:\n");
+ for (i = 0; i < NUMCDEVSW; i++) {
+ dev = makedev(i, 0);
+ if (devsw(dev) != NULL)
+ printf(" \"%s\"", devsw(dev)->d_name);
+ }
+ printf("\n");
+ continue;
+ }
+ if (!vfs_mountroot_try(name))
+ return(0);
+ }
+}
+
+/*
+ * Local helper function for vfs_mountroot_ask.
+ */
+static void
+gets(char *cp)
+{
+ char *lp;
+ int c;
+
+ lp = cp;
+ for (;;) {
+ printf("%c", c = cngetc() & 0177);
+ switch (c) {
+ case -1:
+ case '\n':
+ case '\r':
+ *lp++ = '\0';
+ return;
+ case '\b':
+ case '\177':
+ if (lp > cp) {
+ printf(" \b");
+ lp--;
+ }
+ continue;
+ case '#':
+ lp--;
+ if (lp < cp)
+ lp = cp;
+ continue;
+ case '@':
+ case 'u' & 037:
+ lp = cp;
+ printf("%c", '\n');
+ continue;
+ default:
+ *lp++ = c;
+ }
+ }
+}
+
+/*
+ * Convert a given name to the dev_t of the disk-like device
+ * it refers to.
+ */
+dev_t
+getdiskbyname(char *name)
+{
+ char *cp;
+ dev_t dev;
+
+ cp = name;
+ if (!bcmp(cp, "/dev/", 5))
+ cp += 5;
+
+ dev = NODEV;
+ EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev);
+ return (dev);
+}
+
+/*
+ * Set rootdev to match (name), given that we expect it to
+ * refer to a disk-like device.
+ */
+static int
+setrootbyname(char *name)
+{
+ dev_t diskdev;
+
+ diskdev = getdiskbyname(name);
+ if (diskdev != NODEV) {
+ rootdev = diskdev;
+ return (0);
+ }
+
+ return (1);
+}
+
+/* Show the dev_t for a disk specified by name */
+#ifdef DDB
+DB_SHOW_COMMAND(disk, db_getdiskbyname)
+{
+ dev_t dev;
+
+ if (modif[0] == '\0') {
+ db_error("usage: show disk/devicename");
+ return;
+ }
+ dev = getdiskbyname(modif);
+ if (dev != NODEV)
+ db_printf("dev_t = %p\n", dev);
+ else
+ db_printf("No disk device matched.\n");
+}
+#endif
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
new file mode 100644
index 0000000..6bfe085
--- /dev/null
+++ b/sys/kern/vfs_default.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/poll.h>
+
+#include <machine/limits.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+
+static int vop_nolookup(struct vop_lookup_args *);
+static int vop_nostrategy(struct vop_strategy_args *);
+
+/*
+ * This vnode table stores what we want to do if the filesystem doesn't
+ * implement a particular VOP.
+ *
+ * If there is no specific entry here, we will return EOPNOTSUPP.
+ *
+ */
+
+vop_t **default_vnodeop_p;
+static struct vnodeopv_entry_desc default_vnodeop_entries[] = {
+ { &vop_default_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_advlock_desc, (vop_t *) vop_einval },
+ { &vop_bmap_desc, (vop_t *) vop_stdbmap },
+ { &vop_close_desc, (vop_t *) vop_null },
+ { &vop_createvobject_desc, (vop_t *) vop_stdcreatevobject },
+ { &vop_destroyvobject_desc, (vop_t *) vop_stddestroyvobject },
+ { &vop_fsync_desc, (vop_t *) vop_null },
+ { &vop_getpages_desc, (vop_t *) vop_stdgetpages },
+ { &vop_getvobject_desc, (vop_t *) vop_stdgetvobject },
+ { &vop_inactive_desc, (vop_t *) vop_stdinactive },
+ { &vop_ioctl_desc, (vop_t *) vop_enotty },
+ { &vop_islocked_desc, (vop_t *) vop_noislocked },
+ { &vop_lease_desc, (vop_t *) vop_null },
+ { &vop_lock_desc, (vop_t *) vop_nolock },
+ { &vop_lookup_desc, (vop_t *) vop_nolookup },
+ { &vop_open_desc, (vop_t *) vop_null },
+ { &vop_pathconf_desc, (vop_t *) vop_einval },
+ { &vop_putpages_desc, (vop_t *) vop_stdputpages },
+ { &vop_poll_desc, (vop_t *) vop_nopoll },
+ { &vop_readlink_desc, (vop_t *) vop_einval },
+ { &vop_revoke_desc, (vop_t *) vop_revoke },
+ { &vop_strategy_desc, (vop_t *) vop_nostrategy },
+ { &vop_unlock_desc, (vop_t *) vop_nounlock },
+ { NULL, NULL }
+};
+
+static struct vnodeopv_desc default_vnodeop_opv_desc =
+ { &default_vnodeop_p, default_vnodeop_entries };
+
+VNODEOP_SET(default_vnodeop_opv_desc);
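To illustrate how a filesystem plugs into this mechanism, here is a hypothetical sketch ("foofs" and its handlers are invented for illustration): a filesystem lists only the operations it implements and routes everything else through vop_defaultop, which dispatches into the default table above.

    vop_t **foofs_vnodeop_p;
    static struct vnodeopv_entry_desc foofs_vnodeop_entries[] = {
            { &vop_default_desc,    (vop_t *) vop_defaultop },
            { &vop_lookup_desc,     (vop_t *) foofs_lookup },
            { &vop_getattr_desc,    (vop_t *) foofs_getattr },
            { NULL, NULL }
    };
    static struct vnodeopv_desc foofs_vnodeop_opv_desc =
            { &foofs_vnodeop_p, foofs_vnodeop_entries };
    VNODEOP_SET(foofs_vnodeop_opv_desc);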
+
+/*
+ * Series of placeholder functions for various error returns for
+ * VOPs.
+ */
+
+int
+vop_eopnotsupp(struct vop_generic_args *ap)
+{
+ /*
+ printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
+ */
+
+ return (EOPNOTSUPP);
+}
+
+int
+vop_ebadf(struct vop_generic_args *ap)
+{
+
+ return (EBADF);
+}
+
+int
+vop_enotty(struct vop_generic_args *ap)
+{
+
+ return (ENOTTY);
+}
+
+int
+vop_einval(struct vop_generic_args *ap)
+{
+
+ return (EINVAL);
+}
+
+int
+vop_null(struct vop_generic_args *ap)
+{
+
+ return (0);
+}
+
+/*
+ * Used to make a defined VOP fall back to the default VOP.
+ */
+int
+vop_defaultop(struct vop_generic_args *ap)
+{
+
+ return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap));
+}
+
+/*
+ * Helper function to panic on some bad VOPs in some filesystems.
+ */
+int
+vop_panic(struct vop_generic_args *ap)
+{
+
+ panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
+}
+
+/*
+ * vop_std<something> and vop_no<something> are default functions for use by
+ * filesystems that need the "default reasonable" implementation for a
+ * particular operation.
+ *
+ * The documentation for the operations they implement, where it exists, is found
+ * in the VOP_<SOMETHING>(9) manpage (all uppercase).
+ */
+
+/*
+ * Default vop for filesystems that do not support name lookup
+ */
+static int
+vop_nolookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ *ap->a_vpp = NULL;
+ return (ENOTDIR);
+}
+
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a BIO_READ strategy call.
+ * Typically B_INVAL is assumed to already be clear prior to a write
+ * and should not be cleared manually unless you just made the buffer
+ * invalid. BIO_ERROR should be cleared either way.
+ */
+
+static int
+vop_nostrategy (struct vop_strategy_args *ap)
+{
+ printf("No strategy for buffer at %p\n", ap->a_bp);
+ vprint("", ap->a_vp);
+ vprint("", ap->a_bp->b_vp);
+ ap->a_bp->b_ioflags |= BIO_ERROR;
+ ap->a_bp->b_error = EOPNOTSUPP;
+ bufdone(ap->a_bp);
+ return (EOPNOTSUPP);
+}
+
+/*
+ * vop_stdpathconf:
+ *
+ * Standard implementation of POSIX pathconf, to get information about limits
+ * for a filesystem.
+ * Override per filesystem for the case where the filesystem has smaller
+ * limits.
+ */
+int
+vop_stdpathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = LINK_MAX;
+ return (0);
+ case _PC_MAX_CANON:
+ *ap->a_retval = MAX_CANON;
+ return (0);
+ case _PC_MAX_INPUT:
+ *ap->a_retval = MAX_INPUT;
+ return (0);
+ case _PC_PIPE_BUF:
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ case _PC_CHOWN_RESTRICTED:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_VDISABLE:
+ *ap->a_retval = _POSIX_VDISABLE;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Standard lock, unlock and islocked functions.
+ */
+int
+vop_stdlock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+#ifndef DEBUG_LOCKS
+ return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, ap->a_td));
+#else
+ return (debuglockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock,
+ ap->a_td, "vop_stdlock", vp->filename, vp->line));
+#endif
+}
+
+/* See above. */
+int
+vop_stdunlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock,
+ ap->a_td));
+}
+
+/* See above. */
+int
+vop_stdislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (lockstatus(&ap->a_vp->v_lock, ap->a_td));
+}
+
+/* Mark the vnode inactive */
+int
+vop_stdinactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
+ return (0);
+}
+
+/*
+ * Return true for select/poll.
+ */
+int
+vop_nopoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ /*
+ * Return true for read/write. If the user asked for something
+ * special, return POLLNVAL, so that clients have a way of
+ * determining reliably whether or not the extended
+ * functionality is present without hard-coding knowledge
+ * of specific filesystem implementations.
+ */
+ if (ap->a_events & ~POLLSTANDARD)
+ return (POLLNVAL);
+
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Implement poll for local filesystems that support it.
+ */
+int
+vop_stdpoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ if (ap->a_events & ~POLLSTANDARD)
+ return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
+ */
+int
+vop_sharedlock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct thread *a_td;
+ } */ *ap;
+{
+ /*
+ * This code cannot be used until all the non-locking filesystems
+ * (notably NFS) are converted to properly lock and release nodes.
+ * Also, certain vnode operations change the locking state within
+ * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+ * and symlink). Ideally these operations should not change the
+ * lock state, but should be changed to let the caller of the
+ * function unlock them. Otherwise all intermediate vnode layers
+ * (such as union, umapfs, etc) must catch these functions to do
+ * the necessary locking at their layer. Note that the inactive
+ * and lookup operations also change their lock state, but this
+ * cannot be avoided, so these two operations will always need
+ * to be handled in intermediate layers.
+ */
+ struct vnode *vp = ap->a_vp;
+ int vnflags, flags = ap->a_flags;
+
+ switch (flags & LK_TYPE_MASK) {
+ case LK_DRAIN:
+ vnflags = LK_DRAIN;
+ break;
+ case LK_EXCLUSIVE:
+#ifdef DEBUG_VFS_LOCKS
+ /*
+ * Normally, we use shared locks here, but that confuses
+ * the locking assertions.
+ */
+ vnflags = LK_EXCLUSIVE;
+ break;
+#endif
+ case LK_SHARED:
+ vnflags = LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUPGRADE:
+ case LK_DOWNGRADE:
+ return (0);
+ case LK_RELEASE:
+ default:
+ panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK);
+ }
+ if (flags & LK_INTERLOCK)
+ vnflags |= LK_INTERLOCK;
+#ifndef DEBUG_LOCKS
+ return (lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td));
+#else
+ return (debuglockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td,
+ "vop_sharedlock", vp->filename, vp->line));
+#endif
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
+ */
+int
+vop_nolock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct thread *a_td;
+ } */ *ap;
+{
+#ifdef notyet
+ /*
+ * This code cannot be used until all the non-locking filesystems
+ * (notably NFS) are converted to properly lock and release nodes.
+ * Also, certain vnode operations change the locking state within
+ * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+ * and symlink). Ideally these operations should not change the
+ * lock state, but should be changed to let the caller of the
+ * function unlock them. Otherwise all intermediate vnode layers
+ * (such as union, umapfs, etc) must catch these functions to do
+ * the necessary locking at their layer. Note that the inactive
+ * and lookup operations also change their lock state, but this
+ * cannot be avoided, so these two operations will always need
+ * to be handled in intermediate layers.
+ */
+ struct vnode *vp = ap->a_vp;
+ int vnflags, flags = ap->a_flags;
+
+ switch (flags & LK_TYPE_MASK) {
+ case LK_DRAIN:
+ vnflags = LK_DRAIN;
+ break;
+ case LK_EXCLUSIVE:
+ case LK_SHARED:
+ vnflags = LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUPGRADE:
+ case LK_DOWNGRADE:
+ return (0);
+ case LK_RELEASE:
+ default:
+ panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
+ }
+ if (flags & LK_INTERLOCK)
+ vnflags |= LK_INTERLOCK;
+ return(lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td));
+#else /* for now */
+ /*
+ * Since we are not using the lock manager, we must clear
+ * the interlock here.
+ */
+ if (ap->a_flags & LK_INTERLOCK)
+ mtx_unlock(&ap->a_vp->v_interlock);
+ return (0);
+#endif
+}
+
+/*
+ * Do the inverse of vop_nolock, handling the interlock in a compatible way.
+ */
+int
+vop_nounlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ /*
+ * Since we are not using the lock manager, we must clear
+ * the interlock here.
+ */
+ if (ap->a_flags & LK_INTERLOCK)
+ mtx_unlock(&ap->a_vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Return whether or not the node is in use.
+ */
+int
+vop_noislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (0);
+}
+
+/*
+ * Return our mount point, as we will take charge of the writes.
+ */
+int
+vop_stdgetwritemount(ap)
+ struct vop_getwritemount_args /* {
+ struct vnode *a_vp;
+ struct mount **a_mpp;
+ } */ *ap;
+{
+
+ *(ap->a_mpp) = ap->a_vp->v_mount;
+ return (0);
+}
+
+/* Create the VM system backing object for this vnode */
+int
+vop_stdcreatevobject(ap)
+ struct vop_createvobject_args /* {
+ struct vnode *vp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct ucred *cred = ap->a_cred;
+ struct thread *td = ap->a_td;
+ struct vattr vat;
+ vm_object_t object;
+ int error = 0;
+
+ GIANT_REQUIRED;
+
+ if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
+ return (0);
+
+retry:
+ if ((object = vp->v_object) == NULL) {
+ if (vp->v_type == VREG || vp->v_type == VDIR) {
+ if ((error = VOP_GETATTR(vp, &vat, cred, td)) != 0)
+ goto retn;
+ object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
+ } else if (devsw(vp->v_rdev) != NULL) {
+ /*
+ * This simply allocates the biggest object possible
+ * for a disk vnode. This should be fixed, but doesn't
+ * cause any problems (yet).
+ */
+ object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
+ } else {
+ goto retn;
+ }
+ /*
+ * Dereference the reference we just created. This assumes
+ * that the object is associated with the vp.
+ */
+ object->ref_count--;
+ vp->v_usecount--;
+ } else {
+ if (object->flags & OBJ_DEAD) {
+ VOP_UNLOCK(vp, 0, td);
+ tsleep(object, PVM, "vodead", 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ goto retry;
+ }
+ }
+
+ KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
+ vp->v_flag |= VOBJBUF;
+
+retn:
+ return (error);
+}
+
+/* Destroy the VM system object associated with this vnode */
+int
+vop_stddestroyvobject(ap)
+ struct vop_destroyvobject_args /* {
+ struct vnode *vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ vm_object_t obj = vp->v_object;
+
+ GIANT_REQUIRED;
+
+ if (vp->v_object == NULL)
+ return (0);
+
+ if (obj->ref_count == 0) {
+ /*
+ * vclean() may be called twice. The first time
+ * removes the primary reference to the object,
+ * the second time goes one further and is a
+ * special-case to terminate the object.
+ *
+ * don't double-terminate the object
+ */
+ if ((obj->flags & OBJ_DEAD) == 0)
+ vm_object_terminate(obj);
+ } else {
+ /*
+ * Woe to the process that tries to page now :-).
+ */
+ vm_pager_deallocate(obj);
+ }
+ return (0);
+}
+
+/*
+ * Return the underlying VM object. This routine may be called with or
+ * without the vnode interlock held. If called without, the returned
+ * object is not guaranteed to be valid. The syncer typically gets the
+ * object without holding the interlock in order to quickly test whether
+ * it might be dirty before going heavy-weight. vm_objects use zalloc
+ * and thus stable storage, so this is safe.
+ */
+int
+vop_stdgetvobject(ap)
+ struct vop_getvobject_args /* {
+ struct vnode *vp;
+ struct vm_object **objpp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vm_object **objpp = ap->a_objpp;
+
+ if (objpp)
+ *objpp = vp->v_object;
+ return (vp->v_object ? 0 : EINVAL);
+}
+
+/* XXX Needs good comment and VOP_BMAP(9) manpage */
+int
+vop_stdbmap(ap)
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct vnode **a_vpp;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap;
+{
+
+ if (ap->a_vpp != NULL)
+ *ap->a_vpp = ap->a_vp;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+ return (0);
+}
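A worked example, assuming the usual 512-byte DEV_BSIZE: for a filesystem whose f_iosize is 8192 bytes, btodb(8192) is 16, so logical block 5 is reported as device block 80 on the vnode itself, with both run lengths zeroed (i.e. no read-ahead or read-behind hints).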
+
+/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
+int
+vop_stdgetpages(ap)
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_reqpage;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
+ ap->a_count, ap->a_reqpage);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
+int
+vop_stdputpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_sync, ap->a_rtvals);
+}
+
+
+
+/*
+ * vfs default ops
+ * Used to fill the vfs function table with reasonable default return values.
+ */
+int
+vfs_stdmount (mp, path, data, ndp, td)
+ struct mount *mp;
+ char *path;
+ caddr_t data;
+ struct nameidata *ndp;
+ struct thread *td;
+{
+ return (0);
+}
+
+int
+vfs_stdunmount (mp, mntflags, td)
+ struct mount *mp;
+ int mntflags;
+ struct thread *td;
+{
+ return (0);
+}
+
+int
+vfs_stdroot (mp, vpp)
+ struct mount *mp;
+ struct vnode **vpp;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdstatfs (mp, sbp, td)
+ struct mount *mp;
+ struct statfs *sbp;
+ struct thread *td;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdvptofh (vp, fhp)
+ struct vnode *vp;
+ struct fid *fhp;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdstart (mp, flags, td)
+ struct mount *mp;
+ int flags;
+ struct thread *td;
+{
+ return (0);
+}
+
+int
+vfs_stdquotactl (mp, cmds, uid, arg, td)
+ struct mount *mp;
+ int cmds;
+ uid_t uid;
+ caddr_t arg;
+ struct thread *td;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsync (mp, waitfor, cred, td)
+ struct mount *mp;
+ int waitfor;
+ struct ucred *cred;
+ struct thread *td;
+{
+ return (0);
+}
+
+int
+vfs_stdvget (mp, ino, flags, vpp)
+ struct mount *mp;
+ ino_t ino;
+ int flags;
+ struct vnode **vpp;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdfhtovp (mp, fhp, vpp)
+ struct mount *mp;
+ struct fid *fhp;
+ struct vnode **vpp;
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdinit (vfsp)
+ struct vfsconf *vfsp;
+{
+ return (0);
+}
+
+int
+vfs_stduninit (vfsp)
+ struct vfsconf *vfsp;
+{
+ return(0);
+}
+
+int
+vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)
+ struct mount *mp;
+ int cmd;
+ struct vnode *filename_vp;
+ int attrnamespace;
+ const char *attrname;
+ struct thread *td;
+{
+ return(EOPNOTSUPP);
+}
+
+/* end of vfs default ops */
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
new file mode 100644
index 0000000..ec135bd
--- /dev/null
+++ b/sys/kern/vfs_export.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <net/radix.h>
+#include <sys/domain.h>
+#include <sys/dirent.h>
+#include <sys/vnode.h>
+
+static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+
+static void vfs_free_addrlist(struct netexport *nep);
+static int vfs_free_netcred(struct radix_node *rn, void *w);
+static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp);
+
+/*
+ * Network address lookup element
+ */
+struct netcred {
+ struct radix_node netc_rnodes[2];
+ int netc_exflags;
+ struct ucred netc_anon;
+};
+
+/*
+ * Network export information
+ */
+struct netexport {
+ struct netcred ne_defexported; /* Default export */
+ struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */
+};
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by ufs_mount() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ /*
+ * XXX: This routine converts from a `struct xucred'
+ * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This
+ * operation is questionable; for example, what should be done
+ * with fields like cr_uidinfo and cr_prison? Currently, this
+ * routine does not touch them (leaves them as NULL).
+ */
+ if (argp->ex_anon.cr_version != XUCRED_VERSION)
+ return (EINVAL);
+
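+	/*
+	 * A zero address length means the caller is setting the default
+	 * export entry, which applies to hosts matching no explicit entry.
+	 */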
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED)
+ return (EPERM);
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ bzero(&np->netc_anon, sizeof(np->netc_anon));
+ np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
+ np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
+ bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
+ sizeof(np->netc_anon.cr_groups));
+ np->netc_anon.cr_ref = 1;
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ return (0);
+ }
+
+ if (argp->ex_addrlen > MLEN)
+ return (EINVAL);
+
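+	/*
+	 * Allocate the netcred with the sockaddr and mask bytes appended,
+	 * so the whole entry can be released with a single free().
+	 */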
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+		smask = (struct sockaddr *) ((caddr_t)saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ /*
+ * Seems silly to initialize every AF when most are not used,
+ * do so on demand here
+ */
+ for (dom = domains; dom; dom = dom->dom_next)
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ dom->dom_rtattach((void **) &nep->ne_rtable[i],
+ dom->dom_rtoffset);
+ break;
+ }
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ rn = (*rnh->rnh_addaddr) (saddr, smask, rnh,
+ np->netc_rnodes);
+ if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
+ error = EPERM;
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ bzero(&np->netc_anon, sizeof(np->netc_anon));
+ np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
+ np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
+ bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
+ sizeof(np->netc_anon.cr_groups));
+ np->netc_anon.cr_ref = 1;
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* Helper for vfs_free_addrlist. */
+/* ARGSUSED */
+static int
+vfs_free_netcred(rn, w)
+ struct radix_node *rn;
+ void *w;
+{
+ register struct radix_node_head *rnh = (struct radix_node_head *) w;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ free(rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(nep)
+ struct netexport *nep;
+{
+ register int i;
+ register struct radix_node_head *rnh;
+
+ for (i = 0; i <= AF_MAX; i++)
+ if ((rnh = nep->ne_rtable[i])) {
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh);
+ free(rnh, M_RTABLE);
+ nep->ne_rtable[i] = 0;
+ }
+}
+
+/*
+ * High level function to manipulate export options on a mount point
+ * and the passed in netexport.
+ * Struct export_args *argp is the variable used to twiddle options;
+ * the structure is described in sys/mount.h.
+ */
+int
+vfs_export(mp, argp)
+ struct mount *mp;
+ struct export_args *argp;
+{
+ struct netexport *nep;
+ int error;
+
+ nep = mp->mnt_export;
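+	/*
+	 * MNT_DELEXPORT: tear down any existing export list; if this was
+	 * the public (WebNFS) filesystem, clear that as well.
+	 */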
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (nep == NULL)
+ return (ENOENT);
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_export = NULL;
+ free(nep, M_MOUNT);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (nep == NULL) {
+ nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO);
+ mp->mnt_export = nep;
+ }
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ return (error);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ return (error);
+ mp->mnt_flag |= MNT_EXPORTED;
+ }
+ return (0);
+}
+
+/*
+ * Set the publicly exported filesystem (WebNFS). Currently, only
+ * one public filesystem is possible in the spec (RFC 2054 and 2055)
+ */
+int
+vfs_setpublicfs(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, &rvp)))
+ return (error);
+
+ if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+ MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+/*
+ * Used by the filesystems to determine if a given network address
+ * (passed in 'nam') is present in their exports list; returns a pointer
+ * to struct netcred so that the filesystem can examine it for
+ * access rights (read/write/etc).
+ */
+struct netcred *
+vfs_export_lookup(mp, nam)
+ register struct mount *mp;
+ struct sockaddr *nam;
+{
+ struct netexport *nep;
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ nep = mp->mnt_export;
+ if (nep == NULL)
+ return (NULL);
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)(saddr,
+ rnh);
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
+
+/*
+ * XXX: This comment comes from the deprecated ufs_check_export()
+ * XXX: and may not entirely apply, but lacking something better:
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Verify that a host should have access to a filesystem.
+ */
+
+int
+vfs_stdcheckexp(mp, nam, extflagsp, credanonp)
+ struct mount *mp;
+ struct sockaddr *nam;
+ int *extflagsp;
+ struct ucred **credanonp;
+{
+ struct netcred *np;
+
+ np = vfs_export_lookup(mp, nam);
+ if (np == NULL)
+ return (EACCES);
+ *extflagsp = np->netc_exflags;
+ *credanonp = &np->netc_anon;
+ return (0);
+}
+
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..1244e54
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,4862 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $FreeBSD$
+ */
+
+/* For 4.3 integer FS ID compatibility */
+#include "opt_compat.h"
+#include "opt_ffs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sysent.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/linker.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+#include <sys/extattr.h>
+#include <sys/jail.h>
+#include <sys/sysctl.h>
+
+#include <machine/limits.h>
+#include <machine/stdarg.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+static int change_dir(struct nameidata *ndp, struct thread *td);
+static void checkdirs(struct vnode *olddp, struct vnode *newdp);
+static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int getutimes(const struct timeval *, struct timespec *);
+static int setfown(struct thread *td, struct vnode *, uid_t, gid_t);
+static int setfmode(struct thread *td, struct vnode *, int);
+static int setfflags(struct thread *td, struct vnode *, int);
+static int setutimes(struct thread *td, struct vnode *,
+ const struct timespec *, int);
+static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td);
+static int vfs_nmount(struct thread *td, int, struct uio *);
+
+static int usermount = 0; /* if 1, non-root can mount fs. */
+
+int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *);
+
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
+
+/*
+ * Virtual File System System Calls
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct nmount_args {
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+nmount(td, uap)
+ struct thread *td;
+ struct nmount_args /* {
+ syscallarg(struct iovec *) iovp;
+ syscallarg(unsigned int) iovcnt;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct uio auio;
+ struct iovec *iov, *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ unsigned int i;
+ int error;
+ u_int iovlen, iovcnt;
+
+ iovcnt = SCARG(uap, iovcnt);
+ iovlen = iovcnt * sizeof (struct iovec);
+ /*
+	 * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV))
+ return (EINVAL);
+
+ if (iovcnt > UIO_SMALLIOV) {
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_USERSPACE;
+ if ((error = copyin(uap->iovp, iov, iovlen)))
+ goto finish;
+
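+	/* Reject any iovec whose length exceeds MMAXOPTIONLEN. */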
+ for (i = 0; i < iovcnt; i++) {
+ if (iov->iov_len > MMAXOPTIONLEN) {
+ error = EINVAL;
+ goto finish;
+ }
+ iov++;
+ }
+ error = vfs_nmount(td, SCARG(uap, flags), &auio);
+finish:
+ if (needfree != NULL)
+ free(needfree, M_TEMP);
+ return (error);
+}
+
+/*
+ * Release all resources related to the
+ * mount options.
+ */
+void
+vfs_freeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt;
+
+ while (!TAILQ_EMPTY(opts)) {
+ opt = TAILQ_FIRST(opts);
+ TAILQ_REMOVE(opts, opt, link);
+ free(opt->name, M_MOUNT);
+ free(opt->value, M_MOUNT);
+ free(opt, M_MOUNT);
+ }
+ free(opts, M_MOUNT);
+}
+
+int
+kernel_mount(iovp, iovcnt, flags)
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+{
+ struct uio auio;
+ int error;
+
+ /*
+	 * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4))
+ return (EINVAL);
+
+ auio.uio_iov = iovp;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = vfs_nmount(curthread, flags, &auio);
+ return (error);
+}
+
+int
+kernel_vmount(int flags, ...)
+{
+ struct iovec *iovp;
+ struct uio auio;
+ va_list ap;
+ unsigned int iovcnt, iovlen, len;
+ const char *cp;
+ char *buf, *pos;
+ size_t n;
+ int error, i;
+
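+	/*
+	 * First pass: count the name/value strings and their total length so
+	 * that one iovec array and one string buffer can be allocated.
+	 */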
+ len = 0;
+ va_start(ap, flags);
+ for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++)
+ len += strlen(cp) + 1;
+ va_end(ap);
+
+ if (iovcnt < 4 || iovcnt & 1)
+ return (EINVAL);
+
+ iovlen = iovcnt * sizeof (struct iovec);
+ MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK);
+ MALLOC(buf, char *, len, M_MOUNT, M_WAITOK);
+ pos = buf;
+ va_start(ap, flags);
+ for (i = 0; i < iovcnt; i++) {
+ cp = va_arg(ap, const char *);
+ copystr(cp, pos, len - (pos - buf), &n);
+ iovp[i].iov_base = pos;
+ iovp[i].iov_len = n;
+ pos += n;
+ }
+ va_end(ap);
+
+ auio.uio_iov = iovp;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = vfs_nmount(curthread, flags, &auio);
+ FREE(iovp, M_MOUNT);
+ FREE(buf, M_MOUNT);
+ return (error);
+}
+
+/*
+ * vfs_nmount(): actually attempt a filesystem mount.
+ */
+static int
+vfs_nmount(td, fsflags, fsoptions)
+ struct thread *td;
+ int fsflags; /* Flags common to all filesystems. */
+ struct uio *fsoptions; /* Options local to the filesystem. */
+{
+ linker_file_t lf;
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ struct vfsoptlist *optlist;
+ char *fstype, *fspath;
+ int error, flag = 0, kern_flag = 0;
+ int fstypelen, fspathlen;
+ struct vattr va;
+ struct nameidata nd;
+
+ error = vfs_buildopts(fsoptions, &optlist);
+ if (error)
+ return (error);
+
+ /*
+ * We need these two options before the others,
+ * and they are mandatory for any filesystem.
+ * Ensure they are NUL terminated as well.
+ */
+ fstypelen = 0;
+ error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
+ if (error || fstype[fstypelen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+ fspathlen = 0;
+ error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
+ if (error || fspath[fspathlen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+
+ if (usermount == 0) {
+ error = suser(td);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Do not allow NFS export by non-root users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = suser(td);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (suser(td))
+ fsflags |= MNT_NOSUID | MNT_NODEV;
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+ if ((error = namei(&nd)) != 0)
+ goto bad;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (fsflags & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ error = EINVAL;
+ goto bad;
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ kern_flag = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((fsflags & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ error = EOPNOTSUPP; /* Needs translation */
+ goto bad;
+ }
+ /*
+		 * Only root, or the user that did the original mount, is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vfs_unbusy(mp, td);
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_flag |= fsflags &
+ (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT);
+ VOP_UNLOCK(vp, 0, td);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ if (va.va_uid != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ }
+ if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) {
+ vput(vp);
+ goto bad;
+ }
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ error = ENOTDIR;
+ goto bad;
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ /* Only load modules for root (very important!). */
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ error = linker_load_file(fstype, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ goto bad;
+ }
+ lf->userrefs++;
+ /* Look up again to see if the VFS was loaded. */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ error = ENODEV;
+ goto bad;
+ }
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ TAILQ_INIT(&mp->mnt_reservedvnlist);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
+ strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+ VOP_UNLOCK(vp, 0, td);
+
+update:
+ mp->mnt_optnew = optlist;
+ /*
+ * Check if the fs implements the new VFS_NMOUNT()
+ * function, since the new system call was used.
+ */
+ if (mp->mnt_op->vfs_mount != NULL) {
+ printf("%s doesn't support the new mount syscall\n",
+ mp->mnt_vfc->vfc_name);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ if (mp->mnt_flag & MNT_UPDATE)
+ vfs_unbusy(mp, td);
+ else {
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ }
+ vrele(vp);
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+
+ /*
+ * Set the mount level flags.
+ */
+ if (fsflags & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE);
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_NMOUNT(mp, &nd, td);
+ if (!error) {
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ }
+ /*
+ * Prevent external consumers of mount
+	 * options from reading mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+ if (mp->mnt_flag & MNT_UPDATE) {
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~
+ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = kern_flag;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, td);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ vrele(vp);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ struct vnode *newdp;
+
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ if (VFS_ROOT(mp, &newdp))
+ panic("mount: lost mount");
+ checkdirs(vp, newdp);
+ vput(newdp);
+ VOP_UNLOCK(vp, 0, td);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, td);
+ if ((error = VFS_START(mp, 0, td)) != 0) {
+ vrele(vp);
+ goto bad;
+ }
+ } else {
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ vput(vp);
+ goto bad;
+ }
+ return (0);
+bad:
+ vfs_freeopts(optlist);
+ return (error);
+}
+
+/*
+ * Old Mount API.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+mount(td, uap)
+ struct thread *td;
+ struct mount_args /* {
+ syscallarg(char *) type;
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(caddr_t) data;
+ } */ *uap;
+{
+ char *fstype;
+ char *fspath;
+ int error;
+
+ fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
+ fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+
+ /*
+ * vfs_mount() actually takes a kernel string for `type' and
+ * `path' now, so extract them.
+ */
+ error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL);
+ if (error)
+ goto finish;
+ error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL);
+ if (error)
+ goto finish;
+ error = vfs_mount(td, fstype, fspath, SCARG(uap, flags),
+ SCARG(uap, data));
+finish:
+ free(fstype, M_TEMP);
+ free(fspath, M_TEMP);
+ return (error);
+}
+
+/*
+ * vfs_mount(): actually attempt a filesystem mount.
+ *
+ * This routine is designed to be a "generic" entry point for routines
+ * that wish to mount a filesystem. All parameters except `fsdata' are
+ * pointers into kernel space. `fsdata' is currently still a pointer
+ * into userspace.
+ */
+int
+vfs_mount(td, fstype, fspath, fsflags, fsdata)
+ struct thread *td;
+ const char *fstype;
+ char *fspath;
+ int fsflags;
+ void *fsdata;
+{
+ linker_file_t lf;
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ int error, flag = 0, kern_flag = 0;
+ struct vattr va;
+ struct nameidata nd;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ return (ENAMETOOLONG);
+
+ if (usermount == 0) {
+ error = suser(td);
+ if (error)
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = suser(td);
+ if (error)
+ return (error);
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (suser(td))
+ fsflags |= MNT_NOSUID | MNT_NODEV;
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (fsflags & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ kern_flag = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((fsflags & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ /*
+		 * Only root, or the user that did the original mount, is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vfs_unbusy(mp, td);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_flag |= fsflags &
+ (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT);
+ VOP_UNLOCK(vp, 0, td);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ if (va.va_uid != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+ if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) {
+ vput(vp);
+ return (error);
+ }
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ return (ENOTDIR);
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ /* Only load modules for root (very important!). */
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = linker_load_file(fstype, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ return (error);
+ }
+ lf->userrefs++;
+ /* Look up again to see if the VFS was loaded. */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ return (ENODEV);
+ }
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ TAILQ_INIT(&mp->mnt_reservedvnlist);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
+ strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+ VOP_UNLOCK(vp, 0, td);
+update:
+ /*
+ * Check if the fs implements the old VFS_MOUNT()
+ * function, since the old system call was used.
+ */
+ if (mp->mnt_op->vfs_mount == NULL) {
+ printf("%s doesn't support the old mount syscall\n",
+ mp->mnt_vfc->vfc_name);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ if (mp->mnt_flag & MNT_UPDATE)
+ vfs_unbusy(mp, td);
+ else {
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ }
+ vrele(vp);
+ return (EOPNOTSUPP);
+ }
+
+ /*
+ * Set the mount level flags.
+ */
+ if (fsflags & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE);
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~
+ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = kern_flag;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, td);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ vrele(vp);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ struct vnode *newdp;
+
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ if (VFS_ROOT(mp, &newdp))
+ panic("mount: lost mount");
+ checkdirs(vp, newdp);
+ vput(newdp);
+ VOP_UNLOCK(vp, 0, td);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, td);
+ if ((error = VFS_START(mp, 0, td)) != 0)
+ vrele(vp);
+ } else {
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ vput(vp);
+ }
+ return (error);
+}
+
+/*
+ * Scan all active processes to see if any of them have a current
+ * or root directory of `olddp'. If so, replace them with the new
+ * mount point.
+ */
+static void
+checkdirs(olddp, newdp)
+ struct vnode *olddp, *newdp;
+{
+ struct filedesc *fdp;
+ struct proc *p;
+ int nrele;
+
+ if (olddp->v_usecount == 1)
+ return;
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ fdp = p->p_fd;
+ if (fdp == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ nrele = 0;
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ VREF(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ VREF(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ FILEDESC_UNLOCK(fdp);
+ PROC_UNLOCK(p);
+ while (nrele--)
+ vrele(olddp);
+ }
+ sx_sunlock(&allproc_lock);
+ if (rootvnode == olddp) {
+ vrele(rootvnode);
+ VREF(newdp);
+ rootvnode = newdp;
+ }
+}
+
+/*
+ * Unmount a filesystem.
+ *
+ * Note: unmount takes a path to the vnode mounted on as argument,
+ * not the special file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+unmount(td, uap)
+ struct thread *td;
+ register struct unmount_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ mp = vp->v_mount;
+
+ /*
+	 * Only root, or the user that did the original mount, is
+ * permitted to unmount this filesystem.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+
+ /*
+ * Don't allow unmounting the root filesystem.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vput(vp);
+ return (EINVAL);
+ }
+
+ /*
+ * Must be the root of the filesystem
+ */
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ vput(vp);
+ return (dounmount(mp, SCARG(uap, flags), td));
+}
+
+/*
+ * Do the actual filesystem unmount.
+ */
+int
+dounmount(mp, flags, td)
+ struct mount *mp;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *coveredvp, *fsrootvp;
+ int error;
+ int async_flag;
+
+ mtx_lock(&mountlist_mtx);
+ if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ mtx_unlock(&mountlist_mtx);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ /* Allow filesystems to detect that a forced unmount is in progress. */
+ if (flags & MNT_FORCE)
+ mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
+ ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td);
+ if (error) {
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ return (error);
+ }
+ vn_start_write(NULL, &mp, V_WAIT);
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &=~ MNT_ASYNC;
+ cache_purgevfs(mp); /* remove cache entries for this file sys */
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ /* Move process cdir/rdir refs on fs root to underlying vnode. */
+ if (VFS_ROOT(mp, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ checkdirs(fsrootvp, mp->mnt_vnodecovered);
+ if (fsrootvp == rootvnode) {
+ vrele(rootvnode);
+ rootvnode = NULL;
+ }
+ vput(fsrootvp);
+ }
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) ||
+ (flags & MNT_FORCE)) {
+ error = VFS_UNMOUNT(mp, flags, td);
+ }
+ vn_finished_write(mp);
+ if (error) {
+ /* Undo cdir/rdir and rootvnode changes made above. */
+ if (VFS_ROOT(mp, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ checkdirs(mp->mnt_vnodecovered, fsrootvp);
+ if (rootvnode == NULL) {
+ rootvnode = fsrootvp;
+ vref(rootvnode);
+ }
+ vput(fsrootvp);
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+ (void) vfs_allocate_syncvnode(mp);
+ mtx_lock(&mountlist_mtx);
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_flag |= async_flag;
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK,
+ &mountlist_mtx, td);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ return (error);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL)
+ coveredvp->v_mountedhere = NULL;
+ mp->mnt_vfc->vfc_refcount--;
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
+ panic("unmount: dangling vnode");
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td);
+ lockdestroy(&mp->mnt_lock);
+ if (coveredvp != NULL)
+ vrele(coveredvp);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ if (mp->mnt_op->vfs_mount == NULL)
+ vfs_freeopts(mp->mnt_opt);
+ free(mp, M_MOUNT);
+ return (0);
+}
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/* ARGSUSED */
+int
+sync(td, uap)
+ struct thread *td;
+ struct sync_args *uap;
+{
+ struct mount *mp, *nmp;
+ int asyncflag;
+
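+	/*
+	 * For each writable filesystem that is not busy, clear MNT_ASYNC
+	 * while flushing so writes are issued immediately rather than
+	 * delayed, then restore the flag.
+	 */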
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+ vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT,
+ ((td != NULL) ? td->td_ucred : NOCRED), td);
+ mp->mnt_flag |= asyncflag;
+ vn_finished_write(mp);
+ }
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
+#ifdef DIAGNOSTIC
+ if (syncprt)
+ vfs_bufstats();
+#endif /* DIAGNOSTIC */
+#endif
+ return (0);
+}
+
+/* XXX PRISON: could be per prison flag */
+static int prison_quotas;
+#if 0
+SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
+#endif
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+/* ARGSUSED */
+int
+quotactl(td, uap)
+ struct thread *td;
+ register struct quotactl_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) cmd;
+ syscallarg(int) uid;
+ syscallarg(caddr_t) arg;
+ } */ *uap;
+{
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ if (jailed(td->td_ucred) && !prison_quotas)
+ return (EPERM);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
+ vrele(nd.ni_vp);
+ if (error)
+ return (error);
+ error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
+ SCARG(uap, arg), td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+statfs(td, uap)
+ struct thread *td;
+ register struct statfs_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ register struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct nameidata nd;
+ struct statfs sb;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ sp = &mp->mnt_stat;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ error = VFS_STATFS(mp, sp, td);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
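+	/* Hide the filesystem id from unprivileged callers. */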
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+fstatfs(td, uap)
+ struct thread *td;
+ register struct fstatfs_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ struct file *fp;
+ struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct statfs sb;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ mp = ((struct vnode *)fp->f_data)->v_mount;
+ fdrop(fp, td);
+ if (mp == NULL)
+ return (EBADF);
+ sp = &mp->mnt_stat;
+ error = VFS_STATFS(mp, sp, td);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+getfsstat(td, uap)
+ struct thread *td;
+ register struct getfsstat_args /* {
+ syscallarg(struct statfs *) buf;
+ syscallarg(long) bufsize;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct mount *mp, *nmp;
+ register struct statfs *sp;
+ caddr_t sfsp;
+ long count, maxcount, error;
+
+ maxcount = SCARG(uap, bufsize) / sizeof(struct statfs);
+ sfsp = (caddr_t)SCARG(uap, buf);
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
+			 * refresh the fsstat cache unless MNT_WAIT is also
+			 * given; MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY.
+ */
+ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (SCARG(uap, flags) & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp, td))) {
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ continue;
+ }
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = copyout(sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, td);
+ return (error);
+ }
+ sfsp += sizeof(*sp);
+ }
+ count++;
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (sfsp && count > maxcount)
+ td->td_retval[0] = maxcount;
+ else
+ td->td_retval[0] = count;
+ return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fchdir(td, uap)
+ struct thread *td;
+ struct fchdir_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct vnode *vp, *tdp, *vpold;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VREF(vp);
+ fdrop(fp, td);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
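+	/*
+	 * If the directory is a mount point, walk down to the root of the
+	 * filesystem mounted there (repeatedly, for stacked mounts).
+	 */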
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0, 0, td))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, td);
+ if (error)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ FILEDESC_LOCK(fdp);
+ vpold = fdp->fd_cdir;
+ fdp->fd_cdir = vp;
+ FILEDESC_UNLOCK(fdp);
+ vrele(vpold);
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chdir(td, uap)
+ struct thread *td;
+ struct chdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+ struct nameidata nd;
+ struct vnode *vp;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = change_dir(&nd, td)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_LOCK(fdp);
+ vp = fdp->fd_cdir;
+ fdp->fd_cdir = nd.ni_vp;
+ FILEDESC_UNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
+
+/*
+ * Helper function for the raised chroot(2) security check: refuse if
+ * any file descriptors are open directories.
+ */
+static int
+chroot_refuse_vdir_fds(fdp)
+ struct filedesc *fdp;
+{
+ struct vnode *vp;
+ struct file *fp;
+ int fd;
+
+ FILEDESC_LOCK(fdp);
+ for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL)
+ continue;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type == VDIR) {
+ FILEDESC_UNLOCK(fdp);
+ return (EPERM);
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ return (0);
+}
+
+/*
+ * This sysctl determines if we will allow a process to chroot(2) if it
+ * has a directory open:
+ * 0: disallowed for all processes.
+ * 1: allowed for processes that were not already chroot(2)'ed.
+ * 2: allowed for all processes.
+ */
+
+static int chroot_allow_open_directories = 1;
+
+SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
+ &chroot_allow_open_directories, 0, "");
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chroot(td, uap)
+ struct thread *td;
+ struct chroot_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+ struct nameidata nd;
+ struct vnode *vp;
+
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ FILEDESC_LOCK(fdp);
+ if (chroot_allow_open_directories == 0 ||
+ (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
+ FILEDESC_UNLOCK(fdp);
+ error = chroot_refuse_vdir_fds(fdp);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = change_dir(&nd, td)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_LOCK(fdp);
+ vp = fdp->fd_rdir;
+ fdp->fd_rdir = nd.ni_vp;
+ if (!fdp->fd_jdir) {
+ fdp->fd_jdir = nd.ni_vp;
+ VREF(fdp->fd_jdir);
+ }
+ FILEDESC_UNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
+
+/*
+ * Common routine for chroot and chdir.
+ */
+static int
+change_dir(ndp, td)
+ register struct nameidata *ndp;
+ struct thread *td;
+{
+ struct vnode *vp;
+ int error;
+
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ if (error)
+ vput(vp);
+ else
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+open(td, uap)
+ struct thread *td;
+ register struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+ struct vattr vat;
+ struct mount *mp;
+ int cmode, flags, oflags;
+ struct file *nfp;
+ int type, indx, error;
+ struct flock lf;
+ struct nameidata nd;
+
+ oflags = SCARG(uap, flags);
+ if ((oflags & O_ACCMODE) == O_ACCMODE)
+ return (EINVAL);
+ flags = FFLAGS(oflags);
+ error = falloc(td, &nfp, &indx);
+ if (error)
+ return (error);
+ fp = nfp;
+ FILEDESC_LOCK(fdp);
+ cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+ FILEDESC_UNLOCK(fdp);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ td->td_dupfd = -indx - 1; /* XXX check for fdopen */
+ /*
+ * Bump the ref count to prevent another process from closing
+ * the descriptor while we are blocked in vn_open()
+ */
+ fhold(fp);
+ error = vn_open(&nd, &flags, cmode);
+ if (error) {
+ /*
+ * release our own reference
+ */
+ fdrop(fp, td);
+
+ /*
+ * handle special fdopen() case. bleh. dupfdopen() is
+ * responsible for dropping the old contents of ofiles[indx]
+ * if it succeeds.
+ */
+ if ((error == ENODEV || error == ENXIO) &&
+ td->td_dupfd >= 0 && /* XXX from fdopen */
+ (error =
+ dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
+ td->td_retval[0] = indx;
+ return (0);
+ }
+ /*
+ * Clean up the descriptor, but only if another thread hadn't
+ * replaced or closed it.
+ */
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+ }
+ td->td_dupfd = 0;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ /*
+ * There should be 2 references on the file, one from the descriptor
+ * table, and one for us.
+ *
+ * Handle the case where someone closed the file (via its file
+ * descriptor) while we were blocked. The end result should look
+ * like opening the file succeeded but it was immediately closed.
+ */
+ FILEDESC_LOCK(fdp);
+ FILE_LOCK(fp);
+ if (fp->f_count == 1) {
+ KASSERT(fdp->fd_ofiles[indx] != fp,
+ ("Open file descriptor lost all refs"));
+ FILEDESC_UNLOCK(fdp);
+ FILE_UNLOCK(fp);
+ VOP_UNLOCK(vp, 0, td);
+ vn_close(vp, flags & FMASK, fp->f_cred, td);
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return 0;
+ }
+
+ /* assert that vn_open created a backing object if one is needed */
+ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0,
+ ("open: vmio vnode has no backing object after vn_open"));
+
+ fp->f_data = vp;
+ fp->f_flag = flags & FMASK;
+ fp->f_ops = &vnops;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ FILEDESC_UNLOCK(fdp);
+ FILE_UNLOCK(fp);
+ VOP_UNLOCK(vp, 0, td);
+ if (flags & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (flags & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((flags & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ type)) != 0)
+ goto bad;
+ fp->f_flag |= FHASLOCK;
+ }
+ if (flags & O_TRUNC) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto bad;
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ VATTR_NULL(&vat);
+ vat.va_size = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETATTR(vp, &vat, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Release our private reference, leaving the one associated with
+ * the descriptor table intact.
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+bad:
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(td, uap)
+ struct thread *td;
+ register struct ocreat_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, mode) = SCARG(uap, mode);
+ SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC;
+ return (open(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+/* ARGSUSED */
+int
+mknod(td, uap)
+ struct thread *td;
+ register struct mknod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ syscallarg(int) dev;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+ int whiteout = 0;
+ struct nameidata nd;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFCHR:
+ case S_IFBLK:
+ error = suser(td);
+ break;
+ default:
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ break;
+ }
+ if (error)
+ return (error);
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ vrele(vp);
+ error = EEXIST;
+ } else {
+ VATTR_NULL(&vattr);
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ vattr.va_rdev = SCARG(uap, dev);
+ whiteout = 0;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
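+	/*
+	 * If the filesystem is suspended, undo the lookup, wait for the
+	 * suspension to end, and retry from the top.
+	 */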
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ if (whiteout)
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ }
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod");
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkfifo(td, uap)
+ struct thread *td;
+ register struct mkfifo_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+link(td, uap)
+ struct thread *td;
+ register struct link_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ bwillwrite();
+ NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR) {
+ vrele(vp);
+ return (EPERM); /* POSIX */
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td);
+ if ((error = namei(&nd)) == 0) {
+ if (nd.ni_vp != NULL) {
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ }
+ vrele(vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "link");
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+symlink(td, uap)
+ struct thread *td;
+ register struct symlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ char *path;
+ int error;
+ struct nameidata nd;
+
+ path = uma_zalloc(namei_zone, M_WAITOK);
+ if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0)
+ goto out;
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ if (nd.ni_vp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ error = EEXIST;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == 0)
+ vput(nd.ni_vp);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+out:
+ uma_zfree(namei_zone, path);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+/* ARGSUSED */
+int
+undelete(td, uap)
+ struct thread *td;
+ register struct undelete_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ int error;
+ struct mount *mp;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete");
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+unlink(td, uap)
+ struct thread *td;
+ struct unlink_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vput(vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink");
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+lseek(td, uap)
+ struct thread *td;
+ register struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct ucred *cred = td->td_ucred;
+ struct file *fp;
+ struct vnode *vp;
+ struct vattr vattr;
+ off_t offset;
+ int error, noneg;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (ESPIPE);
+ }
+ vp = (struct vnode *)fp->f_data;
+ noneg = (vp->v_type != VCHR);
+ offset = SCARG(uap, offset);
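+ /*
+ * L_INCR, L_XTND and L_SET are the historic spellings of
+ * SEEK_CUR, SEEK_END and SEEK_SET.
+ */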
+ switch (SCARG(uap, whence)) {
+ case L_INCR:
+ if (noneg &&
+ (fp->f_offset < 0 ||
+ (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
+ fdrop(fp, td);
+ return (EOVERFLOW);
+ }
+ offset += fp->f_offset;
+ break;
+ case L_XTND:
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETATTR(vp, &vattr, cred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (noneg &&
+ (vattr.va_size > OFF_MAX ||
+ (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
+ fdrop(fp, td);
+ return (EOVERFLOW);
+ }
+ offset += vattr.va_size;
+ break;
+ case L_SET:
+ break;
+ default:
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ if (noneg && offset < 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ fp->f_offset = offset;
+ *(off_t *)(td->td_retval) = fp->f_offset;
+ fdrop(fp, td);
+ return (0);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(td, uap)
+ struct thread *td;
+ register struct olseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ nuap;
+ int error;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, offset) = SCARG(uap, offset);
+ SCARG(&nuap, whence) = SCARG(uap, whence);
+ error = lseek(td, &nuap);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions using passed credentials.
+ */
+static int
+vn_access(vp, user_flags, cred, td)
+ struct vnode *vp;
+ int user_flags;
+ struct ucred *cred;
+ struct thread *td;
+{
+ int error, flags;
+
+ /* Flags == 0 means only check for existence. */
+ error = 0;
+ if (user_flags) {
+ flags = 0;
+ if (user_flags & R_OK)
+ flags |= VREAD;
+ if (user_flags & W_OK)
+ flags |= VWRITE;
+ if (user_flags & X_OK)
+ flags |= VEXEC;
+ if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, flags, cred, td);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "real" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+access(td, uap)
+ struct thread *td;
+ register struct access_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct ucred *cred, *tmpcred;
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ /*
+ * Create and modify a temporary credential instead of one that
+ * is potentially shared.  Modifying a shared credential could
+ * also corrupt socket buffer accounting, which can run in an
+ * interrupt context.
+ *
+ * XXX - Depending on how "threads" are finally implemented, it
+ * may be better to explicitly pass the credential to namei()
+ * rather than to modify the potentially shared process structure.
+ */
+ cred = td->td_ucred;
+ tmpcred = crdup(cred);
+ tmpcred->cr_uid = cred->cr_ruid;
+ tmpcred->cr_groups[0] = cred->cr_rgid;
+ td->td_ucred = tmpcred;
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ goto out1;
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, SCARG(uap, flags), tmpcred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+out1:
+ td->td_ucred = cred;
+ crfree(tmpcred);
+ return (error);
+}
+
+/*
+ * Check access permissions using "effective" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct eaccess_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+eaccess(td, uap)
+ struct thread *td;
+ register struct eaccess_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct nameidata nd;
+ struct vnode *vp;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+ostat(td, uap)
+ struct thread *td;
+ register struct ostat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout(&osb, SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+olstat(td, uap)
+ struct thread *td;
+ register struct olstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout(&osb, SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
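+ /* The old stat has only a 32-bit st_size; flag larger files with -2. */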
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atime = st->st_atime;
+ ost->st_mtime = st->st_mtime;
+ ost->st_ctime = st->st_ctime;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(td, uap)
+ struct thread *td;
+ register struct stat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+ struct nameidata nd;
+
+#ifdef LOOKUP_SHARED
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, path), td);
+#else
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+#endif
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(td, uap)
+ struct thread *td;
+ register struct lstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Implementation of the NetBSD stat() function.
+ * XXX This should probably be collapsed with the FreeBSD version,
+ * as the differences are only due to vn_stat() clearing spares at
+ * the end of the structures. vn_stat could be split to avoid this,
+ * and thus collapse the following to close to zero code.
+ */
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+ bzero(nsb, sizeof *nsb);
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atimespec = sb->st_atimespec;
+ nsb->st_mtimespec = sb->st_mtimespec;
+ nsb->st_ctimespec = sb->st_ctimespec;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+ nsb->st_createtimespec = sb->st_createtimespec;
+}
+
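+/*
+ * NetBSD stat.  Get file status; this version follows links.
+ */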
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(td, uap)
+ struct thread *td;
+ register struct nstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * NetBSD lstat. Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK) {
+ error = EINVAL;
+ /* Nothing was read; report a zero-length result below. */
+ auio.uio_resid = SCARG(uap, count);
+ } else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+ vput(vp);
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ /*
+ * Prevent non-root users from setting flags on devices.  When
+ * a device is reused, users could retain ownership of the device
+ * if they were allowed to set flags, since programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+chflags(td, uap)
+ struct thread *td;
+ register struct chflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Same as chflags() but doesn't follow symlinks.
+ */
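+#ifndef _SYS_SYSPROTO_H_
+struct lchflags_args {
+ char *path;
+ int flags;
+};
+#endif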
+int
+lchflags(td, uap)
+ struct thread *td;
+ register struct lchflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+fchflags(td, uap)
+ struct thread *td;
+ register struct fchflags_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for chmod(), lchmod() and fchmod().
+ */
+static int
+setfmode(td, vp, mode)
+ struct thread *td;
+ struct vnode *vp;
+ int mode;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+chmod(td, uap)
+ struct thread *td;
+ register struct chmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name (don't follow links.)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+lchmod(td, uap)
+ struct thread *td;
+ register struct lchmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+fchmod(td, uap)
+ struct thread *td;
+ register struct fchmod_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ error = setfmode(td, vp, SCARG(uap, mode));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation for chown(), lchown(), and fchown()
+ */
+static int
+setfown(td, vp, uid, gid)
+ struct thread *td;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+chown(td, uap)
+ struct thread *td;
+ register struct chown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name, do not cross symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+lchown(td, uap)
+ struct thread *td;
+ register struct lchown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(td, uap)
+ struct thread *td;
+ register struct fchown_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ error = setfown(td, vp, SCARG(uap, uid), SCARG(uap, gid));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common argument-conversion code for utimes(), lutimes(), and futimes().
+ */
+static int
+getutimes(usrtvp, tsp)
+ const struct timeval *usrtvp;
+ struct timespec *tsp;
+{
+ struct timeval tv[2];
+ int error;
+
+ if (usrtvp == NULL) {
+ microtime(&tv[0]);
+ TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
+ tsp[1] = tsp[0];
+ } else {
+ if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0)
+ return (error);
+ TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
+ TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
+ }
+ return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+setutimes(td, vp, ts, nullflag)
+ struct thread *td;
+ struct vnode *vp;
+ const struct timespec *ts;
+ int nullflag;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_atime = ts[0];
+ vattr.va_mtime = ts[1];
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(td, uap)
+ struct thread *td;
+ register struct utimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct timeval *usrtvp;
+ int error;
+ struct nameidata nd;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(td, uap)
+ struct thread *td;
+ register struct lutimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct timeval *usrtvp;
+ int error;
+ struct nameidata nd;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(td, uap)
+ struct thread *td;
+ register struct futimes_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct file *fp;
+ struct timeval *usrtvp;
+ int error;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+truncate(td, uap)
+ struct thread *td;
+ register struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ }
+ vput(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+ftruncate(td, uap)
+ struct thread *td;
+ register struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FWRITE) == 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ vp = (struct vnode *)fp->f_data;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred, td);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ fdrop(fp, td);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+otruncate(td, uap)
+ struct thread *td;
+ register struct otruncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (truncate(td, &nuap));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+oftruncate(td, uap)
+ struct thread *td;
+ register struct oftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (ftruncate(td, &nuap));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fsync(td, uap)
+ struct thread *td;
+ struct fsync_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct file *fp;
+ vm_object_t obj;
+ int error;
+
+ GIANT_REQUIRED;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (VOP_GETVOBJECT(vp, &obj) == 0) {
+ vm_object_page_clean(obj, 0, 0, 0);
+ }
+ error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td);
+#ifdef SOFTUPDATES
+ if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
+ error = softdep_fsync(vp);
+#endif
+
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories,
+ * or both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+/* ARGSUSED */
+int
+rename(td, uap)
+ struct thread *td;
+ register struct rename_args /* {
+ syscallarg(char *) from;
+ syscallarg(char *) to;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+ bwillwrite();
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+ SCARG(uap, from), td);
+ if ((error = namei(&fromnd)) != 0)
+ return (error);
+ fvp = fromnd.ni_vp;
+ if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, to), td);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&tond)) != 0) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ if (fvp == tdvp)
+ error = EINVAL;
+ /*
+ * If the source is the same as the destination (that is, the
+ * same inode number with the same name in the same directory),
+ * then there is nothing to do.
+ */
+ if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+ fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+ !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+ fromnd.ni_cnd.cn_namelen))
+ error = -1;
+out:
+ if (!error) {
+ VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE);
+ if (fromnd.ni_dvp != tdvp) {
+ VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ }
+ if (tvp) {
+ VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE);
+ }
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ } else {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename");
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
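+ /*
+ * A -1 "error" from the identical source/target check above
+ * means the rename is a no-op and should report success.
+ */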
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkdir(td, uap)
+ struct thread *td;
+ register struct mkdir_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+
+ return (vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td));
+}
+
+int
+vn_mkdir(path, mode, segflg, td)
+ char *path;
+ int mode;
+ enum uio_seg segflg;
+ struct thread *td;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ /*
+ * XXX namei called with LOCKPARENT but not LOCKLEAF has
+ * the strange behaviour of leaving the vnode unlocked
+ * if the target is the same vnode as the parent.
+ */
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (!error)
+ vput(nd.ni_vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir");
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+rmdir(td, uap)
+ struct thread *td;
+ struct rmdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_flag & VROOT) {
+ error = EBUSY;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vput(vp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ vn_finished_write(mp);
+out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir");
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(td, uap)
+ struct thread *td;
+ register struct ogetdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ if (SCARG(uap, count) > 64 * 1024)
+ return (EINVAL);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ loff = auio.uio_offset = fp->f_offset;
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = SCARG(uap, count);
+ MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = SCARG(uap, count) - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ FREE(dirbuf, M_TEMP);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (SCARG(uap, count) == auio.uio_resid) {
+ if (union_dircheckp) {
+ error = union_dircheckp(td, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+ if ((vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ }
+ error = copyout(&loff, SCARG(uap, basep), sizeof(long));
+ fdrop(fp, td);
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+getdirentries(td, uap)
+ struct thread *td;
+ register struct getdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ long loff;
+ int error, eofflag;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ loff = auio.uio_offset = fp->f_offset;
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (SCARG(uap, count) == auio.uio_resid) {
+ if (union_dircheckp) {
+ error = union_dircheckp(td, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+ if ((vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ }
+ if (SCARG(uap, basep) != NULL) {
+ error = copyout(&loff, SCARG(uap, basep), sizeof(long));
+ }
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ fdrop(fp, td);
+ return (error);
+}
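+
+/*
+ * Read directory entries in a filesystem independent format; a thin
+ * wrapper around getdirentries() that does not return the seek base.
+ */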
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+getdents(td, uap)
+ struct thread *td;
+ register struct getdents_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(size_t) count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return (getdirentries(td, &ap));
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ *
+ * MP SAFE
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+umask(td, uap)
+ struct thread *td;
+ struct umask_args /* {
+ syscallarg(int) newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ fdp = td->td_proc->p_fd;
+ td->td_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ return (0);
+}
+
+/*
+ * Void all references to file by ripping underlying filesystem
+ * away from vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+revoke(td, uap)
+ struct thread *td;
+ register struct revoke_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path),
+ td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VCHR) {
+ vput(vp);
+ return (EINVAL);
+ }
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ if (td->td_ucred->cr_uid != vattr.va_uid) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ goto out;
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto out;
+ if (vcount(vp) > 1)
+ VOP_REVOKE(vp, REVOKEALL);
+ vn_finished_write(mp);
+out:
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ * A reference on the file entry is held upon returning.
+ */
+int
+getvnode(fdp, fd, fpp)
+ struct filedesc *fdp;
+ int fd;
+ struct file **fpp;
+{
+ int error;
+ struct file *fp;
+
+ fp = NULL;
+ if (fdp == NULL)
+ error = EBADF;
+ else {
+ FILEDESC_LOCK(fdp);
+ if ((u_int)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ error = EBADF;
+ else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
+ fp = NULL;
+ error = EINVAL;
+ } else {
+ fhold(fp);
+ error = 0;
+ }
+ FILEDESC_UNLOCK(fdp);
+ }
+ *fpp = fp;
+ return (error);
+}
+/*
+ * Get (NFS) file handle
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+getfh(td, uap)
+ struct thread *td;
+ register struct getfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VFS_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+/*
+ * Syscall for rpc.lockd to use to translate an NFS file handle into
+ * an open descriptor.
+ *
+ * Warning: do not remove the suser() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhopen_args {
+ const struct fhandle *u_fhp;
+ int flags;
+};
+#endif
+int
+fhopen(td, uap)
+ struct thread *td;
+ struct fhopen_args /* {
+ syscallarg(const struct fhandle *) u_fhp;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct proc *p = td->td_proc;
+ struct mount *mp;
+ struct vnode *vp;
+ struct fhandle fhp;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ struct flock lf;
+ struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ int fmode, mode, error, type;
+ struct file *nfp;
+ int indx;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ fmode = FFLAGS(SCARG(uap, flags));
+ /* why not allow a non-read/write open for our lockd? */
+ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
+ return (EINVAL);
+ error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp));
+ if (error)
+ return(error);
+ /* find the mount point */
+ mp = vfs_getvfs(&fhp.fh_fsid);
+ if (mp == NULL)
+ return (ESTALE);
+ /* now give me my vnode, it gets returned to me locked */
+ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
+ if (error)
+ return (error);
+ /*
+ * From now on we must not forget about the vnode: any error
+ * path that aborts must vput(vp), i.e. just set error and
+ * 'goto bad;'.
+ */
+
+ /*
+ * from vn_open
+ */
+ if (vp->v_type == VLNK) {
+ error = EMLINK;
+ goto bad;
+ }
+ if (vp->v_type == VSOCK) {
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+ mode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto bad;
+ }
+ error = vn_writechk(vp);
+ if (error)
+ goto bad;
+ mode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ mode |= VREAD;
+ if (mode) {
+ error = VOP_ACCESS(vp, mode, td->td_ucred, td);
+ if (error)
+ goto bad;
+ }
+ if (fmode & O_TRUNC) {
+ VOP_UNLOCK(vp, 0, td); /* XXX */
+ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */
+ VATTR_NULL(vap);
+ vap->va_size = 0;
+ error = VOP_SETATTR(vp, vap, td->td_ucred, td);
+ vn_finished_write(mp);
+ if (error)
+ goto bad;
+ }
+ error = VOP_OPEN(vp, fmode, td->td_ucred, td);
+ if (error)
+ goto bad;
+ /*
+ * Make sure that a VM object is created for VMIO support.
+ */
+ if (vn_canvmio(vp) == TRUE) {
+ if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0)
+ goto bad;
+ }
+ if (fmode & FWRITE)
+ vp->v_writecount++;
+
+ /*
+ * end of vn_open code
+ */
+
+ if ((error = falloc(td, &nfp, &indx)) != 0) {
+ if (fmode & FWRITE)
+ vp->v_writecount--;
+ goto bad;
+ }
+ fp = nfp;
+
+ /*
+ * Hold an extra reference to avoid having fp ripped out
+ * from under us while we block in the lock op
+ */
+ fhold(fp);
+ nfp->f_data = vp;
+ nfp->f_flag = fmode & FMASK;
+ nfp->f_ops = &vnops;
+ nfp->f_type = DTYPE_VNODE;
+ if (fmode & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (fmode & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((fmode & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ VOP_UNLOCK(vp, 0, td);
+ if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ type)) != 0) {
+ /*
+ * The lock request failed. Normally close the
+ * descriptor but handle the case where someone might
+ * have dup()d or close()d it when we weren't looking.
+ */
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ /*
+ * release our private reference
+ */
+ fdrop(fp, td);
+ return(error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ fp->f_flag |= FHASLOCK;
+ }
+ if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0))
+ vfs_object_create(vp, td, td->td_ucred);
+
+ VOP_UNLOCK(vp, 0, td);
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+
+bad:
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Stat an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstat_args {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+};
+#endif
+int
+fhstat(td, uap)
+ struct thread *td;
+ register struct fhstat_args /* {
+ syscallarg(struct fhandle *) u_fhp;
+ syscallarg(struct stat *) sb;
+ } */ *uap;
+{
+ struct stat sb;
+ fhandle_t fh;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t));
+ if (error)
+ return (error);
+
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
+ return (error);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, sb), sizeof(sb));
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+};
+#endif
+int
+fhstatfs(td, uap)
+ struct thread *td;
+ struct fhstatfs_args /* {
+ syscallarg(struct fhandle) *u_fhp;
+ syscallarg(struct statfs) *buf;
+ } */ *uap;
+{
+ struct statfs *sp;
+ struct mount *mp;
+ struct vnode *vp;
+ struct statfs sb;
+ fhandle_t fh;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0)
+ return (error);
+
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
+ return (error);
+ mp = vp->v_mount;
+ sp = &mp->mnt_stat;
+ vput(vp);
+ if ((error = VFS_STATFS(mp, sp, td)) != 0)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Syscall to push extended attribute configuration information into the
+ * VFS. Accepts a path, which it converts to a mountpoint, as well as
+ * a command (int cmd), an attribute name and misc data.  The
+ * attribute name is copied into a kernel buffer here before being
+ * handed to VFS_EXTATTRCTL(), so the VFS op no longer has to reach
+ * into userspace for it.
+ *
+ * Currently this is used only by UFS Extended Attributes.
+ */
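+#ifndef _SYS_SYSPROTO_H_
+struct extattrctl_args {
+ const char *path;
+ int cmd;
+ const char *filename;
+ int attrnamespace;
+ const char *attrname;
+};
+#endif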
+int
+extattrctl(td, uap)
+ struct thread *td;
+ struct extattrctl_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) cmd;
+ syscallarg(const char *) filename;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * uap->filename is not always defined. If it is, grab a vnode lock,
+ * which VFS_EXTATTRCTL() will later release.
+ */
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ uap->filename, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ return (error);
+ }
+ mp = nd.ni_vp->v_mount;
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, 0);
+ if (error) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ return (error);
+ }
+
+ if (uap->attrname != NULL) {
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp,
+ uap->attrnamespace, attrname, td);
+ } else {
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp,
+ uap->attrnamespace, NULL, td);
+ }
+
+ vn_finished_write(mp_writable);
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd,
+ * filename_vp, so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+
+ return (error);
+}
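+
+/*
+ * Illustrative sketch (not part of this change): a userland consumer of
+ * extattrctl(2) via the libc wrapper.  The UFS_EXTATTR_CMD_* constants
+ * are assumed to come from <ufs/ufs/extattr.h>, EXTATTR_NAMESPACE_SYSTEM
+ * from <sys/extattr.h>, and the mount point and backing-file path are
+ * hypothetical.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/extattr.h>
+ *	#include <ufs/ufs/extattr.h>
+ *	#include <err.h>
+ *
+ *	if (extattrctl("/fs", UFS_EXTATTR_CMD_START, NULL, 0, NULL) < 0)
+ *		err(1, "start");
+ *	if (extattrctl("/fs", UFS_EXTATTR_CMD_ENABLE,
+ *	    "/fs/.attribute/system/md5", EXTATTR_NAMESPACE_SYSTEM, "md5") < 0)
+ *		err(1, "enable");
+ */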
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+extattr_set_file(td, uap)
+ struct thread *td;
+ struct extattr_set_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+extattr_set_fd(td, uap)
+ struct thread *td;
+ struct extattr_set_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
+ return (error);
+
+ error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the
+ * maximum read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;	/* single iovec, as in extattr_set_vp() */
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
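+
+/*
+ * Illustrative sketch (not part of this change): how the NULL-data
+ * convention above surfaces through the extattr_get_file(2) wrapper.
+ * Passing a NULL buffer asks only for the attribute's size, which a
+ * caller can use to allocate an exactly-sized buffer before the real
+ * read; the attribute name "comment" is hypothetical.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/extattr.h>
+ *	#include <stdlib.h>
+ *
+ *	char *buf;
+ *	ssize_t len;
+ *
+ *	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "comment",
+ *	    NULL, 0);
+ *	if (len >= 0 && (buf = malloc(len)) != NULL)
+ *		len = extattr_get_file(path, EXTATTR_NAMESPACE_USER,
+ *		    "comment", buf, len);
+ */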
+
+int
+extattr_get_file(td, uap)
+ struct thread *td;
+ struct extattr_get_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+extattr_get_fd(td, uap)
+ struct thread *td;
+ struct extattr_get_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+
+ error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", thread "td"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred,
+ td);
+
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+extattr_delete_file(td, uap)
+ struct thread *td;
+ struct extattr_delete_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+extattr_delete_fd(td, uap)
+ struct thread *td;
+ struct extattr_delete_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+
+ error = extattr_delete_vp(vp, uap->attrnamespace, attrname, td);
+
+ fdrop(fp, td);
+ return (error);
+}
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
new file mode 100644
index 0000000..b221cd3
--- /dev/null
+++ b/sys/kern/vfs_init.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+
+
+MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
+
+/*
+ * The highest defined VFS number.
+ */
+int maxvfsconf = VFS_GENERIC + 1;
+
+/*
+ * Single-linked list of configured VFSes.
+ * New entries are added/deleted by vfs_register()/vfs_unregister()
+ */
+struct vfsconf *vfsconf;
+
+/*
+ * vfs_init.c
+ *
+ * Allocate and fill in operations vectors.
+ *
+ * An undocumented feature of this approach to defining operations is that
+ * there can be multiple entries in vfs_opv_descs for the same operations
+ * vector. This allows third parties to extend the set of operations
+ * supported by another layer in a binary compatible way. For example,
+ * assume that NFS needed to be modified to support Ficus. NFS has an entry
+ * (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
+ * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
+ * listing those new operations Ficus adds to NFS, all without modifying the
+ * NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
+ * that is a(whole)nother story.) This is a feature.
+ */
+
+/* Table of known vnodeop vectors (list of VFS vnode vectors) */
+static const struct vnodeopv_desc **vnodeopv_descs;
+static int vnodeopv_num;
+
+/* Table of known descs (list of vnode op handlers "vop_access_desc") */
+static struct vnodeop_desc **vfs_op_descs;
+/* Reference counts for vfs_op_descs */
+static int *vfs_op_desc_refs;
+/* Number of descriptions */
+static int num_op_descs;
+/* Number of entries in each description */
+static int vfs_opv_numops = 64;
+
+/* Allow this number to be tuned at boot */
+TUNABLE_INT("vfs.opv_numops", &vfs_opv_numops);
+SYSCTL_INT(_vfs, OID_AUTO, opv_numops, CTLFLAG_RD, &vfs_opv_numops,
+ 0, "Maximum number of operations in vop_t vector");
+
+static int int_cmp(const void *a, const void *b);
+
+static int
+int_cmp(const void *a, const void *b)
+{
+ return(*(const int *)a - *(const int *)b);
+}
+
+/*
+ * Recalculate the operations vector/description (those parts of it that can
+ * be recalculated, that is.)
+ * Always allocate an operations vector large enough to hold vfs_opv_numops
+ * entries. The vector is never freed or deallocated once it is initialized,
+ * so that vnodes may safely reference it through their v_op pointer without
+ * the vector changing suddenly from under them.
+ */
+static void
+vfs_opv_recalc(void)
+{
+ int i, j, k;
+ int *vfs_op_offsets;
+ vop_t ***opv_desc_vector_p;
+ vop_t **opv_desc_vector;
+ struct vnodeopv_entry_desc *opve_descp;
+ const struct vnodeopv_desc *opv;
+
+ if (vfs_op_descs == NULL)
+ panic("vfs_opv_recalc called with null vfs_op_descs");
+
+ /*
+ * Allocate and initialize temporary array to store
+ * offsets. Sort it to put all uninitialized entries
+ * first and to make holes in existing offset sequence
+ * detectable.
+ */
+ MALLOC(vfs_op_offsets, int *,
+ num_op_descs * sizeof(int), M_TEMP, M_WAITOK);
+ if (vfs_op_offsets == NULL)
+ panic("vfs_opv_recalc: no memory");
+ for (i = 0; i < num_op_descs; i++)
+ vfs_op_offsets[i] = vfs_op_descs[i]->vdesc_offset;
+ qsort(vfs_op_offsets, num_op_descs, sizeof(int), int_cmp);
+
+ /*
+ * Run through and make sure all known descs have an offset.
+ * Use vfs_op_offsets to locate holes in offset sequence and
+ * reuse them.
+ * vop_default_desc is hardwired at offset 1, and offset 0
+ * is a panic sanity check.
+ */
+ j = 1; k = 1;
+ for (i = 0; i < num_op_descs; i++) {
+ if (vfs_op_descs[i]->vdesc_offset != 0)
+ continue;
+ /*
+ * Look at two adjacent entries vfs_op_offsets[j - 1] and
+ * vfs_op_offsets[j] and see if we can fit a new offset
+ * number in between. If not, look at the next pair until a
+ * hole is found or the end of the vfs_op_offsets vector is
+ * reached. j has been initialized to 1 above so that
+ * referencing the (j-1)-th element is safe and the loop will
+ * never execute if num_op_descs is 1. For each new value
+ * of i the j loop picks up where the previous iteration
+ * left off. When the last hole has been consumed or if no
+ * hole has been found, we will start allocating new numbers
+ * starting from the biggest already available offset + 1.
+ */
+ for (; j < num_op_descs; j++) {
+ if (vfs_op_offsets[j - 1] < k && vfs_op_offsets[j] > k)
+ break;
+ k = vfs_op_offsets[j] + 1;
+ }
+ vfs_op_descs[i]->vdesc_offset = k++;
+ }
+ FREE(vfs_op_offsets, M_TEMP);
+
+ /* Panic if new vops will cause vector overflow */
+ if (k > vfs_opv_numops)
+ panic("VFS: Ran out of vop_t vector entries. %d entries required, only %d available.\n", k, vfs_opv_numops);
+
+ /*
+ * Allocate and fill in the vectors
+ */
+ for (i = 0; i < vnodeopv_num; i++) {
+ opv = vnodeopv_descs[i];
+ opv_desc_vector_p = opv->opv_desc_vector_p;
+ if (*opv_desc_vector_p == NULL)
+ MALLOC(*opv_desc_vector_p, vop_t **,
+ vfs_opv_numops * sizeof(vop_t *), M_VNODE,
+ M_WAITOK | M_ZERO);
+
+ /* Fill in, with slot 0 being to return EOPNOTSUPP */
+ opv_desc_vector = *opv_desc_vector_p;
+ opv_desc_vector[0] = (vop_t *)vop_eopnotsupp;
+ for (j = 0; opv->opv_desc_ops[j].opve_op; j++) {
+ opve_descp = &(opv->opv_desc_ops[j]);
+ opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
+ opve_descp->opve_impl;
+ }
+
+ /* Replace unfilled routines with their default (slot 1). */
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ if (opv_desc_vector[1] == NULL)
+ panic("vfs_opv_recalc: vector without a default.");
+ for (j = 0; j < vfs_opv_numops; j++)
+ if (opv_desc_vector[j] == NULL)
+ opv_desc_vector[j] = opv_desc_vector[1];
+ }
+}
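+
+/*
+ * Worked example for the offset-assignment loop above (illustrative
+ * only): with num_op_descs = 6 and sorted offsets { 0, 0, 1, 2, 4, 7 },
+ * the two descs still at offset 0 are assigned k = 3 (the hole between
+ * 2 and 4) and then k = 5 (the hole between 4 and 7).  Once no holes
+ * remain, assignment continues from the largest existing offset + 1.
+ */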
+
+/* Add a set of vnode operations (a description) to the table above. */
+void
+vfs_add_vnodeops(const void *data)
+{
+ const struct vnodeopv_desc *opv;
+ const struct vnodeopv_desc **newopv;
+ struct vnodeop_desc **newop;
+ int *newref;
+ vop_t **opv_desc_vector;
+ struct vnodeop_desc *desc;
+ int i, j;
+
+ opv = (const struct vnodeopv_desc *)data;
+ MALLOC(newopv, const struct vnodeopv_desc **,
+ (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK);
+ if (vnodeopv_descs) {
+ bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv));
+ FREE(vnodeopv_descs, M_VNODE);
+ }
+ newopv[vnodeopv_num] = opv;
+ vnodeopv_descs = newopv;
+ vnodeopv_num++;
+
+ /* See if we have turned up a new vnode op desc */
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) {
+ for (j = 0; j < num_op_descs; j++) {
+ if (desc == vfs_op_descs[j]) {
+ /* found it, increase reference count */
+ vfs_op_desc_refs[j]++;
+ break;
+ }
+ }
+ if (j == num_op_descs) {
+ /* not found, new entry */
+ MALLOC(newop, struct vnodeop_desc **,
+ (num_op_descs + 1) * sizeof(*newop),
+ M_VNODE, M_WAITOK);
+ /* new reference count (for unload) */
+ MALLOC(newref, int *,
+ (num_op_descs + 1) * sizeof(*newref),
+ M_VNODE, M_WAITOK);
+ if (vfs_op_descs) {
+ bcopy(vfs_op_descs, newop,
+ num_op_descs * sizeof(*newop));
+ FREE(vfs_op_descs, M_VNODE);
+ }
+ if (vfs_op_desc_refs) {
+ bcopy(vfs_op_desc_refs, newref,
+ num_op_descs * sizeof(*newref));
+ FREE(vfs_op_desc_refs, M_VNODE);
+ }
+ newop[num_op_descs] = desc;
+ newref[num_op_descs] = 1;
+ vfs_op_descs = newop;
+ vfs_op_desc_refs = newref;
+ num_op_descs++;
+ }
+ }
+ vfs_opv_recalc();
+}
+
+/* Remove a vnode type from the vnode description table above. */
+void
+vfs_rm_vnodeops(const void *data)
+{
+ const struct vnodeopv_desc *opv;
+ const struct vnodeopv_desc **newopv;
+ struct vnodeop_desc **newop;
+ int *newref;
+ vop_t **opv_desc_vector;
+ struct vnodeop_desc *desc;
+ int i, j, k;
+
+ opv = (const struct vnodeopv_desc *)data;
+ /* Lower ref counts on descs in the table and release if zero */
+ for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) {
+ for (j = 0; j < num_op_descs; j++) {
+ if (desc == vfs_op_descs[j]) {
+ /* found it, decrease reference count */
+ vfs_op_desc_refs[j]--;
+ break;
+ }
+ }
+ for (j = 0; j < num_op_descs; j++) {
+ if (vfs_op_desc_refs[j] > 0)
+ continue;
+ if (vfs_op_desc_refs[j] < 0)
+ panic("vfs_remove_vnodeops: negative refcnt");
+ /* Entry is going away - replace it with defaultop */
+ for (k = 0; k < vnodeopv_num; k++) {
+ opv_desc_vector =
+ *(vnodeopv_descs[k]->opv_desc_vector_p);
+ if (opv_desc_vector != NULL)
+ opv_desc_vector[desc->vdesc_offset] =
+ opv_desc_vector[1];
+ }
+ MALLOC(newop, struct vnodeop_desc **,
+ (num_op_descs - 1) * sizeof(*newop),
+ M_VNODE, M_WAITOK);
+ /* new reference count (for unload) */
+ MALLOC(newref, int *,
+ (num_op_descs - 1) * sizeof(*newref),
+ M_VNODE, M_WAITOK);
+ for (k = j; k < (num_op_descs - 1); k++) {
+ vfs_op_descs[k] = vfs_op_descs[k + 1];
+ vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1];
+ }
+ bcopy(vfs_op_descs, newop,
+ (num_op_descs - 1) * sizeof(*newop));
+ bcopy(vfs_op_desc_refs, newref,
+ (num_op_descs - 1) * sizeof(*newref));
+ FREE(vfs_op_descs, M_VNODE);
+ FREE(vfs_op_desc_refs, M_VNODE);
+ vfs_op_descs = newop;
+ vfs_op_desc_refs = newref;
+ num_op_descs--;
+ }
+ }
+
+ for (i = 0; i < vnodeopv_num; i++) {
+ if (vnodeopv_descs[i] == opv) {
+ for (j = i; j < (vnodeopv_num - 1); j++)
+ vnodeopv_descs[j] = vnodeopv_descs[j + 1];
+ break;
+ }
+ }
+ if (i == vnodeopv_num)
+ panic("vfs_remove_vnodeops: opv not found");
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ if (opv_desc_vector != NULL)
+ FREE(opv_desc_vector, M_VNODE);
+ MALLOC(newopv, const struct vnodeopv_desc **,
+ (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK);
+ bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv));
+ FREE(vnodeopv_descs, M_VNODE);
+ vnodeopv_descs = newopv;
+ vnodeopv_num--;
+
+ vfs_opv_recalc();
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+struct vattr va_null;
+
+/*
+ * Initialize the vnode structures and initialize each filesystem type.
+ */
+/* ARGSUSED*/
+static void
+vfsinit(void *dummy)
+{
+
+ vattr_null(&va_null);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL)
+
+/* Register a new filesystem type in the global table */
+int
+vfs_register(struct vfsconf *vfc)
+{
+ struct sysctl_oid *oidp;
+ struct vfsconf *vfsp;
+
+ vfsp = NULL;
+ if (vfsconf)
+ for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next)
+ if (strcmp(vfc->vfc_name, vfsp->vfc_name) == 0)
+ return EEXIST;
+
+ vfc->vfc_typenum = maxvfsconf++;
+ if (vfsp)
+ vfsp->vfc_next = vfc;
+ else
+ vfsconf = vfc;
+ vfc->vfc_next = NULL;
+
+ /*
+ * If this filesystem has a sysctl node under vfs
+ * (i.e. vfs.xxfs), then change the oid number of that node to
+ * match the filesystem's type number. This allows user code
+ * which uses the type number to read sysctl variables defined
+ * by the filesystem to continue working. Since the oids are
+ * in a sorted list, we need to make sure the order is
+ * preserved by re-registering the oid after modifying its
+ * number.
+ */
+ SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
+ if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+ sysctl_unregister_oid(oidp);
+ oidp->oid_number = vfc->vfc_typenum;
+ sysctl_register_oid(oidp);
+ }
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+
+ return 0;
+}
+
+
+/* Remove registration of a filesystem type */
+int
+vfs_unregister(struct vfsconf *vfc)
+{
+ struct vfsconf *vfsp, *prev_vfsp;
+ int error, i, maxtypenum;
+
+ i = vfc->vfc_typenum;
+
+ prev_vfsp = NULL;
+ for (vfsp = vfsconf; vfsp;
+ prev_vfsp = vfsp, vfsp = vfsp->vfc_next) {
+ if (!strcmp(vfc->vfc_name, vfsp->vfc_name))
+ break;
+ }
+ if (vfsp == NULL)
+ return EINVAL;
+ if (vfsp->vfc_refcount)
+ return EBUSY;
+ if (vfc->vfc_vfsops->vfs_uninit != NULL) {
+ error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
+ if (error)
+ return (error);
+ }
+ if (prev_vfsp)
+ prev_vfsp->vfc_next = vfsp->vfc_next;
+ else
+ vfsconf = vfsp->vfc_next;
+ maxtypenum = VFS_GENERIC;
+ for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+ return 0;
+}
+
+/*
+ * Standard kernel module handling code for filesystem modules.
+ * Referenced from VFS_SET().
+ */
+int
+vfs_modevent(module_t mod, int type, void *data)
+{
+ struct vfsconf *vfc;
+ int error = 0;
+
+ vfc = (struct vfsconf *)data;
+
+ switch (type) {
+ case MOD_LOAD:
+ if (vfc)
+ error = vfs_register(vfc);
+ break;
+
+ case MOD_UNLOAD:
+ if (vfc)
+ error = vfs_unregister(vfc);
+ break;
+ default: /* including MOD_SHUTDOWN */
+ break;
+ }
+ return (error);
+}
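+
+/*
+ * Illustrative sketch (not part of this change): how a filesystem module
+ * typically reaches vfs_modevent().  VFS_SET() (from <sys/mount.h>) wraps
+ * the filesystem's vfsops in a vfsconf and a moduledata whose event
+ * handler is vfs_modevent, so loading and unloading the module call
+ * vfs_register() and vfs_unregister() above.  "myfs" and its vfsops are
+ * hypothetical, and the vfsops field order is elided.
+ *
+ *	static struct vfsops myfs_vfsops = {
+ *		myfs_mount,
+ *		...
+ *	};
+ *
+ *	VFS_SET(myfs_vfsops, myfs, 0);
+ */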
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
new file mode 100644
index 0000000..8e4af42
--- /dev/null
+++ b/sys/kern/vfs_lookup.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/uma.h>
+
+/*
+ * Allocation zone for namei
+ */
+uma_zone_t namei_zone;
+
+static void
+nameiinit(void *dummy __unused)
+{
+ namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
+
+/*
+ * Convert a pathname into a pointer to a locked inode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(ndp)
+ register struct nameidata *ndp;
+{
+ register struct filedesc *fdp; /* pointer to file descriptor state */
+ register char *cp; /* pointer into pathname argument */
+ register struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct uio auio;
+ int error, linklen;
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct thread *td = cnp->cn_thread;
+ struct proc *p = td->td_proc;
+
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ fdp = p->p_fd;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (!error && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ ndp->ni_vp = NULL;
+ return (error);
+ }
+ ndp->ni_loopcnt = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_NAMEI)) {
+ KASSERT(cnp->cn_thread == curthread,
+ ("namei not using curthread"));
+ ktrnamei(cnp->cn_pnbuf);
+ }
+#endif
+
+ /*
+ * Get starting point for the translation.
+ */
+ FILEDESC_LOCK(fdp);
+ ndp->ni_rootdir = fdp->fd_rdir;
+ ndp->ni_topdir = fdp->fd_jdir;
+
+ dp = fdp->fd_cdir;
+ VREF(dp);
+ FILEDESC_UNLOCK(fdp);
+ for (;;) {
+ /*
+ * Check if root directory should replace current directory.
+ * Done at start of translation and after symbolic link.
+ */
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (*(cnp->cn_nameptr) == '/') {
+ vrele(dp);
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ dp = ndp->ni_rootdir;
+ VREF(dp);
+ }
+ ndp->ni_startdir = dp;
+ error = lookup(ndp);
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ return (error);
+ }
+ /*
+ * Check for symbolic link
+ */
+ if ((cnp->cn_flags & ISSYMLINK) == 0) {
+ if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ else
+ cnp->cn_flags |= HASBUF;
+
+ if (vn_canvmio(ndp->ni_vp) == TRUE &&
+ (cnp->cn_nameiop != DELETE) &&
+ ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) ==
+ LOCKLEAF))
+ vfs_object_create(ndp->ni_vp, td,
+ ndp->ni_cnd.cn_cred);
+
+ return (0);
+ }
+ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
+ VOP_UNLOCK(ndp->ni_dvp, 0, td);
+ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+ error = ELOOP;
+ break;
+ }
+ if (ndp->ni_pathlen > 1)
+ cp = uma_zalloc(namei_zone, M_WAITOK);
+ else
+ cp = cnp->cn_pnbuf;
+ aiov.iov_base = cp;
+ aiov.iov_len = MAXPATHLEN;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = (struct thread *)0;
+ auio.uio_resid = MAXPATHLEN;
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ break;
+ }
+ linklen = MAXPATHLEN - auio.uio_resid;
+ if (linklen == 0) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENOENT;
+ break;
+ }
+ if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENAMETOOLONG;
+ break;
+ }
+ if (ndp->ni_pathlen > 1) {
+ bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ cnp->cn_pnbuf = cp;
+ } else
+ cnp->cn_pnbuf[linklen] = '\0';
+ ndp->ni_pathlen += linklen;
+ vput(ndp->ni_vp);
+ dp = ndp->ni_dvp;
+ }
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ vrele(ndp->ni_dvp);
+ vput(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
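+
+/*
+ * Illustrative sketch (not part of this change): the canonical caller
+ * pattern for namei(), matching the outline above.  With LOCKLEAF the
+ * resulting vnode comes back locked and referenced, so it is released
+ * with vput(); "upath" is a hypothetical userland path pointer.
+ *
+ *	struct nameidata nd;
+ *	int error;
+ *
+ *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, upath, td);
+ *	if ((error = namei(&nd)) != 0)
+ *		return (error);
+ *	NDFREE(&nd, NDF_ONLY_PNBUF);
+ *	... operate on the locked vnode nd.ni_vp ...
+ *	vput(nd.ni_vp);
+ */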
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
+ * The starting directory is taken from ni_startdir. The pathname is
+ * descended until done, or a symbolic link is encountered. The variable
+ * ni_more is clear if the path is completed; it is set to one if a
+ * symbolic link needing interpretation is encountered.
+ *
+ * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
+ * whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If flag has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into the flag
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".". When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ * identify next component of name at ndp->ni_ptr
+ * handle degenerate case where name is null string
+ * if .. and crossing mount points and on mounted filesys, find parent
+ * call VOP_LOOKUP routine for next component name
+ * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ * component vnode returned in ni_vp (if it exists), locked.
+ * if result vnode is mounted on and crossing mount points,
+ * find mounted on vnode
+ * if more components of name, do next level at dirloop
+ * return the answer in ni_vp, locked if LOCKLEAF set
+ * if LOCKPARENT set, return locked parent in ni_dvp
+ * if WANTPARENT set, return unlocked parent in ni_dvp
+ */
+int
+lookup(ndp)
+ register struct nameidata *ndp;
+{
+ register char *cp; /* pointer into pathname argument */
+ register struct vnode *dp = 0; /* the directory we are searching */
+ struct vnode *tdp; /* saved dp */
+ struct mount *mp; /* mount table entry */
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int trailing_slash;
+ int error = 0;
+ int dpunlocked = 0; /* dp has already been unlocked */
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct thread *td = cnp->cn_thread;
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ ndp->ni_dvp = NULL;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = ndp->ni_startdir;
+ ndp->ni_startdir = NULLVP;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+
+dirloop:
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ cnp->cn_consume = 0;
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ { char c = *cp;
+ *cp = '\0';
+ printf("{%s}: ", cnp->cn_nameptr);
+ *cp = c; }
+#endif
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ trailing_slash = 0;
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ trailing_slash = 1;
+ *ndp->ni_next = '\0'; /* XXX for direnter() ... */
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+ if (*cp == '\0' && docache == 0)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (wantparent) {
+ ndp->ni_dvp = dp;
+ VREF(dp);
+ }
+ ndp->ni_vp = dp;
+ if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+ VOP_UNLOCK(dp, 0, td);
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ /*
+ * Handle "..": three special cases.
+ * 1. If at root directory (e.g. after chroot)
+ * or at absolute root directory
+ * then ignore it so can't get out.
+ * 2. If this vnode is the root of a mounted
+ * filesystem, then replace it with the
+ * vnode which was mounted on so we take the
+ * .. in the other filesystem.
+ * 3. If the vnode is the top directory of
+ * the jail or chroot, don't let them out.
+ */
+ if (cnp->cn_flags & ISDOTDOT) {
+ for (;;) {
+ if (dp == ndp->ni_rootdir ||
+ dp == ndp->ni_topdir ||
+ dp == rootvnode) {
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = dp;
+ VREF(dp);
+ goto nextname;
+ }
+ if ((dp->v_flag & VROOT) == 0 ||
+ (cnp->cn_flags & NOCROSSMOUNT))
+ break;
+ if (dp->v_mount == NULL) { /* forced unmount */
+ error = EBADF;
+ goto bad;
+ }
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ vput(tdp);
+ VREF(dp);
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+ }
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+unionlookup:
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = NULL;
+ cnp->cn_flags &= ~PDIRUNLOCK;
+ ASSERT_VOP_LOCKED(dp, "lookup");
+ if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
+ KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
+#ifdef NAMEI_DIAGNOSTIC
+ printf("not found\n");
+#endif
+ if ((error == ENOENT) &&
+ (dp->v_flag & VROOT) && (dp->v_mount != NULL) &&
+ (dp->v_mount->mnt_flag & MNT_UNION)) {
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ if (cnp->cn_flags & PDIRUNLOCK)
+ vrele(tdp);
+ else
+ vput(tdp);
+ VREF(dp);
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+ goto unionlookup;
+ }
+
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ if (*cp == '\0' && trailing_slash &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory inode in ndp->ni_dvp.
+ */
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ return (0);
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ printf("found\n");
+#endif
+
+ ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup");
+
+ /*
+ * Take into account any additional components consumed by
+ * the underlying filesystem.
+ */
+ if (cnp->cn_consume > 0) {
+ cnp->cn_nameptr += cnp->cn_consume;
+ ndp->ni_next += cnp->cn_consume;
+ ndp->ni_pathlen -= cnp->cn_consume;
+ cnp->cn_consume = 0;
+ }
+
+ dp = ndp->ni_vp;
+
+ /*
+ * Check to see if the vnode has been mounted on;
+ * if so find the root of the mounted filesystem.
+ */
+ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+ (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+ if (vfs_busy(mp, 0, 0, td))
+ continue;
+ VOP_UNLOCK(dp, 0, td);
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, td);
+ if (error) {
+ dpunlocked = 1;
+ goto bad2;
+ }
+ vrele(dp);
+ ndp->ni_vp = dp = tdp;
+ }
+
+ /*
+ * Check for symbolic link
+ */
+ if ((dp->v_type == VLNK) &&
+ ((cnp->cn_flags & FOLLOW) || trailing_slash ||
+ *ndp->ni_next == '/')) {
+ cnp->cn_flags |= ISSYMLINK;
+ if (dp->v_mount == NULL) {
+ /* We can't know whether the directory was mounted with
+ * NOSYMFOLLOW, so we can't follow safely. */
+ error = EBADF;
+ goto bad2;
+ }
+ if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
+ error = EACCES;
+ goto bad2;
+ }
+ return (0);
+ }
+
+ /*
+ * Check for bogus trailing slashes.
+ */
+ if (trailing_slash && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+
+nextname:
+ /*
+ * Not a symbolic link. If more pathname,
+ * continue at next component, else return.
+ */
+ if (*ndp->ni_next == '/') {
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ if (ndp->ni_dvp != ndp->ni_vp)
+ ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup");
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ if (!wantparent)
+ vrele(ndp->ni_dvp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0, td);
+ return (0);
+
+bad2:
+ if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT &&
+ *ndp->ni_next == '\0')
+ VOP_UNLOCK(ndp->ni_dvp, 0, td);
+ vrele(ndp->ni_dvp);
+bad:
+ if (dpunlocked)
+ vrele(dp);
+ else
+ vput(dp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * relookup - lookup a path name component
+ * Used by lookup to re-acquire things.
+ */
+int
+relookup(dvp, vpp, cnp)
+ struct vnode *dvp, **vpp;
+ struct componentname *cnp;
+{
+ struct thread *td = cnp->cn_thread;
+ struct vnode *dp = 0; /* the directory we are searching */
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+#ifdef NAMEI_DIAGNOSTIC
+ int newhash; /* DEBUG: check name hash */
+ char *cp; /* DEBUG: check name ptr/len */
+#endif
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = dvp;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+
+/* dirloop: */
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ if (cnp->cn_namelen != cp - cnp->cn_nameptr)
+ panic ("relookup: bad len");
+ if (*cp != 0)
+ panic("relookup: not last component");
+ printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (cnp->cn_nameiop != LOOKUP || wantparent) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (!(cnp->cn_flags & LOCKLEAF))
+ VOP_UNLOCK(dp, 0, td);
+ *vpp = dp;
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ if (cnp->cn_flags & ISDOTDOT)
+ panic ("relookup: lookup on dot-dot");
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
+ KASSERT(*vpp == NULL, ("leaf should be empty"));
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory inode in ndp->ni_dvp.
+ */
+ return (0);
+ }
+ dp = *vpp;
+
+ /*
+ * Check for symbolic link
+ */
+ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
+ ("relookup: symlink found.\n"));
+
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+
+ if (!wantparent)
+ vrele(dvp);
+
+ if (vn_canvmio(dp) == TRUE &&
+ ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF))
+ vfs_object_create(dp, td, cnp->cn_cred);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0, td);
+ return (0);
+
+bad2:
+ if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
+ VOP_UNLOCK(dvp, 0, td);
+ vrele(dvp);
+bad:
+ vput(dp);
+ *vpp = NULL;
+ return (error);
+}
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 0000000..20d9b90
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,396 @@
+/*-
+ * Copyright (c) 1999 Michael Smith
+ * All rights reserved.
+ * Copyright (c) 1999 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Locate and mount the root filesystem.
+ *
+ * The root filesystem is detailed in the kernel environment variable
+ * vfs.root.mountfrom, which is expected to be in the general format
+ *
+ * <vfsname>:[<path>]
+ * vfsname := the name of a VFS known to the kernel and capable
+ * of being mounted as root
+ * path := disk device name or other data used by the filesystem
+ * to locate its physical store
+ *
+ */
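+
+/*
+ * Illustrative example (not part of this change): the variable is
+ * normally derived from /etc/fstab by the loader, but it can also be
+ * set explicitly, e.g. in /boot/loader.conf:
+ *
+ *	vfs.root.mountfrom="ufs:da0s1a"
+ *
+ * The device name shown is hypothetical.
+ */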
+
+#include "opt_rootdevname.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/reboot.h>
+#include <sys/diskslice.h>
+#include <sys/disklabel.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/proc.h>
+
+#include "opt_ddb.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <paths.h>
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
+
+#define ROOTNAME "root_device"
+
+/*
+ * The vnode of the system's root (/ in the filesystem, without chroot
+ * active.)
+ */
+struct vnode *rootvnode;
+
+/*
+ * The root specifiers we will try if RB_CDROM is specified.
+ */
+static char *cdrom_rootdevnames[] = {
+ "cd9660:cd0a",
+ "cd9660:acd0a",
+ "cd9660:wcd0a",
+ NULL
+};
+
+static int vfs_mountroot_try(char *mountfrom);
+static int vfs_mountroot_ask(void);
+static void gets(char *cp);
+
+/* legacy find-root code */
+char *rootdevnames[2] = {NULL, NULL};
+static int setrootbyname(char *name);
+dev_t rootdev = NODEV;
+
+/*
+ * Find and mount the root filesystem
+ */
+void
+vfs_mountroot(void *foo __unused)
+{
+ char *cp;
+ int i, error;
+
+ /*
+ * The root filesystem information is compiled in, and we are
+ * booted with instructions to use it.
+ */
+#ifdef ROOTDEVNAME
+ if ((boothowto & RB_DFLTROOT) &&
+ !vfs_mountroot_try(ROOTDEVNAME))
+ return;
+#endif
+ /*
+ * We are booted with instructions to prompt for the root filesystem,
+ * or to use the compiled-in default when it doesn't exist.
+ */
+ if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) {
+ if (!vfs_mountroot_ask())
+ return;
+ }
+
+ /*
+ * We've been given the generic "use CDROM as root" flag. This is
+ * necessary because the same medium may be used in many different
+ * devices, so we need to search for them.
+ */
+ if (boothowto & RB_CDROM) {
+ for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
+ if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
+ return;
+ }
+ }
+
+ /*
+ * Try to use the value read by the loader from /etc/fstab, or
+ * supplied via some other means. This is the preferred
+ * mechanism.
+ */
+ if ((cp = getenv("vfs.root.mountfrom")) != NULL) {
+ error = vfs_mountroot_try(cp);
+ freeenv(cp);
+ if (!error)
+ return;
+ }
+
+ /*
+ * Try values that may have been computed by the machine-dependent
+ * legacy code.
+ */
+ if (!vfs_mountroot_try(rootdevnames[0]))
+ return;
+ if (!vfs_mountroot_try(rootdevnames[1]))
+ return;
+
+ /*
+ * If we have a compiled-in default, and haven't already tried it, try
+ * it now.
+ */
+#ifdef ROOTDEVNAME
+ if (!(boothowto & RB_DFLTROOT))
+ if (!vfs_mountroot_try(ROOTDEVNAME))
+ return;
+#endif
+
+ /*
+ * Everything so far has failed, prompt on the console if we haven't
+ * already tried that.
+ */
+ if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask())
+ return;
+ panic("Root mount failed, startup aborted.");
+}
+
+/*
+ * Mount (mountfrom) as the root filesystem.
+ */
+static int
+vfs_mountroot_try(char *mountfrom)
+{
+ struct mount *mp;
+ char *vfsname, *path;
+ int error;
+ char patt[32];
+ int s;
+
+ vfsname = NULL;
+ path = NULL;
+ mp = NULL;
+ error = EINVAL;
+
+ if (mountfrom == NULL)
+ return(error); /* don't complain */
+
+ s = splcam(); /* Overkill, but annoying without it */
+ printf("Mounting root from %s\n", mountfrom);
+ splx(s);
+
+ /* parse vfs name and path */
+ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
+ path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
+ vfsname[0] = path[0] = 0;
+ sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
+ if (sscanf(mountfrom, patt, vfsname, path) < 1)
+ goto done;
+
+ /* allocate a root mount */
+ error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME,
+ &mp);
+ if (error != 0) {
+ printf("Can't allocate root mount for filesystem '%s': %d\n",
+ vfsname, error);
+ goto done;
+ }
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /* do our best to set rootdev */
+ if ((path[0] != 0) && setrootbyname(path))
+ printf("setrootbyname failed\n");
+
+ /* If the root device is a type "memory disk", mount RW */
+ if (rootdev != NODEV && devsw(rootdev) &&
+ (devsw(rootdev)->d_flags & D_MEMDISK))
+ mp->mnt_flag &= ~MNT_RDONLY;
+
+ /*
+ * Set the mount path to be something useful, because the
+ * filesystem code is no longer responsible for initialising
+ * f_mntonname unless it wants to override the default
+ * (which is `path').
+ */
+ strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN);
+
+ error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread);
+
+done:
+ if (vfsname != NULL)
+ free(vfsname, M_MOUNT);
+ if (path != NULL)
+ free(path, M_MOUNT);
+ if (error != 0) {
+ if (mp != NULL) {
+ vfs_unbusy(mp, curthread);
+ free(mp, M_MOUNT);
+ }
+ printf("Root mount failed: %d\n", error);
+ } else {
+
+ /* register with list of mounted filesystems */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ /* sanity check system clock against root filesystem timestamp */
+ inittodr(mp->mnt_time);
+ vfs_unbusy(mp, curthread);
+ }
+ return(error);
+}
+
+/*
+ * Spin prompting on the console for a suitable root filesystem
+ */
+static int
+vfs_mountroot_ask(void)
+{
+ char name[128];
+ int i;
+ dev_t dev;
+
+ for(;;) {
+ printf("\nManual root filesystem specification:\n");
+ printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n");
+#if defined(__i386__) || defined(__ia64__)
+ printf(" eg. ufs:da0s1a\n");
+#else
+ printf(" eg. ufs:da0a\n");
+#endif
+ printf(" ? List valid disk boot devices\n");
+ printf(" <empty line> Abort manual input\n");
+ printf("\nmountroot> ");
+ gets(name);
+ if (name[0] == 0)
+ return(1);
+ if (name[0] == '?') {
+ printf("Possibly valid devices for 'ufs' root:\n");
+ for (i = 0; i < NUMCDEVSW; i++) {
+ dev = makedev(i, 0);
+ if (devsw(dev) != NULL)
+ printf(" \"%s\"", devsw(dev)->d_name);
+ }
+ printf("\n");
+ continue;
+ }
+ if (!vfs_mountroot_try(name))
+ return(0);
+ }
+}
+
+/*
+ * Local helper function for vfs_mountroot_ask.
+ */
+static void
+gets(char *cp)
+{
+ char *lp;
+ int c;
+
+ lp = cp;
+ for (;;) {
+ printf("%c", c = cngetc() & 0177);
+ switch (c) {
+ case -1:
+ case '\n':
+ case '\r':
+ *lp++ = '\0';
+ return;
+ case '\b':
+ case '\177':
+ if (lp > cp) {
+ printf(" \b");
+ lp--;
+ }
+ continue;
+ case '#':
+ lp--;
+ if (lp < cp)
+ lp = cp;
+ continue;
+ case '@':
+ case 'u' & 037:
+ lp = cp;
+ printf("%c", '\n');
+ continue;
+ default:
+ *lp++ = c;
+ }
+ }
+}
+
+/*
+ * Convert a given name to the dev_t of the disk-like device
+ * it refers to.
+ */
+dev_t
+getdiskbyname(char *name)
+{
+ char *cp;
+ dev_t dev;
+
+ cp = name;
+ if (!bcmp(cp, "/dev/", 5))
+ cp += 5;
+
+ dev = NODEV;
+ EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev);
+ return (dev);
+}
+
+/*
+ * Set rootdev to match (name), given that we expect it to
+ * refer to a disk-like device.
+ */
+static int
+setrootbyname(char *name)
+{
+ dev_t diskdev;
+
+ diskdev = getdiskbyname(name);
+ if (diskdev != NODEV) {
+ rootdev = diskdev;
+ return (0);
+ }
+
+ return (1);
+}
+
+/* Show the dev_t for a disk specified by name */
+#ifdef DDB
+DB_SHOW_COMMAND(disk, db_getdiskbyname)
+{
+ dev_t dev;
+
+ if (modif[0] == '\0') {
+ db_error("usage: show disk/devicename");
+ return;
+ }
+ dev = getdiskbyname(modif);
+ if (dev != NODEV)
+ db_printf("dev_t = %p\n", dev);
+ else
+ db_printf("No disk device matched.\n");
+}
+#endif
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
new file mode 100644
index 0000000..0575662
--- /dev/null
+++ b/sys/kern/vfs_subr.c
@@ -0,0 +1,3275 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $FreeBSD$
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+#include "opt_ddb.h"
+#include "opt_ffs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+
+static void addalias(struct vnode *vp, dev_t nvp_rdev);
+static void insmntque(struct vnode *vp, struct mount *mp);
+static void vclean(struct vnode *vp, int flags, struct thread *td);
+static void vlruvp(struct vnode *vp);
+
+/*
+ * Number of vnodes in existence. Increased whenever getnewvnode()
+ * allocates a new vnode, never decreased.
+ */
+static unsigned long numvnodes;
+
+SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
+
+/*
+ * Conversion tables for conversion from vnode types to inode formats
+ * and back.
+ */
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[9] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT,
+};
+
+/*
+ * List of vnodes that are ready for recycling.
+ */
+static TAILQ_HEAD(freelst, vnode) vnode_free_list;
+
+/*
+ * Minimum number of free vnodes. If there are fewer free vnodes than this,
+ * getnewvnode() will return a newly allocated vnode.
+ */
+static u_long wantfreevnodes = 25;
+SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
+static u_long freevnodes;
+SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
+
+/*
+ * Various variables used for debugging the new implementation of
+ * reassignbuf().
+ * XXX these are probably of (very) limited utility now.
+ */
+static int reassignbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
+static int reassignbufloops;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
+static int reassignbufsortgood;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
+static int reassignbufsortbad;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
+/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
+static int reassignbufmethod = 1;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
+static int nameileafonly;
+SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
+
+#ifdef ENABLE_VFS_IOOPT
+/* See NOTES for a description of this setting. */
+int vfs_ioopt;
+SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
+#endif
+
+/* List of mounted filesystems. */
+struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
+
+/* For any iteration/modification of mountlist */
+struct mtx mountlist_mtx;
+
+/* For any iteration/modification of mnt_vnodelist */
+struct mtx mntvnode_mtx;
+
+/*
+ * Cache for the mount type id assigned to NFS. This is used for
+ * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
+ */
+int nfs_mount_type = -1;
+
+/* To keep more than one thread at a time from running vfs_getnewfsid */
+static struct mtx mntid_mtx;
+
+/* For any iteration/modification of vnode_free_list */
+static struct mtx vnode_free_list_mtx;
+
+/*
+ * For any iteration/modification of dev->si_hlist (linked through
+ * v_specnext)
+ */
+static struct mtx spechash_mtx;
+
+/* Publicly exported FS */
+struct nfs_public nfs_pub;
+
+/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
+static uma_zone_t vnode_zone;
+static uma_zone_t vnodepoll_zone;
+
+/* Set to 1 to print out reclaim of active vnodes */
+int prtactive;
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, mounted-on block devices
+ * are delayed only about half the time that file data is delayed.
+ * Similarly, directory updates are more critical, so they are only
+ * delayed about a third of the time that file data is delayed. Thus,
+ * there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
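+/*
+ * A worked example with hypothetical numbers (not taken from this file):
+ * with the default SYNCER_MAXDELAY of 32 the hash below ends up with
+ * syncer_mask == 31. If syncer_delayno is currently 20, a vnode queued
+ * with a delay of 15 seconds lands in slot (20 + 15) & 31 == 3, which the
+ * syncer process reaches roughly 15 one-second ticks later.
+ */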
+static int syncer_delayno;
+static long syncer_mask;
+LIST_HEAD(synclist, vnode);
+static struct synclist *syncer_workitem_pending;
+
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+static int syncdelay = 30; /* max time to delay syncing data */
+static int filedelay = 30; /* time to delay syncing files */
+SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
+static int dirdelay = 29; /* time to delay syncing directories */
+SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
+static int metadelay = 28; /* time to delay syncing metadata */
+SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
+static int rushjob; /* number of slots to run ASAP */
+static int stat_rush_requests; /* number of times I/O speeded up */
+SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
+
+/*
+ * Number of vnodes we want to exist at any one time. This is mostly used
+ * to size hash tables in vnode-related code. It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
+ &desiredvnodes, 0, "Maximum number of vnodes");
+static int minvnodes;
+SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
+ &minvnodes, 0, "Minimum number of vnodes");
+static int vnlru_nowhere;
+SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
+ "Number of times the vnlru process ran without success");
+
+#ifdef DEBUG_VFS_LOCKS
+/* Print lock violations */
+int vfs_badlock_print = 1;
+/* Panic on violation */
+int vfs_badlock_panic = 1;
+#endif
+
+void
+v_addpollinfo(struct vnode *vp)
+{
+ vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
+ mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
+}
+
+/*
+ * Initialize the vnode management data structures.
+ */
+static void
+vntblinit(void *dummy __unused)
+{
+
+ desiredvnodes = maxproc + cnt.v_page_count / 4;
+ minvnodes = desiredvnodes / 4;
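+ /*
+ * Illustrative sizing with hypothetical values: maxproc == 512 and
+ * 262144 physical pages give desiredvnodes == 512 + 65536 == 66048
+ * and minvnodes == 16512.
+ */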
+ mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
+ mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
+ mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+ mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
+ TAILQ_INIT(&vnode_free_list);
+ mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
+ vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
+
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Interlock is not released on failure.
+ */
+int
+vfs_busy(mp, flags, interlkp, td)
+ struct mount *mp;
+ int flags;
+ struct mtx *interlkp;
+ struct thread *td;
+{
+ int lkflags;
+
+ if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & LK_NOWAIT)
+ return (ENOENT);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ /*
+ * Since all busy locks are shared except the exclusive
+ * lock granted when unmounting, the only place that a
+ * wakeup needs to be done is at the release of the
+ * exclusive lock at the end of dounmount.
+ */
+ msleep(mp, interlkp, PVFS, "vfs_busy", 0);
+ return (ENOENT);
+ }
+ lkflags = LK_SHARED | LK_NOPAUSE;
+ if (interlkp)
+ lkflags |= LK_INTERLOCK;
+ if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
+ panic("vfs_busy: unexpected lock failure");
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(mp, td)
+ struct mount *mp;
+ struct thread *td;
+{
+
+ lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
+}
+
+/*
+ * Lookup a filesystem type, and if found allocate and initialize
+ * a mount structure for it.
+ *
+ * Devname is usually updated by mount(8) after booting.
+ */
+int
+vfs_rootmountalloc(fstypename, devname, mpp)
+ char *fstypename;
+ char *devname;
+ struct mount **mpp;
+{
+ struct thread *td = curthread; /* XXX */
+ struct vfsconf *vfsp;
+ struct mount *mp;
+
+ if (fstypename == NULL)
+ return (ENODEV);
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL)
+ return (ENODEV);
+ mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ TAILQ_INIT(&mp->mnt_reservedvnlist);
+ mp->mnt_vfc = vfsp;
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_flag = MNT_RDONLY;
+ mp->mnt_vnodecovered = NULLVP;
+ vfsp->vfc_refcount++;
+ mp->mnt_iosize_max = DFLTPHYS;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_stat.f_mntonname[0] = '/';
+ mp->mnt_stat.f_mntonname[1] = 0;
+ (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Find an appropriate filesystem to use for the root. If a filesystem
+ * has not been preselected, walk through the list of known filesystems
+ * trying those that have mountroot routines, and try them until one
+ * works or we have tried them all.
+ */
+#ifdef notdef /* XXX JH */
+int
+lite2_vfs_mountroot()
+{
+ struct vfsconf *vfsp;
+ extern int (*lite2_mountroot)(void);
+ int error;
+
+ if (lite2_mountroot != NULL)
+ return ((*lite2_mountroot)());
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ if (vfsp->vfc_mountroot == NULL)
+ continue;
+ if ((error = (*vfsp->vfc_mountroot)()) == 0)
+ return (0);
+ printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
+ }
+ return (ENODEV);
+}
+#endif
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid)
+ fsid_t *fsid;
+{
+ register struct mount *mp;
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ mtx_unlock(&mountlist_mtx);
+ return (mp);
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Get a new unique fsid. Try to make its val[0] unique, since this value
+ * will be used to create fake device numbers for stat(). Also try (but
+ * not so hard) to make its val[0] unique mod 2^16, since some emulators only
+ * support 16-bit device numbers. We end up with unique val[0]'s for the
+ * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
+ *
+ * Keep in mind that several mounts may be running in parallel. Starting
+ * the search one past where the previous search terminated is both a
+ * micro-optimization and a defense against returning the same fsid to
+ * different mounts.
+ */
+void
+vfs_getnewfsid(mp)
+ struct mount *mp;
+{
+ static u_int16_t mntid_base;
+ fsid_t tfsid;
+ int mtype;
+
+ mtx_lock(&mntid_mtx);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ tfsid.val[1] = mtype;
+ mtype = (mtype & 0xFF) << 24;
+ for (;;) {
+ tfsid.val[0] = makeudev(255,
+ mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
+ mntid_base++;
+ if (vfs_getvfs(&tfsid) == NULL)
+ break;
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
+ mtx_unlock(&mntid_mtx);
+}
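+/*
+ * A worked example of the encoding above, using hypothetical values and
+ * assuming the historical makeudev(x, y) == ((x << 8) | (y)) layout:
+ * vfc_typenum 0x05 and mntid_base 0x1234 give val[0] == 0x0512ff34. Only
+ * the low byte of mntid_base reaches the low 16 bits of val[0], which is
+ * why uniqueness mod 2^16 is only guaranteed for the first 2^8 calls.
+ */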
+
+/*
+ * Knob to control the precision of file timestamps:
+ *
+ * 0 = seconds only; nanoseconds zeroed.
+ * 1 = seconds and nanoseconds, accurate within 1/HZ.
+ * 2 = seconds and nanoseconds, truncated to microseconds.
+ * >=3 = seconds and nanoseconds, maximum precision.
+ */
+enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
+
+static int timestamp_precision = TSP_SEC;
+SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
+ &timestamp_precision, 0, "");
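+/*
+ * Usage note (a hedged example, not code from this file): the knob is
+ * normally adjusted at run time with sysctl(8), e.g.
+ * "sysctl vfs.timestamp_precision=3" for full nanotime() resolution;
+ * the value selects one of the TSP_* cases in vfs_timestamp() below.
+ */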
+
+/*
+ * Get a current timestamp.
+ */
+void
+vfs_timestamp(tsp)
+ struct timespec *tsp;
+{
+ struct timeval tv;
+
+ switch (timestamp_precision) {
+ case TSP_SEC:
+ tsp->tv_sec = time_second;
+ tsp->tv_nsec = 0;
+ break;
+ case TSP_HZ:
+ getnanotime(tsp);
+ break;
+ case TSP_USEC:
+ microtime(&tv);
+ TIMEVAL_TO_TIMESPEC(&tv, tsp);
+ break;
+ case TSP_NSEC:
+ default:
+ nanotime(tsp);
+ break;
+ }
+}
+
+/*
+ * Build a linked list of mount options from a struct uio.
+ */
+int
+vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
+{
+ struct vfsoptlist *opts;
+ struct vfsopt *opt;
+ unsigned int i, iovcnt;
+ int error, namelen, optlen;
+
+ iovcnt = auio->uio_iovcnt;
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ for (i = 0; i < iovcnt; i += 2) {
+ opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ namelen = auio->uio_iov[i].iov_len;
+ optlen = auio->uio_iov[i + 1].iov_len;
+ opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
+ opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
+ opt->len = optlen;
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
+ bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
+ optlen);
+ } else {
+ error = copyin(auio->uio_iov[i].iov_base, opt->name,
+ namelen);
+ if (!error)
+ error = copyin(auio->uio_iov[i + 1].iov_base,
+ opt->value, optlen);
+ if (error)
+ goto bad;
+ }
+ TAILQ_INSERT_TAIL(opts, opt, link);
+ }
+ *options = opts;
+ return (0);
+bad:
+ vfs_freeopts(opts);
+ return (error);
+}
+
+/*
+ * Get a mount option by its name.
+ *
+ * Return 0 if the option was found, ENOENT otherwise.
+ * If len is non-NULL it will be filled with the length
+ * of the option. If buf is non-NULL, it will be filled
+ * with the address of the option.
+ */
+int
+vfs_getopt(opts, name, buf, len)
+ struct vfsoptlist *opts;
+ const char *name;
+ void **buf;
+ int *len;
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ if (len != NULL)
+ *len = opt->len;
+ if (buf != NULL)
+ *buf = opt->value;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
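+/*
+ * A hedged usage sketch (hypothetical caller and variable names, not code
+ * from this file): a filesystem mount routine handed a vfsoptlist built by
+ * vfs_buildopts() could look up an expected option with
+ *
+ *	void *fspec;
+ *	int len, error;
+ *
+ *	error = vfs_getopt(opts, "from", &fspec, &len);
+ *
+ * treating ENOENT as "option not supplied". On success fspec points at the
+ * value stored in the list itself, so the caller must not free it.
+ */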
+
+/*
+ * Find and copy a mount option.
+ *
+ * The size of the buffer has to be specified
+ * in len, if it is not the same length as the
+ * mount option, EINVAL is returned.
+ * Returns ENOENT if the option is not found.
+ */
+int
+vfs_copyopt(opts, name, dest, len)
+ struct vfsoptlist *opts;
+ const char *name;
+ void *dest;
+ int len;
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ if (len != opt->len)
+ return (EINVAL);
+ bcopy(opt->value, dest, opt->len);
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(vap)
+ register struct vattr *vap;
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
+
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free <count> vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed:
+ * the buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ */
+static int
+vlrureclaim(struct mount *mp, int count)
+{
+ struct vnode *vp;
+ int done;
+ int trigger;
+ int usevnodes;
+
+ /*
+ * Calculate the trigger point, don't allow user
+ * screwups to blow us up. This prevents us from
+ * recycling vnodes with lots of resident pages. We
+ * aren't trying to free memory, we are trying to
+ * free vnodes.
+ */
+ usevnodes = desiredvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ trigger = cnt.v_page_count * 2 / usevnodes;
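+ /*
+ * Example with hypothetical numbers: 262144 physical pages and
+ * desiredvnodes == 32768 give trigger == 16, so vnodes caching 16
+ * or more resident pages are left alone by the loop below.
+ */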
+
+ done = 0;
+ mtx_lock(&mntvnode_mtx);
+ while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+
+ if (vp->v_type != VNON &&
+ vp->v_type != VBAD &&
+ VMIGHTFREE(vp) && /* critical path opt */
+ (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) &&
+ mtx_trylock(&vp->v_interlock)
+ ) {
+ mtx_unlock(&mntvnode_mtx);
+ if (VMIGHTFREE(vp)) {
+ vgonel(vp, curthread);
+ done++;
+ } else {
+ mtx_unlock(&vp->v_interlock);
+ }
+ mtx_lock(&mntvnode_mtx);
+ }
+ --count;
+ }
+ mtx_unlock(&mntvnode_mtx);
+ return done;
+}
+
+/*
+ * Attempt to recycle vnodes in a context that is always safe to block.
+ * Calling vlrureclaim() from the bowels of filesystem code has some
+ * interesting deadlock problems.
+ */
+static struct proc *vnlruproc;
+static int vnlruproc_sig;
+
+static void
+vnlru_proc(void)
+{
+ struct mount *mp, *nmp;
+ int s;
+ int done;
+ struct proc *p = vnlruproc;
+ struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
+
+ mtx_lock(&Giant);
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+ SHUTDOWN_PRI_FIRST);
+
+ s = splbio();
+ for (;;) {
+ kthread_suspend_check(p);
+ if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
+ vnlruproc_sig = 0;
+ tsleep(vnlruproc, PVFS, "vlruwt", 0);
+ continue;
+ }
+ done = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ done += vlrureclaim(mp, 10);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (done == 0) {
+#if 0
+ /* These messages are temporary debugging aids */
+ if (vnlru_nowhere < 5)
+ printf("vnlru process getting nowhere..\n");
+ else if (vnlru_nowhere == 5)
+ printf("vnlru process messages stopped.\n");
+#endif
+ vnlru_nowhere++;
+ tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
+ }
+ }
+ splx(s);
+}
+
+static struct kproc_desc vnlru_kp = {
+ "vnlru",
+ vnlru_proc,
+ &vnlruproc
+};
+SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
+
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(tag, mp, vops, vpp)
+ enum vtagtype tag;
+ struct mount *mp;
+ vop_t **vops;
+ struct vnode **vpp;
+{
+ int s;
+ struct thread *td = curthread; /* XXX */
+ struct vnode *vp = NULL;
+ struct mount *vnmp;
+ vm_object_t object;
+
+ s = splbio();
+ /*
+ * Try to reuse vnodes if we hit the max. This situation only
+ * occurs on certain large-memory (2G+) systems. We cannot
+ * attempt to directly reclaim vnodes due to nasty recursion
+ * problems.
+ */
+ if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
+ vnlruproc_sig = 1; /* avoid unnecessary wakeups */
+ wakeup(vnlruproc);
+ }
+
+ /*
+ * Attempt to reuse a vnode already on the free list, allocating
+ * a new vnode if we can't find one or if we have not yet reached a
+ * good minimum for LRU performance.
+ */
+
+ mtx_lock(&vnode_free_list_mtx);
+
+ if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
+ int count;
+
+ for (count = 0; count < freevnodes; count++) {
+ vp = TAILQ_FIRST(&vnode_free_list);
+ if (vp == NULL || vp->v_usecount)
+ panic("getnewvnode: free vnode isn't");
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+
+ /* Don't recycle if we can't get the interlock */
+ if (!mtx_trylock(&vp->v_interlock)) {
+ vp = NULL;
+ continue;
+ }
+
+ /* We should be able to immediately acquire this */
+ if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0)
+ continue;
+ /*
+ * Don't recycle if we still have cached pages.
+ */
+ if (VOP_GETVOBJECT(vp, &object) == 0 &&
+ (object->resident_page_count ||
+ object->ref_count)) {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp,
+ v_freelist);
+ VOP_UNLOCK(vp, 0, td);
+ vp = NULL;
+ continue;
+ }
+ if (LIST_FIRST(&vp->v_cache_src)) {
+ /*
+ * note: nameileafonly sysctl is temporary,
+ * for debugging only, and will eventually be
+ * removed.
+ */
+ if (nameileafonly > 0) {
+ /*
+ * Do not reuse namei-cached directory
+ * vnodes that have cached
+ * subdirectories.
+ */
+ if (cache_leaf_test(vp) < 0) {
+ VOP_UNLOCK(vp, 0, td);
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ vp = NULL;
+ continue;
+ }
+ } else if (nameileafonly < 0 ||
+ vmiodirenable == 0) {
+ /*
+ * Do not reuse namei-cached directory
+ * vnodes if nameileafonly is -1 or
+ * if VMIO backing for directories is
+ * turned off (otherwise we reuse them
+ * too quickly).
+ */
+ VOP_UNLOCK(vp, 0, td);
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ vp = NULL;
+ continue;
+ }
+ }
+ /*
+ * Skip over it if its filesystem is being suspended.
+ */
+ if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
+ break;
+ VOP_UNLOCK(vp, 0, td);
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ vp = NULL;
+ }
+ }
+ if (vp) {
+ vp->v_flag |= VDOOMED;
+ vp->v_flag &= ~VFREE;
+ freevnodes--;
+ mtx_unlock(&vnode_free_list_mtx);
+ cache_purge(vp);
+ if (vp->v_type != VBAD) {
+ VOP_UNLOCK(vp, 0, td);
+ vgone(vp);
+ } else {
+ VOP_UNLOCK(vp, 0, td);
+ }
+ vn_finished_write(vnmp);
+
+#ifdef INVARIANTS
+ {
+ int s;
+
+ if (vp->v_data)
+ panic("cleaned vnode isn't");
+ s = splbio();
+ if (vp->v_numoutput)
+ panic("Clean vnode has pending I/O's");
+ splx(s);
+ if (vp->v_writecount != 0)
+ panic("Non-zero write count");
+ }
+#endif
+ if (vp->v_pollinfo) {
+ mtx_destroy(&vp->v_pollinfo->vpi_lock);
+ uma_zfree(vnodepoll_zone, vp->v_pollinfo);
+ }
+ vp->v_pollinfo = NULL;
+ vp->v_flag = 0;
+ vp->v_lastw = 0;
+ vp->v_lasta = 0;
+ vp->v_cstart = 0;
+ vp->v_clen = 0;
+ vp->v_socket = 0;
+ } else {
+ mtx_unlock(&vnode_free_list_mtx);
+ vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
+ bzero((char *) vp, sizeof *vp);
+ mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ vp->v_dd = vp;
+ cache_purge(vp);
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ numvnodes++;
+ }
+
+ TAILQ_INIT(&vp->v_cleanblkhd);
+ TAILQ_INIT(&vp->v_dirtyblkhd);
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE);
+ insmntque(vp, mp);
+ *vpp = vp;
+ vp->v_usecount = 1;
+ vp->v_data = 0;
+
+ splx(s);
+
+#if 0
+ vnodeallocs++;
+ if (vnodeallocs % vnoderecycleperiod == 0 &&
+ freevnodes < vnoderecycleminfreevn &&
+ vnoderecyclemintotalvn < numvnodes) {
+ /* Recycle vnodes. */
+ cache_purgeleafdirs(vnoderecyclenumber);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * Move a vnode from one mount queue to another.
+ */
+static void
+insmntque(vp, mp)
+ register struct vnode *vp;
+ register struct mount *mp;
+{
+
+ mtx_lock(&mntvnode_mtx);
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
+ /*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+ if ((vp->v_mount = mp) == NULL) {
+ mtx_unlock(&mntvnode_mtx);
+ return;
+ }
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ mtx_unlock(&mntvnode_mtx);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ */
+void
+vwakeup(bp)
+ register struct buf *bp;
+{
+ register struct vnode *vp;
+
+ bp->b_flags &= ~B_WRITEINPROG;
+ if ((vp = bp->b_vp)) {
+ vp->v_numoutput--;
+ if (vp->v_numoutput < 0)
+ panic("vwakeup: neg numoutput");
+ if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
+ vp->v_flag &= ~VBWAIT;
+ wakeup(&vp->v_numoutput);
+ }
+ }
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct thread *td;
+ int slpflag, slptimeo;
+{
+ register struct buf *bp;
+ struct buf *nbp, *blist;
+ int s, error;
+ vm_object_t object;
+
+ GIANT_REQUIRED;
+
+ if (flags & V_SAVE) {
+ s = splbio();
+ while (vp->v_numoutput) {
+ vp->v_flag |= VBWAIT;
+ error = tsleep(&vp->v_numoutput,
+ slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ }
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ splx(s);
+ if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
+ return (error);
+ s = splbio();
+ if (vp->v_numoutput > 0 ||
+ !TAILQ_EMPTY(&vp->v_dirtyblkhd))
+ panic("vinvalbuf: dirty bufs");
+ }
+ splx(s);
+ }
+ s = splbio();
+ for (;;) {
+ blist = TAILQ_FIRST(&vp->v_cleanblkhd);
+ if (!blist)
+ blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
+ if (!blist)
+ break;
+
+ for (bp = blist; bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ error = BUF_TIMELOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL,
+ "vinvalbuf", slpflag, slptimeo);
+ if (error == ENOLCK)
+ break;
+ splx(s);
+ return (error);
+ }
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it. Note that vfs_bio_awrite expects
+ * buffers to reside on a queue, while BUF_WRITE and
+ * brelse do not.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+
+ if (bp->b_vp == vp) {
+ if (bp->b_flags & B_CLUSTEROK) {
+ BUF_UNLOCK(bp);
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ BUF_WRITE(bp);
+ }
+ } else {
+ bremfree(bp);
+ (void) BUF_WRITE(bp);
+ }
+ break;
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ }
+ }
+
+ /*
+ * Wait for I/O to complete. XXX needs cleaning up. The vnode can
+ * have write I/O in-progress but if there is a VM object then the
+ * VM object can also have read-I/O in-progress.
+ */
+ do {
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+ }
+ if (VOP_GETVOBJECT(vp, &object) == 0) {
+ while (object->paging_in_progress)
+ vm_object_pip_sleep(object, "vnvlbx");
+ }
+ } while (vp->v_numoutput > 0);
+
+ splx(s);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ mtx_lock(&vp->v_interlock);
+ if (VOP_GETVOBJECT(vp, &object) == 0) {
+ vm_object_page_remove(object, 0, 0,
+ (flags & V_SAVE) ? TRUE : FALSE);
+ }
+ mtx_unlock(&vp->v_interlock);
+
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
+ panic("vinvalbuf: flush failed");
+ return (0);
+}
+
+/*
+ * Truncate a file's buffer and pages to a specified length. This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(vp, cred, td, length, blksize)
+ register struct vnode *vp;
+ struct ucred *cred;
+ struct thread *td;
+ off_t length;
+ int blksize;
+{
+ register struct buf *bp;
+ struct buf *nbp;
+ int s, anyfreed;
+ int trunclbn;
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ trunclbn = (length + blksize - 1) / blksize;
+
+ s = splbio();
+restart:
+ anyfreed = 1;
+ for (;anyfreed;) {
+ anyfreed = 0;
+ for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp &&
+ (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI))) {
+ goto restart;
+ }
+ }
+ }
+
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp &&
+ (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0)) {
+ goto restart;
+ }
+ }
+ }
+ }
+
+ if (length > 0) {
+restartsync:
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+ BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
+ goto restart;
+ } else {
+ bremfree(bp);
+ if (bp->b_vp == vp) {
+ bp->b_flags |= B_ASYNC;
+ } else {
+ bp->b_flags &= ~B_ASYNC;
+ }
+ BUF_WRITE(bp);
+ }
+ goto restartsync;
+ }
+
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
+ }
+
+ splx(s);
+
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+ int s;
+
+ KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ bp->b_dev = vn_todev(vp);
+ /*
+ * Insert onto list for new vnode.
+ */
+ s = splbio();
+ bp->b_xflags |= BX_VNCLEAN;
+ bp->b_xflags &= ~BX_VNDIRTY;
+ TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+ splx(s);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+ struct buflists *listheadp;
+ int s;
+
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp;
+ s = splbio();
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
+ if (bp->b_xflags & BX_VNDIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+ }
+ if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ vp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(vp, v_synclist);
+ }
+ splx(s);
+ bp->b_vp = (struct vnode *) 0;
+ vdrop(vp);
+}
+
+/*
+ * Add an item to the syncer work queue.
+ */
+static void
+vn_syncer_add_to_worklist(struct vnode *vp, int delay)
+{
+ int s, slot;
+
+ s = splbio();
+
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+ vp->v_flag |= VONWORKLST;
+ splx(s);
+}
+
+struct proc *updateproc;
+static void sched_sync(void);
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+ struct synclist *slp;
+ struct vnode *vp;
+ struct mount *mp;
+ long starttime;
+ int s;
+ struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */
+
+ mtx_lock(&Giant);
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
+ SHUTDOWN_PRI_LAST);
+
+ for (;;) {
+ kthread_suspend_check(td->td_proc);
+
+ starttime = time_second;
+
+ /*
+ * Push files whose dirty time has expired. Be careful
+ * of interrupt race on slp queue.
+ */
+ s = splbio();
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ splx(s);
+
+ while ((vp = LIST_FIRST(slp)) != NULL) {
+ if (VOP_ISLOCKED(vp, NULL) == 0 &&
+ vn_start_write(vp, &mp, V_NOWAIT) == 0) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ }
+ s = splbio();
+ if (LIST_FIRST(slp) == vp) {
+ /*
+ * Note: v_tag VT_VFS vps can remain on the
+ * worklist too with no dirty blocks, but
+ * since sync_fsync() moves it to a different
+ * slot we are safe.
+ */
+ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+ !vn_isdisk(vp, NULL))
+ panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
+ /*
+ * Put us back on the worklist. The worklist
+ * routine will remove us from our current
+ * position and then add us back in at a later
+ * position.
+ */
+ vn_syncer_add_to_worklist(vp, syncdelay);
+ }
+ splx(s);
+ }
+
+ /*
+ * Do soft update processing.
+ */
+#ifdef SOFTUPDATES
+ softdep_process_worklist(NULL);
+#endif
+
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
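+ /*
+ * For example, if rushjob happens to be 2 at this point, the check
+ * below skips the one-second tsleep() twice, draining two extra
+ * worklist slots back to back before resuming the normal pace.
+ */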
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (time_second == starttime)
+ tsleep(&lbolt, PPAUSE, "syncer", 0);
+ }
+}
+
+/*
+ * Request the syncer daemon to speed up its work.
+ * We never push it to speed up more than half of its
+ * normal turn time, otherwise it could take over the cpu.
+ * XXXKSE only one update?
+ */
+int
+speedup_syncer()
+{
+
+ mtx_lock_spin(&sched_lock);
+ if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */
+ setrunnable(FIRST_THREAD_IN_PROC(updateproc));
+ mtx_unlock_spin(&sched_lock);
+ if (rushjob < syncdelay / 2) {
+ rushjob += 1;
+ stat_rush_requests += 1;
+ return (1);
+ }
+ return(0);
+}
+
+/*
+ * Associate a p-buffer with a vnode.
+ *
+ * Also sets B_PAGING flag to indicate that vnode is not fully associated
+ * with the buffer, i.e. the bp has not been linked into the vnode or
+ * ref-counted.
+ */
+void
+pbgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
+
+ bp->b_vp = vp;
+ bp->b_flags |= B_PAGING;
+ bp->b_dev = vn_todev(vp);
+}
+
+/*
+ * Disassociate a p-buffer from a vnode.
+ */
+void
+pbrelvp(bp)
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+
+ /* XXX REMOVE ME */
+ if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
+ panic(
+ "relpbuf(): b_vp was probably reassignbuf()d %p %x",
+ bp,
+ (int)bp->b_flags
+ );
+ }
+ bp->b_vp = (struct vnode *) 0;
+ bp->b_flags &= ~B_PAGING;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(bp, newvp)
+ register struct buf *bp;
+ register struct vnode *newvp;
+{
+ struct buflists *listheadp;
+ int delay;
+ int s;
+
+ if (newvp == NULL) {
+ printf("reassignbuf: NULL");
+ return;
+ }
+ ++reassignbufcalls;
+
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+
+ s = splbio();
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
+ if (bp->b_xflags & BX_VNDIRTY)
+ listheadp = &bp->b_vp->v_dirtyblkhd;
+ else
+ listheadp = &bp->b_vp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+ if (bp->b_vp != newvp) {
+ vdrop(bp->b_vp);
+ bp->b_vp = NULL; /* for clarification */
+ }
+ }
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ struct buf *tbp;
+
+ listheadp = &newvp->v_dirtyblkhd;
+ if ((newvp->v_flag & VONWORKLST) == 0) {
+ switch (newvp->v_type) {
+ case VDIR:
+ delay = dirdelay;
+ break;
+ case VCHR:
+ if (newvp->v_rdev->si_mountpoint != NULL) {
+ delay = metadelay;
+ break;
+ }
+ /* fall through */
+ default:
+ delay = filedelay;
+ }
+ vn_syncer_add_to_worklist(newvp, delay);
+ }
+ bp->b_xflags |= BX_VNDIRTY;
+ tbp = TAILQ_FIRST(listheadp);
+ if (tbp == NULL ||
+ bp->b_lblkno == 0 ||
+ (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
+ (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
+ TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
+ ++reassignbufsortgood;
+ } else if (bp->b_lblkno < 0) {
+ TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
+ ++reassignbufsortgood;
+ } else if (reassignbufmethod == 1) {
+ /*
+ * New sorting algorithm, only handle sequential case,
+ * otherwise append to end (but before metadata)
+ */
+ if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
+ (tbp->b_xflags & BX_VNDIRTY)) {
+ /*
+ * Found the best place to insert the buffer
+ */
+ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
+ ++reassignbufsortgood;
+ } else {
+ /*
+ * Missed, append to end, but before meta-data.
+ * We know that the head buffer in the list is
+ * not meta-data due to prior conditionals.
+ *
+ * Indirect effects: NFS second stage write
+ * tends to wind up here, giving maximum
+ * distance between the unstable write and the
+ * commit rpc.
+ */
+ tbp = TAILQ_LAST(listheadp, buflists);
+ while (tbp && tbp->b_lblkno < 0)
+ tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
+ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
+ ++reassignbufsortbad;
+ }
+ } else {
+ /*
+ * Old sorting algorithm, scan queue and insert
+ */
+ struct buf *ttbp;
+ while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
+ (ttbp->b_lblkno < bp->b_lblkno)) {
+ ++reassignbufloops;
+ tbp = ttbp;
+ }
+ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
+ }
+ } else {
+ bp->b_xflags |= BX_VNCLEAN;
+ TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+ if ((newvp->v_flag & VONWORKLST) &&
+ TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
+ newvp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(newvp, v_synclist);
+ }
+ }
+ if (bp->b_vp != newvp) {
+ bp->b_vp = newvp;
+ vhold(bp->b_vp);
+ }
+ splx(s);
+}
+
+/*
+ * Create a vnode for a device.
+ * Used for mounting the root filesystem.
+ */
+int
+bdevvp(dev, vpp)
+ dev_t dev;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ struct vnode *nvp;
+ int error;
+
+ if (dev == NODEV) {
+ *vpp = NULLVP;
+ return (ENXIO);
+ }
+ if (vfinddev(dev, VCHR, vpp))
+ return (0);
+ error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+ if (error) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ vp = nvp;
+ vp->v_type = VCHR;
+ addalias(vp, dev);
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Add vnode to the alias list hung off the dev_t.
+ *
+ * The reason for this gunk is that multiple vnodes can reference
+ * the same physical device, so checking vp->v_usecount to see
+ * how many users there are is inadequate; the v_usecounts of all
+ * the aliased vnodes need to be accumulated. vcount() does that.
+ */
+struct vnode *
+addaliasu(nvp, nvp_rdev)
+ struct vnode *nvp;
+ udev_t nvp_rdev;
+{
+ struct vnode *ovp;
+ vop_t **ops;
+ dev_t dev;
+
+ if (nvp->v_type == VBLK)
+ return (nvp);
+ if (nvp->v_type != VCHR)
+ panic("addaliasu on non-special vnode");
+ dev = udev2dev(nvp_rdev, 0);
+ /*
+ * Check to see if we have a bdevvp vnode with no associated
+ * filesystem. If so, we want to associate the filesystem of
+ * the newly instigated vnode with the bdevvp vnode and
+ * discard the newly created vnode rather than leaving the
+ * bdevvp vnode lying around with no associated filesystem.
+ */
+ if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
+ addalias(nvp, dev);
+ return (nvp);
+ }
+ /*
+ * Discard unneeded vnode, but save its node specific data.
+ * Note that if there is a lock, it is carried over in the
+ * node specific data to the replacement vnode.
+ */
+ vref(ovp);
+ ovp->v_data = nvp->v_data;
+ ovp->v_tag = nvp->v_tag;
+ nvp->v_data = NULL;
+ lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
+ nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
+ if (nvp->v_vnlock)
+ ovp->v_vnlock = &ovp->v_lock;
+ ops = ovp->v_op;
+ ovp->v_op = nvp->v_op;
+ if (VOP_ISLOCKED(nvp, curthread)) {
+ VOP_UNLOCK(nvp, 0, curthread);
+ vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ }
+ nvp->v_op = ops;
+ insmntque(ovp, nvp->v_mount);
+ vrele(nvp);
+ vgone(nvp);
+ return (ovp);
+}
+
+/*
+ * This is a local helper function that does the same as addaliasu(), but
+ * takes a dev_t instead of a udev_t.
+ */
+static void
+addalias(nvp, dev)
+ struct vnode *nvp;
+ dev_t dev;
+{
+
+ KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
+ nvp->v_rdev = dev;
+ mtx_lock(&spechash_mtx);
+ SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
+ mtx_unlock(&spechash_mtx);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new filesystem type).
+ */
+int
+vget(vp, flags, td)
+ register struct vnode *vp;
+ int flags;
+ struct thread *td;
+{
+ int error;
+
+ /*
+ * If the vnode is in the process of being cleaned out for
+ * another use, we wait for the cleaning to finish and then
+ * return failure. Cleaning is determined by checking that
+ * the VXLOCK flag is set.
+ */
+ if ((flags & LK_INTERLOCK) == 0)
+ mtx_lock(&vp->v_interlock);
+ if (vp->v_flag & VXLOCK) {
+ if (vp->v_vxproc == curthread) {
+#if 0
+ /* this can now occur in normal operation */
+ log(LOG_INFO, "VXLOCK interlock avoided\n");
+#endif
+ } else {
+ vp->v_flag |= VXWANT;
+ msleep(vp, &vp->v_interlock, PINOD | PDROP, "vget", 0);
+ return (ENOENT);
+ }
+ }
+
+ vp->v_usecount++;
+
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ if (flags & LK_TYPE_MASK) {
+ if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
+ /*
+ * must expand vrele here because we do not want
+ * to call VOP_INACTIVE if the reference count
+ * drops back to zero since it was never really
+ * active. We must remove it from the free list
+ * before sleeping so that multiple processes do
+ * not try to recycle it.
+ */
+ mtx_lock(&vp->v_interlock);
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ else
+ vlruvp(vp);
+ mtx_unlock(&vp->v_interlock);
+ }
+ return (error);
+ }
+ mtx_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Increase the reference count of a vnode.
+ */
+void
+vref(struct vnode *vp)
+{
+ mtx_lock(&vp->v_interlock);
+ vp->v_usecount++;
+ mtx_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(vp)
+ struct vnode *vp;
+{
+ struct thread *td = curthread; /* XXX */
+
+ KASSERT(vp != NULL, ("vrele: null vp"));
+
+ mtx_lock(&vp->v_interlock);
+
+ /* Skip this v_writecount check if we're going to panic below. */
+ KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
+ ("vrele: missed vn_close"));
+
+ if (vp->v_usecount > 1) {
+
+ vp->v_usecount--;
+ mtx_unlock(&vp->v_interlock);
+
+ return;
+ }
+
+ if (vp->v_usecount == 1) {
+ vp->v_usecount--;
+ /*
+ * We must call VOP_INACTIVE with the node locked.
+ * If we are doing a vput, the node is already locked,
+ * but, in the case of vrele, we must explicitly lock
+ * the vnode before calling VOP_INACTIVE.
+ */
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
+ VOP_INACTIVE(vp, td);
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ else
+ vlruvp(vp);
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vrele: negative ref count", vp);
+ mtx_unlock(&vp->v_interlock);
+#endif
+ panic("vrele: negative ref cnt");
+ }
+}
+
+/*
+ * Release an already locked vnode. This gives the same effect as
+ * unlock+vrele(), but takes less time and avoids releasing and
+ * re-acquiring the lock (as vrele() acquires the lock internally).
+ */
+void
+vput(vp)
+ struct vnode *vp;
+{
+ struct thread *td = curthread; /* XXX */
+
+ GIANT_REQUIRED;
+
+ KASSERT(vp != NULL, ("vput: null vp"));
+ mtx_lock(&vp->v_interlock);
+ /* Skip this v_writecount check if we're going to panic below. */
+ KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
+ ("vput: missed vn_close"));
+
+ if (vp->v_usecount > 1) {
+ vp->v_usecount--;
+ VOP_UNLOCK(vp, LK_INTERLOCK, td);
+ return;
+ }
+
+ if (vp->v_usecount == 1) {
+ vp->v_usecount--;
+ /*
+ * We must call VOP_INACTIVE with the node locked.
+ * If we are doing a vput, the node is already locked,
+ * so we just need to release the vnode mutex.
+ */
+ mtx_unlock(&vp->v_interlock);
+ VOP_INACTIVE(vp, td);
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ else
+ vlruvp(vp);
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vput: negative ref count", vp);
+#endif
+ panic("vput: negative ref cnt");
+ }
+}
+
+/*
+ * Somebody doesn't want the vnode recycled.
+ */
+void
+vhold(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ vp->v_holdcnt++;
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ splx(s);
+}
+
+/*
+ * Note that there is one less who cares about this vnode. vdrop() is the
+ * opposite of vhold().
+ */
+void
+vdrop(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ if (vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt");
+ vp->v_holdcnt--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ else
+ vlruvp(vp);
+ splx(s);
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If FORCECLOSE is not specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If FORCECLOSE is specified, detach any active vnodes
+ * that are found.
+ *
+ * If WRITECLOSE is set, only flush out regular file vnodes open for
+ * writing.
+ *
+ * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
+ *
+ * `rootrefs' specifies the base reference count for the root vnode
+ * of this filesystem. The root vnode is considered busy if its
+ * v_usecount exceeds this value. On a successful return, vflush()
+ * will call vrele() on the root vnode exactly rootrefs times.
+ * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
+ * be zero.
+ */
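+/*
+ * For example (a sketch of a typical caller, not a reference to specific
+ * code): an unmount path that still holds the single reference it got
+ * from VFS_ROOT() would pass rootrefs == 1; the root vnode then does not
+ * count as busy at v_usecount == 1, and on success vflush() itself drops
+ * that last reference with one vrele().
+ */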
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
+#endif
+
+int
+vflush(mp, rootrefs, flags)
+ struct mount *mp;
+ int rootrefs;
+ int flags;
+{
+ struct thread *td = curthread; /* XXX */
+ struct vnode *vp, *nvp, *rootvp = NULL;
+ struct vattr vattr;
+ int busy = 0, error;
+
+ if (rootrefs > 0) {
+ KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
+ ("vflush: bad args"));
+ /*
+ * Get the filesystem root vnode. We can vput() it
+ * immediately, since with rootrefs > 0, it won't go away.
+ */
+ if ((error = VFS_ROOT(mp, &rootvp)) != 0)
+ return (error);
+ vput(rootvp);
+ }
+ mtx_lock(&mntvnode_mtx);
+loop:
+ for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+ * Start over if it has (it won't be on the list anymore).
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ mtx_unlock(&mntvnode_mtx);
+ mtx_lock(&vp->v_interlock);
+ /*
+ * Skip over vnodes marked VSYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mntvnode_mtx);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, flush out unlinked but still open
+ * files (even if open only for reading) and regular file
+ * vnodes open for writing.
+ */
+ if ((flags & WRITECLOSE) &&
+ (vp->v_type == VNON ||
+ (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 &&
+ vattr.va_nlink > 0)) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mntvnode_mtx);
+ continue;
+ }
+
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ */
+ if (vp->v_usecount == 0) {
+ vgonel(vp, td);
+ mtx_lock(&mntvnode_mtx);
+ continue;
+ }
+
+ /*
+ * If FORCECLOSE is set, forcibly close the vnode. For block
+ * or character devices, revert to an anonymous device. For
+ * all other files, just kill them.
+ */
+ if (flags & FORCECLOSE) {
+ if (vp->v_type != VCHR) {
+ vgonel(vp, td);
+ } else {
+ vclean(vp, 0, td);
+ vp->v_op = spec_vnodeop_p;
+ insmntque(vp, (struct mount *) 0);
+ }
+ mtx_lock(&mntvnode_mtx);
+ continue;
+ }
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mntvnode_mtx);
+ busy++;
+ }
+ mtx_unlock(&mntvnode_mtx);
+ if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
+ /*
+ * If just the root vnode is busy, and if its refcount
+ * is equal to `rootrefs', then go ahead and kill it.
+ */
+ mtx_lock(&rootvp->v_interlock);
+ KASSERT(busy > 0, ("vflush: not busy"));
+ KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
+ if (busy == 1 && rootvp->v_usecount == rootrefs) {
+ vgonel(rootvp, td);
+ busy = 0;
+ } else
+ mtx_unlock(&rootvp->v_interlock);
+ }
+ if (busy)
+ return (EBUSY);
+ for (; rootrefs > 0; rootrefs--)
+ vrele(rootvp);
+ return (0);
+}
+
+/*
+ * This moves a now (likely recyclable) vnode to the end of its mount
+ * point's vnode list. XXX However, it is temporarily disabled until we
+ * can clean up ffs_sync() and friends, which have loop restart
+ * conditions which this code causes to operate O(N^2).
+ */
+static void
+vlruvp(struct vnode *vp)
+{
+#if 0
+ struct mount *mp;
+
+ if ((mp = vp->v_mount) != NULL) {
+ mtx_lock(&mntvnode_mtx);
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ mtx_unlock(&mntvnode_mtx);
+ }
+#endif
+}
+
+/*
+ * Disassociate the underlying filesystem from a vnode.
+ */
+static void
+vclean(vp, flags, td)
+ struct vnode *vp;
+ int flags;
+ struct thread *td;
+{
+ int active;
+
+ /*
+ * Check to see if the vnode is in use. If so we have to reference it
+ * before we clean it out so that its count cannot fall to zero and
+ * generate a race against ourselves to recycle it.
+ */
+ if ((active = vp->v_usecount))
+ vp->v_usecount++;
+
+ /*
+ * Prevent the vnode from being recycled or brought into use while we
+ * clean it out.
+ */
+ if (vp->v_flag & VXLOCK)
+ panic("vclean: deadlock");
+ vp->v_flag |= VXLOCK;
+ vp->v_vxproc = curthread;
+ /*
+ * Even if the count is zero, the VOP_INACTIVE routine may still
+ * have the object locked while it cleans it out. The VOP_LOCK
+ * ensures that the VOP_INACTIVE routine is done with its work.
+ * For active vnodes, it ensures that no other activity can
+ * occur while the underlying object is being cleaned out.
+ */
+ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ * If the flush fails, just toss the buffers.
+ */
+ if (flags & DOCLOSE) {
+ if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
+ (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
+ if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
+ vinvalbuf(vp, 0, NOCRED, td, 0, 0);
+ }
+
+ VOP_DESTROYVOBJECT(vp);
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed. Note that the
+ * VOP_INACTIVE will unlock the vnode.
+ */
+ if (active) {
+ if (flags & DOCLOSE)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
+ VOP_INACTIVE(vp, td);
+ } else {
+ /*
+ * Any other processes trying to obtain this lock must first
+ * wait for VXLOCK to clear, then call the new lock operation.
+ */
+ VOP_UNLOCK(vp, 0, td);
+ }
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, td))
+ panic("vclean: cannot reclaim");
+
+ if (active) {
+ /*
+ * Inline copy of vrele() since VOP_INACTIVE
+ * has already been called.
+ */
+ mtx_lock(&vp->v_interlock);
+ if (--vp->v_usecount <= 0) {
+#ifdef DIAGNOSTIC
+ if (vp->v_usecount < 0 || vp->v_writecount != 0) {
+ vprint("vclean: bad ref count", vp);
+ panic("vclean: ref cnt");
+ }
+#endif
+ vfree(vp);
+ }
+ mtx_unlock(&vp->v_interlock);
+ }
+
+ cache_purge(vp);
+ vp->v_vnlock = NULL;
+ lockdestroy(&vp->v_lock);
+
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+
+ /*
+ * Done with purge, notify sleepers of the grim news.
+ */
+ vp->v_op = dead_vnodeop_p;
+ if (vp->v_pollinfo != NULL)
+ vn_pollgone(vp);
+ vp->v_tag = VT_NON;
+ vp->v_flag &= ~VXLOCK;
+ vp->v_vxproc = NULL;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup(vp);
+ }
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+int
+vop_revoke(ap)
+ struct vop_revoke_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp, *vq;
+ dev_t dev;
+
+ KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
+
+ vp = ap->a_vp;
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ msleep(vp, &vp->v_interlock, PINOD | PDROP,
+ "vop_revokeall", 0);
+ return (0);
+ }
+ dev = vp->v_rdev;
+ for (;;) {
+ mtx_lock(&spechash_mtx);
+ vq = SLIST_FIRST(&dev->si_hlist);
+ mtx_unlock(&spechash_mtx);
+ if (!vq)
+ break;
+ vgone(vq);
+ }
+ return (0);
+}
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ * Release the passed interlock if the vnode will be recycled.
+ */
+int
+vrecycle(vp, inter_lkp, td)
+ struct vnode *vp;
+ struct mtx *inter_lkp;
+ struct thread *td;
+{
+
+ mtx_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ if (inter_lkp) {
+ mtx_unlock(inter_lkp);
+ }
+ vgonel(vp, td);
+ return (1);
+ }
+ mtx_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(vp)
+ register struct vnode *vp;
+{
+ struct thread *td = curthread; /* XXX */
+
+ mtx_lock(&vp->v_interlock);
+ vgonel(vp, td);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+void
+vgonel(vp, td)
+ struct vnode *vp;
+ struct thread *td;
+{
+ int s;
+
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ msleep(vp, &vp->v_interlock, PINOD | PDROP, "vgone", 0);
+ return;
+ }
+
+ /*
+ * Clean out the filesystem specific data.
+ */
+ vclean(vp, DOCLOSE, td);
+ mtx_lock(&vp->v_interlock);
+
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ insmntque(vp, (struct mount *)0);
+ /*
+ * If special device, remove it from special device alias list
+ * if it is on one.
+ */
+ if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
+ mtx_lock(&spechash_mtx);
+ SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
+ freedev(vp->v_rdev);
+ mtx_unlock(&spechash_mtx);
+ vp->v_rdev = NULL;
+ }
+
+ /*
+ * If it is on the freelist and not already at the head,
+ * move it to the head of the list. The test of the
+ * VDOOMED flag and the reference count of zero is because
+ * it will be removed from the free list by getnewvnode,
+ * but will not have its reference count incremented until
+ * after calling vgone. If the reference count were
+ * incremented first, vgone would (incorrectly) try to
+ * close the previous instance of the underlying object.
+ */
+ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
+ s = splbio();
+ mtx_lock(&vnode_free_list_mtx);
+ if (vp->v_flag & VFREE)
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ else
+ freevnodes++;
+ vp->v_flag |= VFREE;
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ splx(s);
+ }
+
+ vp->v_type = VBAD;
+ mtx_unlock(&vp->v_interlock);
+}
+
+/*
+ * Look up a vnode by device number.
+ */
+int
+vfinddev(dev, type, vpp)
+ dev_t dev;
+ enum vtype type;
+ struct vnode **vpp;
+{
+ struct vnode *vp;
+
+ mtx_lock(&spechash_mtx);
+ SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
+ if (type == vp->v_type) {
+ *vpp = vp;
+ mtx_unlock(&spechash_mtx);
+ return (1);
+ }
+ }
+ mtx_unlock(&spechash_mtx);
+ return (0);
+}
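+
+/*
+ * Illustrative sketch (editorial note, not part of this change):
+ * vfinddev() maps a dev_t back to an active vnode of the requested
+ * type, e.g. to check whether a character device is still referenced
+ * by some vnode before tearing it down.
+ *
+ *    struct vnode *vp;
+ *
+ *    if (vfinddev(dev, VCHR, &vp))
+ *            return (EBUSY);           -- some vnode still uses dev
+ */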
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(vp)
+ struct vnode *vp;
+{
+ struct vnode *vq;
+ int count;
+
+ count = 0;
+ mtx_lock(&spechash_mtx);
+ SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
+ count += vq->v_usecount;
+ mtx_unlock(&spechash_mtx);
+ return (count);
+}
+
+/*
+ * Same as above, but using the dev_t as the argument.
+ */
+int
+count_dev(dev)
+ dev_t dev;
+{
+ struct vnode *vp;
+
+ vp = SLIST_FIRST(&dev->si_hlist);
+ if (vp == NULL)
+ return (0);
+ return(vcount(vp));
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
+
+void
+vprint(label, vp)
+ char *label;
+ struct vnode *vp;
+{
+ char buf[96];
+
+ if (label != NULL)
+ printf("%s: %p: ", label, (void *)vp);
+ else
+ printf("%p: ", (void *)vp);
+ printf("type %s, usecount %d, writecount %d, refcount %d,",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+ vp->v_holdcnt);
+ buf[0] = '\0';
+ if (vp->v_flag & VROOT)
+ strcat(buf, "|VROOT");
+ if (vp->v_flag & VTEXT)
+ strcat(buf, "|VTEXT");
+ if (vp->v_flag & VSYSTEM)
+ strcat(buf, "|VSYSTEM");
+ if (vp->v_flag & VXLOCK)
+ strcat(buf, "|VXLOCK");
+ if (vp->v_flag & VXWANT)
+ strcat(buf, "|VXWANT");
+ if (vp->v_flag & VBWAIT)
+ strcat(buf, "|VBWAIT");
+ if (vp->v_flag & VDOOMED)
+ strcat(buf, "|VDOOMED");
+ if (vp->v_flag & VFREE)
+ strcat(buf, "|VFREE");
+ if (vp->v_flag & VOBJBUF)
+ strcat(buf, "|VOBJBUF");
+ if (buf[0] != '\0')
+ printf(" flags (%s)", &buf[1]);
+ if (vp->v_data == NULL) {
+ printf("\n");
+ } else {
+ printf("\n\t");
+ VOP_PRINT(vp);
+ }
+}
+
+#ifdef DDB
+#include <ddb/ddb.h>
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
+{
+ struct thread *td = curthread; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *vp;
+
+ printf("Locked vnodes\n");
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ mtx_lock(&mntvnode_mtx);
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (VOP_ISLOCKED(vp, NULL))
+ vprint((char *)0, vp);
+ }
+ mtx_unlock(&mntvnode_mtx);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+ /* XXX the below code does not compile; vfs_sysctl does not exist. */
+#ifdef notyet
+ /* all sysctl names at this level are at least name and field */
+ if (namelen < 2)
+ return (ENOTDIR); /* overloaded */
+ if (name[0] != VFS_GENERIC) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[0])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
+ oldp, oldlenp, newp, newlen, td));
+ }
+#endif
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
+ "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return error;
+ }
+ return 0;
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+
+#if COMPILING_LINT
+#define KINFO_VNODESLOP 10
+/*
+ * Dump vnode list (via sysctl).
+ * Copyout address of vnode followed by vnode.
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct thread *td = curthread; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *nvp, *vp;
+ int error;
+
+#define VPTRSZ sizeof (struct vnode *)
+#define VNODESZ sizeof (struct vnode)
+
+ req->lock = 0;
+ if (!req->oldptr) /* Make an estimate */
+ return (SYSCTL_OUT(req, 0,
+ (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ mtx_lock(&mntvnode_mtx);
+again:
+ for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ vp != NULL;
+ vp = nvp) {
+ /*
+ * Check that the vp is still associated with
+ * this filesystem. RACE: could have been
+ * recycled onto the same filesystem.
+ */
+ if (vp->v_mount != mp)
+ goto again;
+ nvp = TAILQ_NEXT(vp, v_nmntvnodes);
+ mtx_unlock(&mntvnode_mtx);
+ if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
+ (error = SYSCTL_OUT(req, vp, VNODESZ)))
+ return (error);
+ mtx_lock(&mntvnode_mtx);
+ }
+ mtx_unlock(&mntvnode_mtx);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+
+ return (0);
+}
+
+/*
+ * XXX
+ * Exporting the vnode list on large systems causes them to crash.
+ * Exporting the vnode list on medium systems causes sysctl to coredump.
+ */
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,vnode", "");
+#endif
+
+/*
+ * Check to see if a filesystem is mounted on a block device.
+ */
+int
+vfs_mountedon(vp)
+ struct vnode *vp;
+{
+
+ if (vp->v_rdev->si_mountpoint != NULL)
+ return (EBUSY);
+ return (0);
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall()
+{
+ struct mount *mp;
+ struct thread *td;
+ int error;
+
+ if (curthread != NULL)
+ td = curthread;
+ else
+ td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ while(!TAILQ_EMPTY(&mountlist)) {
+ mp = TAILQ_LAST(&mountlist, mntlist);
+ error = dounmount(mp, MNT_FORCE, td);
+ if (error) {
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ } else {
+ /* The unmount has removed mp from the mountlist */
+ }
+ }
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *nvp;
+ struct vm_object *obj;
+ int tries;
+
+ GIANT_REQUIRED;
+
+ tries = 5;
+ mtx_lock(&mntvnode_mtx);
+loop:
+ for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
+ if (vp->v_mount != mp) {
+ if (--tries > 0)
+ goto loop;
+ break;
+ }
+ nvp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
+ continue;
+
+ if (vp->v_flag & VNOSYNC) /* unlinked, skip it */
+ continue;
+
+ if ((vp->v_flag & VOBJDIRTY) &&
+ (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
+ mtx_unlock(&mntvnode_mtx);
+ if (!vget(vp,
+ LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
+ if (VOP_GETVOBJECT(vp, &obj) == 0) {
+ vm_object_page_clean(obj, 0, 0,
+ flags == MNT_WAIT ?
+ OBJPC_SYNC : OBJPC_NOSYNC);
+ }
+ vput(vp);
+ }
+ mtx_lock(&mntvnode_mtx);
+ if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
+ if (--tries > 0)
+ goto loop;
+ break;
+ }
+ }
+ }
+ mtx_unlock(&mntvnode_mtx);
+}
+
+/*
+ * Create the VM object needed for VMIO and mmap support. This
+ * is done for all VREG files in the system. Some filesystems may
+ * also take advantage of the additional metadata buffering capability
+ * of the VMIO code by making the device node VMIO mode as well.
+ *
+ * vp must be locked when vfs_object_create is called.
+ */
+int
+vfs_object_create(vp, td, cred)
+ struct vnode *vp;
+ struct thread *td;
+ struct ucred *cred;
+{
+ GIANT_REQUIRED;
+ return (VOP_CREATEVOBJECT(vp, cred, td));
+}
+
+/*
+ * Mark a vnode as free, putting it up for recycling.
+ */
+void
+vfree(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ mtx_lock(&vnode_free_list_mtx);
+ KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
+ if (vp->v_flag & VAGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ freevnodes++;
+ mtx_unlock(&vnode_free_list_mtx);
+ vp->v_flag &= ~VAGE;
+ vp->v_flag |= VFREE;
+ splx(s);
+}
+
+/*
+ * Opposite of vfree() - mark a vnode as in use.
+ */
+void
+vbusy(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ mtx_lock(&vnode_free_list_mtx);
+ KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ freevnodes--;
+ mtx_unlock(&vnode_free_list_mtx);
+ vp->v_flag &= ~(VFREE|VAGE);
+ splx(s);
+}
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, td, events)
+ struct vnode *vp;
+ struct thread *td;
+ short events;
+{
+
+ if (vp->v_pollinfo == NULL)
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo->vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+ * in available for the other process which
+ * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo->vpi_revents;
+ vp->v_pollinfo->vpi_revents &= ~events;
+
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return events;
+ }
+ vp->v_pollinfo->vpi_events |= events;
+ selrecord(td, &vp->v_pollinfo->vpi_selinfo);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return 0;
+}
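+
+/*
+ * Illustrative sketch (editorial note, not part of this change): a
+ * minimal filesystem poll method can delegate entirely to
+ * vn_pollrecord(); the vop name below is hypothetical.
+ *
+ *    static int
+ *    examplefs_poll(struct vop_poll_args *ap)
+ *    {
+ *            return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ *    }
+ */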
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+ struct vnode *vp;
+ short events;
+{
+
+ if (vp->v_pollinfo == NULL)
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo->vpi_events & events) {
+ /*
+ * We clear vpi_events so that we don't
+ * call selwakeup() twice if two events are
+ * posted before the polling process(es) is
+ * awakened. This also ensures that we take at
+ * most one selwakeup() if the polling process
+ * is no longer interested. However, it does
+ * mean that only one event can be noticed at
+ * a time. (Perhaps we should only clear those
+ * event bits which we note?) XXX
+ */
+ vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */
+ vp->v_pollinfo->vpi_revents |= events;
+ selwakeup(&vp->v_pollinfo->vpi_selinfo);
+ }
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+}
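+
+/*
+ * Illustrative sketch (editorial note, not part of this change): a
+ * write path would typically note new readable data through the
+ * VN_POLLEVENT() wrapper, which skips the call when nobody is
+ * polling the vnode:
+ *
+ *    VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
+ */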
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+ struct vnode *vp;
+{
+
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ VN_KNOTE(vp, NOTE_REVOKE);
+ if (vp->v_pollinfo->vpi_events) {
+ vp->v_pollinfo->vpi_events = 0;
+ selwakeup(&vp->v_pollinfo->vpi_selinfo);
+ }
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*)(struct vop_close_args *))nullop)
+static int sync_fsync(struct vop_fsync_args *);
+static int sync_inactive(struct vop_inactive_args *);
+static int sync_reclaim(struct vop_reclaim_args *);
+#define sync_lock ((int (*)(struct vop_lock_args *))vop_nolock)
+#define sync_unlock ((int (*)(struct vop_unlock_args *))vop_nounlock)
+static int sync_print(struct vop_print_args *);
+#define sync_islocked ((int(*)(struct vop_islocked_args *))vop_noislocked)
+
+static vop_t **sync_vnodeop_p;
+static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+ { &vop_default_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_close_desc, (vop_t *) sync_close }, /* close */
+ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
+ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
+ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
+ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
+ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
+ { &vop_print_desc, (vop_t *) sync_print }, /* print */
+ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
+ { NULL, NULL }
+};
+static struct vnodeopv_desc sync_vnodeop_opv_desc =
+ { &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+ struct mount *mp;
+{
+ struct vnode *vp;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+ mp->mnt_syncer = NULL;
+ return (error);
+ }
+ vp->v_type = VNON;
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+ mp->mnt_syncer = vp;
+ return (0);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ struct ucred *a_cred;
+ int a_waitfor;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ struct thread *td = ap->a_td;
+ int asyncflag;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ mtx_lock(&mountlist_mtx);
+ if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+ }
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
+ vfs_unbusy(mp, td);
+ return (0);
+ }
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
+ if (asyncflag)
+ mp->mnt_flag |= MNT_ASYNC;
+ vn_finished_write(mp);
+ vfs_unbusy(mp, td);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ *
+ * Modifications to the worklist must be protected at splbio().
+ */
+static int
+sync_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ int s;
+
+ s = splbio();
+ vp->v_mount->mnt_syncer = NULL;
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ vp->v_flag &= ~VONWORKLST;
+ }
+ splx(s);
+
+ return (0);
+}
+
+/*
+ * Print out a syncer vnode.
+ */
+static int
+sync_print(ap)
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ printf("syncer vnode");
+ if (vp->v_vnlock != NULL)
+ lockmgr_printinfo(vp->v_vnlock);
+ printf("\n");
+ return (0);
+}
+
+/*
+ * Extract the dev_t from a VCHR vnode.
+ */
+dev_t
+vn_todev(vp)
+ struct vnode *vp;
+{
+ if (vp->v_type != VCHR)
+ return (NODEV);
+ return (vp->v_rdev);
+}
+
+/*
+ * Check whether the vnode represents a disk device.
+ */
+int
+vn_isdisk(vp, errp)
+ struct vnode *vp;
+ int *errp;
+{
+ struct cdevsw *cdevsw;
+
+ if (vp->v_type != VCHR) {
+ if (errp != NULL)
+ *errp = ENOTBLK;
+ return (0);
+ }
+ if (vp->v_rdev == NULL) {
+ if (errp != NULL)
+ *errp = ENXIO;
+ return (0);
+ }
+ cdevsw = devsw(vp->v_rdev);
+ if (cdevsw == NULL) {
+ if (errp != NULL)
+ *errp = ENXIO;
+ return (0);
+ }
+ if (!(cdevsw->d_flags & D_DISK)) {
+ if (errp != NULL)
+ *errp = ENOTBLK;
+ return (0);
+ }
+ if (errp != NULL)
+ *errp = 0;
+ return (1);
+}
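+
+/*
+ * Illustrative sketch (editorial note, not part of this change): the
+ * usual caller pattern lets vn_isdisk() choose the errno:
+ *
+ *    int error;
+ *
+ *    if (!vn_isdisk(vp, &error))
+ *            return (error);           -- ENOTBLK or ENXIO as set above
+ */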
+
+/*
+ * Free data allocated by namei(); see namei(9) for details.
+ */
+void
+NDFREE(ndp, flags)
+ struct nameidata *ndp;
+ const uint flags;
+{
+ if (!(flags & NDF_NO_FREE_PNBUF) &&
+ (ndp->ni_cnd.cn_flags & HASBUF)) {
+ uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ ndp->ni_cnd.cn_flags &= ~HASBUF;
+ }
+ if (!(flags & NDF_NO_DVP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
+ ndp->ni_dvp != ndp->ni_vp)
+ VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
+ if (!(flags & NDF_NO_DVP_RELE) &&
+ (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
+ vrele(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ }
+ if (!(flags & NDF_NO_VP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
+ VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
+ if (!(flags & NDF_NO_VP_RELE) &&
+ ndp->ni_vp) {
+ vrele(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ }
+ if (!(flags & NDF_NO_STARTDIR_RELE) &&
+ (ndp->ni_cnd.cn_flags & SAVESTART)) {
+ vrele(ndp->ni_startdir);
+ ndp->ni_startdir = NULL;
+ }
+}
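+
+/*
+ * Illustrative sketch (editorial note, not part of this change): the
+ * common lookup pattern used throughout the VFS code frees only the
+ * pathname buffer and keeps the vnode reference:
+ *
+ *    NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
+ *    if ((error = namei(&nd)) != 0)
+ *            return (error);
+ *    NDFREE(&nd, NDF_ONLY_PNBUF);      -- free cn_pnbuf, keep nd.ni_vp
+ *    ...
+ *    vrele(nd.ni_vp);
+ */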
+
+/*
+ * Common filesystem object access control check routine. Accepts a
+ * vnode's type, "mode", uid and gid, requested access mode, credentials,
+ * and optional call-by-reference privused argument allowing vaccess()
+ * to indicate to the caller whether privilege was used to satisfy the
+ * request. Returns 0 on success, or an errno on failure.
+ */
+int
+vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
+ enum vtype type;
+ mode_t file_mode;
+ uid_t file_uid;
+ gid_t file_gid;
+ mode_t acc_mode;
+ struct ucred *cred;
+ int *privused;
+{
+ mode_t dac_granted;
+#ifdef CAPABILITIES
+ mode_t cap_granted;
+#endif
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that.
+ */
+
+ if (privused != NULL)
+ *privused = 0;
+
+ dac_granted = 0;
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ dac_granted |= VADMIN;
+ if (file_mode & S_IXUSR)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRUSR)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWUSR)
+ dac_granted |= VWRITE;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRGRP)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWGRP)
+ dac_granted |= VWRITE;
+
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IROTH)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWOTH)
+ dac_granted |= VWRITE;
+ if ((acc_mode & dac_granted) == acc_mode)
+ return (0);
+
+privcheck:
+ if (!suser_cred(cred, PRISON_ROOT)) {
+ /* XXX audit: privilege used */
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+#ifdef CAPABILITIES
+ /*
+ * Build a capability mask to determine if the set of capabilities
+ * satisfies the requirements when combined with the granted mask
+ * from above.
+ * For each capability, if the capability is required, bitwise
+ * or the request type onto the cap_granted mask.
+ */
+ cap_granted = 0;
+
+ if (type == VDIR) {
+ /*
+ * For directories, use CAP_DAC_READ_SEARCH to satisfy
+ * VEXEC requests, instead of CAP_DAC_EXECUTE.
+ */
+ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ } else {
+ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
+ cap_granted |= VEXEC;
+ }
+
+ if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
+ !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
+ cap_granted |= VREAD;
+
+ if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
+ !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
+ cap_granted |= VWRITE;
+
+ if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
+ !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
+ cap_granted |= VADMIN;
+
+ if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
+ /* XXX audit: privilege used */
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+#endif
+
+ return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
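+
+/*
+ * Illustrative sketch (editorial note, not part of this change): a
+ * filesystem access method typically pulls the type, mode and
+ * ownership out of its in-core inode and lets vaccess() make the
+ * decision; the inode field names below are hypothetical.
+ *
+ *    return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
+ *        ap->a_mode, ap->a_cred, NULL));
+ */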
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
new file mode 100644
index 0000000..1244e54
--- /dev/null
+++ b/sys/kern/vfs_syscalls.c
@@ -0,0 +1,4862 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $FreeBSD$
+ */
+
+/* For 4.3 integer FS ID compatibility */
+#include "opt_compat.h"
+#include "opt_ffs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sysent.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/linker.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+#include <sys/extattr.h>
+#include <sys/jail.h>
+#include <sys/sysctl.h>
+
+#include <machine/limits.h>
+#include <machine/stdarg.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+static int change_dir(struct nameidata *ndp, struct thread *td);
+static void checkdirs(struct vnode *olddp, struct vnode *newdp);
+static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int getutimes(const struct timeval *, struct timespec *);
+static int setfown(struct thread *td, struct vnode *, uid_t, gid_t);
+static int setfmode(struct thread *td, struct vnode *, int);
+static int setfflags(struct thread *td, struct vnode *, int);
+static int setutimes(struct thread *td, struct vnode *,
+ const struct timespec *, int);
+static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td);
+static int vfs_nmount(struct thread *td, int, struct uio *);
+
+static int usermount = 0; /* if 1, non-root can mount fs. */
+
+int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *);
+
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
+
+/*
+ * Virtual File System System Calls
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct nmount_args {
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+nmount(td, uap)
+ struct thread *td;
+ struct nmount_args /* {
+ syscallarg(struct iovec *) iovp;
+ syscallarg(unsigned int) iovcnt;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct uio auio;
+ struct iovec *iov, *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ unsigned int i;
+ int error;
+ u_int iovlen, iovcnt;
+
+ iovcnt = SCARG(uap, iovcnt);
+ iovlen = iovcnt * sizeof (struct iovec);
+ /*
+ * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV))
+ return (EINVAL);
+
+ if (iovcnt > UIO_SMALLIOV) {
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_USERSPACE;
+ if ((error = copyin(uap->iovp, iov, iovlen)))
+ goto finish;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (iov->iov_len > MMAXOPTIONLEN) {
+ error = EINVAL;
+ goto finish;
+ }
+ iov++;
+ }
+ error = vfs_nmount(td, SCARG(uap, flags), &auio);
+finish:
+ if (needfree != NULL)
+ free(needfree, M_TEMP);
+ return (error);
+}
+
+/*
+ * Release all resources related to the
+ * mount options.
+ */
+void
+vfs_freeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt;
+
+ while (!TAILQ_EMPTY(opts)) {
+ opt = TAILQ_FIRST(opts);
+ TAILQ_REMOVE(opts, opt, link);
+ free(opt->name, M_MOUNT);
+ free(opt->value, M_MOUNT);
+ free(opt, M_MOUNT);
+ }
+ free(opts, M_MOUNT);
+}
+
+int
+kernel_mount(iovp, iovcnt, flags)
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+{
+ struct uio auio;
+ int error;
+
+ /*
+ * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4))
+ return (EINVAL);
+
+ auio.uio_iov = iovp;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = vfs_nmount(curthread, flags, &auio);
+ return (error);
+}
+
+int
+kernel_vmount(int flags, ...)
+{
+ struct iovec *iovp;
+ struct uio auio;
+ va_list ap;
+ unsigned int iovcnt, iovlen, len;
+ const char *cp;
+ char *buf, *pos;
+ size_t n;
+ int error, i;
+
+ len = 0;
+ va_start(ap, flags);
+ for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++)
+ len += strlen(cp) + 1;
+ va_end(ap);
+
+ if (iovcnt < 4 || iovcnt & 1)
+ return (EINVAL);
+
+ iovlen = iovcnt * sizeof (struct iovec);
+ MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK);
+ MALLOC(buf, char *, len, M_MOUNT, M_WAITOK);
+ pos = buf;
+ va_start(ap, flags);
+ for (i = 0; i < iovcnt; i++) {
+ cp = va_arg(ap, const char *);
+ copystr(cp, pos, len - (pos - buf), &n);
+ iovp[i].iov_base = pos;
+ iovp[i].iov_len = n;
+ pos += n;
+ }
+ va_end(ap);
+
+ auio.uio_iov = iovp;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = vfs_nmount(curthread, flags, &auio);
+ FREE(iovp, M_MOUNT);
+ FREE(buf, M_MOUNT);
+ return (error);
+}
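+
+/*
+ * Illustrative sketch (editorial note, not part of this change): the
+ * variadic form takes NUL-terminated name/value string pairs followed
+ * by a terminating NULL; "fstype" and "fspath" are the two options
+ * vfs_nmount() insists on.  The device and filesystem names below are
+ * examples only.
+ *
+ *    error = kernel_vmount(MNT_RDONLY,
+ *        "fstype", "cd9660",
+ *        "fspath", "/cdrom",
+ *        "from", "/dev/acd0c",
+ *        NULL);
+ */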
+
+/*
+ * vfs_nmount(): actually attempt a filesystem mount.
+ */
+static int
+vfs_nmount(td, fsflags, fsoptions)
+ struct thread *td;
+ int fsflags; /* Flags common to all filesystems. */
+ struct uio *fsoptions; /* Options local to the filesystem. */
+{
+ linker_file_t lf;
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ struct vfsoptlist *optlist;
+ char *fstype, *fspath;
+ int error, flag = 0, kern_flag = 0;
+ int fstypelen, fspathlen;
+ struct vattr va;
+ struct nameidata nd;
+
+ error = vfs_buildopts(fsoptions, &optlist);
+ if (error)
+ return (error);
+
+ /*
+ * We need these two options before the others,
+ * and they are mandatory for any filesystem.
+ * Ensure they are NUL terminated as well.
+ */
+ fstypelen = 0;
+ error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
+ if (error || fstype[fstypelen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+ fspathlen = 0;
+ error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
+ if (error || fspath[fspathlen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+
+ if (usermount == 0) {
+ error = suser(td);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Do not allow NFS export by non-root users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = suser(td);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (suser(td))
+ fsflags |= MNT_NOSUID | MNT_NODEV;
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+ if ((error = namei(&nd)) != 0)
+ goto bad;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (fsflags & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ error = EINVAL;
+ goto bad;
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ kern_flag = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((fsflags & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ error = EOPNOTSUPP; /* Needs translation */
+ goto bad;
+ }
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vfs_unbusy(mp, td);
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_flag |= fsflags &
+ (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT);
+ VOP_UNLOCK(vp, 0, td);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ if (va.va_uid != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ }
+ if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) {
+ vput(vp);
+ goto bad;
+ }
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ error = ENOTDIR;
+ goto bad;
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ /* Only load modules for root (very important!). */
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error) {
+ vput(vp);
+ goto bad;
+ }
+ error = linker_load_file(fstype, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ goto bad;
+ }
+ lf->userrefs++;
+ /* Look up again to see if the VFS was loaded. */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ error = ENODEV;
+ goto bad;
+ }
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vput(vp);
+ error = EBUSY;
+ goto bad;
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ TAILQ_INIT(&mp->mnt_reservedvnlist);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
+ strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+ VOP_UNLOCK(vp, 0, td);
+
+update:
+ mp->mnt_optnew = optlist;
+ /*
+ * Check if the fs implements the new VFS_NMOUNT()
+ * function, since the new system call was used.
+ */
+ if (mp->mnt_op->vfs_mount != NULL) {
+ printf("%s doesn't support the new mount syscall\n",
+ mp->mnt_vfc->vfc_name);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ if (mp->mnt_flag & MNT_UPDATE)
+ vfs_unbusy(mp, td);
+ else {
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ }
+ vrele(vp);
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+
+ /*
+ * Set the mount level flags.
+ */
+ if (fsflags & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE);
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_NMOUNT(mp, &nd, td);
+ if (!error) {
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ }
+ /*
+ * Prevent external consumers of mount
+ * options from reading mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+ if (mp->mnt_flag & MNT_UPDATE) {
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~
+ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = kern_flag;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, td);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ vrele(vp);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ struct vnode *newdp;
+
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ if (VFS_ROOT(mp, &newdp))
+ panic("mount: lost mount");
+ checkdirs(vp, newdp);
+ vput(newdp);
+ VOP_UNLOCK(vp, 0, td);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, td);
+ if ((error = VFS_START(mp, 0, td)) != 0) {
+ vrele(vp);
+ goto bad;
+ }
+ } else {
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ vput(vp);
+ goto bad;
+ }
+ return (0);
+bad:
+ vfs_freeopts(optlist);
+ return (error);
+}
+
+/*
+ * Old Mount API.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+mount(td, uap)
+ struct thread *td;
+ struct mount_args /* {
+ syscallarg(char *) type;
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(caddr_t) data;
+ } */ *uap;
+{
+ char *fstype;
+ char *fspath;
+ int error;
+
+ fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
+ fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+
+ /*
+ * vfs_mount() actually takes a kernel string for `type' and
+ * `path' now, so extract them.
+ */
+ error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL);
+ if (error)
+ goto finish;
+ error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL);
+ if (error)
+ goto finish;
+ error = vfs_mount(td, fstype, fspath, SCARG(uap, flags),
+ SCARG(uap, data));
+finish:
+ free(fstype, M_TEMP);
+ free(fspath, M_TEMP);
+ return (error);
+}
+
+/*
+ * vfs_mount(): actually attempt a filesystem mount.
+ *
+ * This routine is designed to be a "generic" entry point for routines
+ * that wish to mount a filesystem. All parameters except `fsdata' are
+ * pointers into kernel space. `fsdata' is currently still a pointer
+ * into userspace.
+ */
+int
+vfs_mount(td, fstype, fspath, fsflags, fsdata)
+ struct thread *td;
+ const char *fstype;
+ char *fspath;
+ int fsflags;
+ void *fsdata;
+{
+ linker_file_t lf;
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ int error, flag = 0, kern_flag = 0;
+ struct vattr va;
+ struct nameidata nd;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ return (ENAMETOOLONG);
+
+ if (usermount == 0) {
+ error = suser(td);
+ if (error)
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = suser(td);
+ if (error)
+ return (error);
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (suser(td))
+ fsflags |= MNT_NOSUID | MNT_NODEV;
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (fsflags & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ kern_flag = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((fsflags & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vfs_unbusy(mp, td);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_flag |= fsflags &
+ (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT);
+ VOP_UNLOCK(vp, 0, td);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ if (va.va_uid != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+ if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) {
+ vput(vp);
+ return (error);
+ }
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ return (ENOTDIR);
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ /* Only load modules for root (very important!). */
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = linker_load_file(fstype, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ return (error);
+ }
+ lf->userrefs++;
+ /* Look up again to see if the VFS was loaded. */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstype))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ return (ENODEV);
+ }
+ }
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ mtx_unlock(&vp->v_interlock);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ TAILQ_INIT(&mp->mnt_reservedvnlist);
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
+ strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+ VOP_UNLOCK(vp, 0, td);
+update:
+ /*
+ * Check if the fs implements the old VFS_MOUNT()
+ * function, since the old system call was used.
+ */
+ if (mp->mnt_op->vfs_mount == NULL) {
+ printf("%s doesn't support the old mount syscall\n",
+ mp->mnt_vfc->vfc_name);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ if (mp->mnt_flag & MNT_UPDATE)
+ vfs_unbusy(mp, td);
+ else {
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ }
+ vrele(vp);
+ return (EOPNOTSUPP);
+ }
+
+ /*
+ * Set the mount level flags.
+ */
+ if (fsflags & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE);
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~
+ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = kern_flag;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, td);
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ vrele(vp);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ struct vnode *newdp;
+
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ mtx_unlock(&vp->v_interlock);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ if (VFS_ROOT(mp, &newdp))
+ panic("mount: lost mount");
+ checkdirs(vp, newdp);
+ vput(newdp);
+ VOP_UNLOCK(vp, 0, td);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, td);
+ if ((error = VFS_START(mp, 0, td)) != 0)
+ vrele(vp);
+ } else {
+ mtx_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ mtx_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, td);
+ free(mp, M_MOUNT);
+ vput(vp);
+ }
+ return (error);
+}
+
+/*
+ * Scan all active processes to see if any of them have a current
+ * or root directory of `olddp'. If so, replace them with the new
+ * mount point.
+ */
+static void
+checkdirs(olddp, newdp)
+ struct vnode *olddp, *newdp;
+{
+ struct filedesc *fdp;
+ struct proc *p;
+ int nrele;
+
+ if (olddp->v_usecount == 1)
+ return;
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ fdp = p->p_fd;
+ if (fdp == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ nrele = 0;
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ VREF(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ VREF(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ FILEDESC_UNLOCK(fdp);
+ PROC_UNLOCK(p);
+ while (nrele--)
+ vrele(olddp);
+ }
+ sx_sunlock(&allproc_lock);
+ if (rootvnode == olddp) {
+ vrele(rootvnode);
+ VREF(newdp);
+ rootvnode = newdp;
+ }
+}
+
+/*
+ * Unmount a filesystem.
+ *
+ * Note: unmount takes a path to the vnode mounted on as its argument,
+ * not the special file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+unmount(td, uap)
+ struct thread *td;
+ register struct unmount_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ mp = vp->v_mount;
+
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to unmount this filesystem.
+ */
+ if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) {
+ error = suser(td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ }
+
+ /*
+ * Don't allow unmounting the root filesystem.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vput(vp);
+ return (EINVAL);
+ }
+
+ /*
+ * Must be the root of the filesystem
+ */
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ vput(vp);
+ return (dounmount(mp, SCARG(uap, flags), td));
+}
+
+/*
+ * Do the actual filesystem unmount.
+ */
+int
+dounmount(mp, flags, td)
+ struct mount *mp;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *coveredvp, *fsrootvp;
+ int error;
+ int async_flag;
+
+ mtx_lock(&mountlist_mtx);
+ if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ mtx_unlock(&mountlist_mtx);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ /* Allow filesystems to detect that a forced unmount is in progress. */
+ if (flags & MNT_FORCE)
+ mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
+ ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td);
+ if (error) {
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ return (error);
+ }
+ vn_start_write(NULL, &mp, V_WAIT);
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &=~ MNT_ASYNC;
+ cache_purgevfs(mp); /* remove cache entries for this file sys */
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ /* Move process cdir/rdir refs on fs root to underlying vnode. */
+ if (VFS_ROOT(mp, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ checkdirs(fsrootvp, mp->mnt_vnodecovered);
+ if (fsrootvp == rootvnode) {
+ vrele(rootvnode);
+ rootvnode = NULL;
+ }
+ vput(fsrootvp);
+ }
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) ||
+ (flags & MNT_FORCE)) {
+ error = VFS_UNMOUNT(mp, flags, td);
+ }
+ vn_finished_write(mp);
+ if (error) {
+ /* Undo cdir/rdir and rootvnode changes made above. */
+ if (VFS_ROOT(mp, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ checkdirs(mp->mnt_vnodecovered, fsrootvp);
+ if (rootvnode == NULL) {
+ rootvnode = fsrootvp;
+ vref(rootvnode);
+ }
+ vput(fsrootvp);
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+ (void) vfs_allocate_syncvnode(mp);
+ mtx_lock(&mountlist_mtx);
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_flag |= async_flag;
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK,
+ &mountlist_mtx, td);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ return (error);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL)
+ coveredvp->v_mountedhere = NULL;
+ mp->mnt_vfc->vfc_refcount--;
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
+ panic("unmount: dangling vnode");
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td);
+ lockdestroy(&mp->mnt_lock);
+ if (coveredvp != NULL)
+ vrele(coveredvp);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup(mp);
+ if (mp->mnt_op->vfs_mount == NULL)
+ vfs_freeopts(mp->mnt_opt);
+ free(mp, M_MOUNT);
+ return (0);
+}
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/* ARGSUSED */
+int
+sync(td, uap)
+ struct thread *td;
+ struct sync_args *uap;
+{
+ struct mount *mp, *nmp;
+ int asyncflag;
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+ vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT,
+ ((td != NULL) ? td->td_ucred : NOCRED), td);
+ mp->mnt_flag |= asyncflag;
+ vn_finished_write(mp);
+ }
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
+#ifdef DIAGNOSTIC
+ if (syncprt)
+ vfs_bufstats();
+#endif /* DIAGNOSTIC */
+#endif
+ return (0);
+}
+
+/* XXX PRISON: could be per prison flag */
+static int prison_quotas;
+#if 0
+SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
+#endif
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+/* ARGSUSED */
+int
+quotactl(td, uap)
+ struct thread *td;
+ register struct quotactl_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) cmd;
+ syscallarg(int) uid;
+ syscallarg(caddr_t) arg;
+ } */ *uap;
+{
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ if (jailed(td->td_ucred) && !prison_quotas)
+ return (EPERM);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
+ vrele(nd.ni_vp);
+ if (error)
+ return (error);
+ error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
+ SCARG(uap, arg), td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+statfs(td, uap)
+ struct thread *td;
+ register struct statfs_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ register struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct nameidata nd;
+ struct statfs sb;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ sp = &mp->mnt_stat;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ error = VFS_STATFS(mp, sp, td);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+fstatfs(td, uap)
+ struct thread *td;
+ register struct fstatfs_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ struct file *fp;
+ struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct statfs sb;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ mp = ((struct vnode *)fp->f_data)->v_mount;
+ fdrop(fp, td);
+ if (mp == NULL)
+ return (EBADF);
+ sp = &mp->mnt_stat;
+ error = VFS_STATFS(mp, sp, td);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+getfsstat(td, uap)
+ struct thread *td;
+ register struct getfsstat_args /* {
+ syscallarg(struct statfs *) buf;
+ syscallarg(long) bufsize;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct mount *mp, *nmp;
+ register struct statfs *sp;
+ caddr_t sfsp;
+ long count, maxcount, error;
+
+ maxcount = SCARG(uap, bufsize) / sizeof(struct statfs);
+ sfsp = (caddr_t)SCARG(uap, buf);
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+ * If MNT_NOWAIT or MNT_LAZY is specified, do not
+ * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
+ * overrides MNT_WAIT.
+ */
+ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (SCARG(uap, flags) & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp, td))) {
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ continue;
+ }
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = copyout(sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, td);
+ return (error);
+ }
+ sfsp += sizeof(*sp);
+ }
+ count++;
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp, td);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (sfsp && count > maxcount)
+ td->td_retval[0] = maxcount;
+ else
+ td->td_retval[0] = count;
+ return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fchdir(td, uap)
+ struct thread *td;
+ struct fchdir_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct vnode *vp, *tdp, *vpold;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VREF(vp);
+ fdrop(fp, td);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0, 0, td))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, td);
+ if (error)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ FILEDESC_LOCK(fdp);
+ vpold = fdp->fd_cdir;
+ fdp->fd_cdir = vp;
+ FILEDESC_UNLOCK(fdp);
+ vrele(vpold);
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chdir(td, uap)
+ struct thread *td;
+ struct chdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+ struct nameidata nd;
+ struct vnode *vp;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = change_dir(&nd, td)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_LOCK(fdp);
+ vp = fdp->fd_cdir;
+ fdp->fd_cdir = nd.ni_vp;
+ FILEDESC_UNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
+
+/*
+ * Helper function for the raised chroot(2) security policy: refuse the
+ * operation if any file descriptors are open directories.
+ */
+static int
+chroot_refuse_vdir_fds(fdp)
+ struct filedesc *fdp;
+{
+ struct vnode *vp;
+ struct file *fp;
+ int fd;
+
+ FILEDESC_LOCK(fdp);
+ for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL)
+ continue;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type == VDIR) {
+ FILEDESC_UNLOCK(fdp);
+ return (EPERM);
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ return (0);
+}
+
+/*
+ * This sysctl determines if we will allow a process to chroot(2) if it
+ * has a directory open:
+ * 0: disallowed for all processes.
+ * 1: allowed for processes that were not already chroot(2)'ed.
+ * 2: allowed for all processes.
+ */
+
+static int chroot_allow_open_directories = 1;
+
+SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
+ &chroot_allow_open_directories, 0, "");
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chroot(td, uap)
+ struct thread *td;
+ struct chroot_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+ struct nameidata nd;
+ struct vnode *vp;
+
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ FILEDESC_LOCK(fdp);
+ if (chroot_allow_open_directories == 0 ||
+ (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
+ FILEDESC_UNLOCK(fdp);
+ error = chroot_refuse_vdir_fds(fdp);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = change_dir(&nd, td)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_LOCK(fdp);
+ vp = fdp->fd_rdir;
+ fdp->fd_rdir = nd.ni_vp;
+ if (!fdp->fd_jdir) {
+ fdp->fd_jdir = nd.ni_vp;
+ VREF(fdp->fd_jdir);
+ }
+ FILEDESC_UNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
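
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * of the kern.chroot_allow_open_directories policy enforced above.  The
 * sketch tightens the sysctl to 0 (refuse chroot(2) for any process with
 * an open directory descriptor) before chrooting; the helper name is
 * hypothetical and changing the sysctl normally requires privilege.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <unistd.h>
#include <err.h>

static void
chroot_strict(const char *dir)
{
	int strict = 0;

	if (sysctlbyname("kern.chroot_allow_open_directories",
	    NULL, NULL, &strict, sizeof(strict)) != 0)
		warn("sysctlbyname");
	if (chroot(dir) != 0 || chdir("/") != 0)
		err(1, "chroot %s", dir);
}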
+
+/*
+ * Common routine for chroot and chdir.
+ */
+static int
+change_dir(ndp, td)
+ register struct nameidata *ndp;
+ struct thread *td;
+{
+ struct vnode *vp;
+ int error;
+
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ if (error)
+ vput(vp);
+ else
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+open(td, uap)
+ struct thread *td;
+ register struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+ struct vattr vat;
+ struct mount *mp;
+ int cmode, flags, oflags;
+ struct file *nfp;
+ int type, indx, error;
+ struct flock lf;
+ struct nameidata nd;
+
+ oflags = SCARG(uap, flags);
+ if ((oflags & O_ACCMODE) == O_ACCMODE)
+ return (EINVAL);
+ flags = FFLAGS(oflags);
+ error = falloc(td, &nfp, &indx);
+ if (error)
+ return (error);
+ fp = nfp;
+ FILEDESC_LOCK(fdp);
+ cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+ FILEDESC_UNLOCK(fdp);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ td->td_dupfd = -indx - 1; /* XXX check for fdopen */
+ /*
+ * Bump the ref count to prevent another process from closing
+ * the descriptor while we are blocked in vn_open()
+ */
+ fhold(fp);
+ error = vn_open(&nd, &flags, cmode);
+ if (error) {
+ /*
+ * release our own reference
+ */
+ fdrop(fp, td);
+
+ /*
+ * handle special fdopen() case. bleh. dupfdopen() is
+ * responsible for dropping the old contents of ofiles[indx]
+ * if it succeeds.
+ */
+ if ((error == ENODEV || error == ENXIO) &&
+ td->td_dupfd >= 0 && /* XXX from fdopen */
+ (error =
+ dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
+ td->td_retval[0] = indx;
+ return (0);
+ }
+ /*
+ * Clean up the descriptor, but only if another thread hadn't
+ * replaced or closed it.
+ */
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+ }
+ td->td_dupfd = 0;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ /*
+ * There should be 2 references on the file, one from the descriptor
+ * table, and one for us.
+ *
+ * Handle the case where someone closed the file (via its file
+ * descriptor) while we were blocked. The end result should look
+ * like opening the file succeeded but it was immediately closed.
+ */
+ FILEDESC_LOCK(fdp);
+ FILE_LOCK(fp);
+ if (fp->f_count == 1) {
+ KASSERT(fdp->fd_ofiles[indx] != fp,
+ ("Open file descriptor lost all refs"));
+ FILEDESC_UNLOCK(fdp);
+ FILE_UNLOCK(fp);
+ VOP_UNLOCK(vp, 0, td);
+ vn_close(vp, flags & FMASK, fp->f_cred, td);
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+ }
+
+ /* assert that vn_open created a backing object if one is needed */
+ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0,
+ ("open: vmio vnode has no backing object after vn_open"));
+
+ fp->f_data = vp;
+ fp->f_flag = flags & FMASK;
+ fp->f_ops = &vnops;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ FILEDESC_UNLOCK(fdp);
+ FILE_UNLOCK(fp);
+ VOP_UNLOCK(vp, 0, td);
+ if (flags & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (flags & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((flags & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ type)) != 0)
+ goto bad;
+ fp->f_flag |= FHASLOCK;
+ }
+ if (flags & O_TRUNC) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto bad;
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ VATTR_NULL(&vat);
+ vat.va_size = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_SETATTR(vp, &vat, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ if (error)
+ goto bad;
+ }
+ /*
+ * Release our private reference, leaving the one associated with
+ * the descriptor table intact.
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+bad:
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ return (error);
+}
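
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * of the O_EXLOCK/O_TRUNC handling in open() above.  With FNONBLOCK the
 * advisory lock request does not wait, so a file locked elsewhere
 * typically fails with EAGAIN.  The helper name is hypothetical.
 */
#include <fcntl.h>
#include <errno.h>
#include <err.h>

static int
open_locked(const char *path)
{
	int fd;

	fd = open(path, O_RDWR | O_CREAT | O_TRUNC | O_EXLOCK | O_NONBLOCK,
	    0644);
	if (fd == -1 && errno == EAGAIN)
		warnx("%s is locked by another process", path);
	return (fd);
}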
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(td, uap)
+ struct thread *td;
+ register struct ocreat_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, mode) = SCARG(uap, mode);
+ SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC;
+ return (open(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+/* ARGSUSED */
+int
+mknod(td, uap)
+ struct thread *td;
+ register struct mknod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ syscallarg(int) dev;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+ int whiteout = 0;
+ struct nameidata nd;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFCHR:
+ case S_IFBLK:
+ error = suser(td);
+ break;
+ default:
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ break;
+ }
+ if (error)
+ return (error);
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ vrele(vp);
+ error = EEXIST;
+ } else {
+ VATTR_NULL(&vattr);
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ vattr.va_rdev = SCARG(uap, dev);
+ whiteout = 0;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ if (whiteout)
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ }
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod");
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkfifo(td, uap)
+ struct thread *td;
+ register struct mkfifo_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+link(td, uap)
+ struct thread *td;
+ register struct link_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ bwillwrite();
+ NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR) {
+ vrele(vp);
+ return (EPERM); /* POSIX */
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td);
+ if ((error = namei(&nd)) == 0) {
+ if (nd.ni_vp != NULL) {
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ }
+ vrele(vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "link");
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+symlink(td, uap)
+ struct thread *td;
+ register struct symlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ char *path;
+ int error;
+ struct nameidata nd;
+
+ path = uma_zalloc(namei_zone, M_WAITOK);
+ if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0)
+ goto out;
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ if (nd.ni_vp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ error = EEXIST;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == 0)
+ vput(nd.ni_vp);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+out:
+ uma_zfree(namei_zone, path);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct undelete_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+undelete(td, uap)
+ struct thread *td;
+ register struct undelete_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ int error;
+ struct mount *mp;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete");
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+unlink(td, uap)
+ struct thread *td;
+ struct unlink_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vput(vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink");
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+lseek(td, uap)
+ struct thread *td;
+ register struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct ucred *cred = td->td_ucred;
+ struct file *fp;
+ struct vnode *vp;
+ struct vattr vattr;
+ off_t offset;
+ int error, noneg;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (ESPIPE);
+ }
+ vp = (struct vnode *)fp->f_data;
+ noneg = (vp->v_type != VCHR);
+ offset = SCARG(uap, offset);
+ switch (SCARG(uap, whence)) {
+ case L_INCR:
+ if (noneg &&
+ (fp->f_offset < 0 ||
+ (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
+ fdrop(fp, td);
+ return (EOVERFLOW);
+ }
+ offset += fp->f_offset;
+ break;
+ case L_XTND:
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETATTR(vp, &vattr, cred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (noneg &&
+ (vattr.va_size > OFF_MAX ||
+ (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
+ fdrop(fp, td);
+ return (EOVERFLOW);
+ }
+ offset += vattr.va_size;
+ break;
+ case L_SET:
+ break;
+ default:
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ if (noneg && offset < 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ fp->f_offset = offset;
+ *(off_t *)(td->td_retval) = fp->f_offset;
+ fdrop(fp, td);
+ return (0);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(td, uap)
+ struct thread *td;
+ register struct olseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ nuap;
+ int error;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, offset) = SCARG(uap, offset);
+ SCARG(&nuap, whence) = SCARG(uap, whence);
+ error = lseek(td, &nuap);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions using passed credentials.
+ */
+static int
+vn_access(vp, user_flags, cred, td)
+ struct vnode *vp;
+ int user_flags;
+ struct ucred *cred;
+ struct thread *td;
+{
+ int error, flags;
+
+ /* Flags == 0 means only check for existence. */
+ error = 0;
+ if (user_flags) {
+ flags = 0;
+ if (user_flags & R_OK)
+ flags |= VREAD;
+ if (user_flags & W_OK)
+ flags |= VWRITE;
+ if (user_flags & X_OK)
+ flags |= VEXEC;
+ if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, flags, cred, td);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "real" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+access(td, uap)
+ struct thread *td;
+ register struct access_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct ucred *cred, *tmpcred;
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ /*
+ * Create and modify a temporary credential instead of modifying one
+ * that is potentially shared; changing a shared credential could also
+ * mess up socket buffer accounting, which can run in an interrupt
+ * context.
+ *
+ * XXX - Depending on how "threads" are finally implemented, it
+ * may be better to explicitly pass the credential to namei()
+ * rather than to modify the potentially shared process structure.
+ */
+ cred = td->td_ucred;
+ tmpcred = crdup(cred);
+ tmpcred->cr_uid = cred->cr_ruid;
+ tmpcred->cr_groups[0] = cred->cr_rgid;
+ td->td_ucred = tmpcred;
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ goto out1;
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, SCARG(uap, flags), tmpcred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+out1:
+ td->td_ucred = cred;
+ crfree(tmpcred);
+ return (error);
+}
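
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * contrasting access(2), which the code above evaluates against the real
 * uid/gid via a temporary credential, with eaccess(2), which uses the
 * effective credentials that open(2) would use.  The helper name is
 * hypothetical.
 */
#include <unistd.h>
#include <stdio.h>

static void
compare_checks(const char *path)
{
	printf("real ids:      %s\n",
	    access(path, R_OK | W_OK) == 0 ? "ok" : "denied");
	printf("effective ids: %s\n",
	    eaccess(path, R_OK | W_OK) == 0 ? "ok" : "denied");
}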
+
+/*
+ * Check access permissions using "effective" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct eaccess_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+eaccess(td, uap)
+ struct thread *td;
+ register struct eaccess_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct nameidata nd;
+ struct vnode *vp;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+ostat(td, uap)
+ struct thread *td;
+ register struct ostat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout(&osb, SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+olstat(td, uap)
+ struct thread *td;
+ register struct olstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout(&osb, SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atime = st->st_atime;
+ ost->st_mtime = st->st_mtime;
+ ost->st_ctime = st->st_ctime;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(td, uap)
+ struct thread *td;
+ register struct stat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+ struct nameidata nd;
+
+#ifdef LOOKUP_SHARED
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, path), td);
+#else
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+#endif
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(td, uap)
+ struct thread *td;
+ register struct lstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Implementation of the NetBSD stat() function.
+ * XXX This should probably be collapsed with the FreeBSD version,
+ * as the differences are only due to vn_stat() clearing spares at
+ * the end of the structures. vn_stat could be split to avoid this,
+ * and thus collapse the following to close to zero code.
+ */
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+ bzero(nsb, sizeof *nsb);
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atimespec = sb->st_atimespec;
+ nsb->st_mtimespec = sb->st_mtimespec;
+ nsb->st_ctimespec = sb->st_ctimespec;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+ nsb->st_createtimespec = sb->st_createtimespec;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(td, uap)
+ struct thread *td;
+ register struct nstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(nd.ni_vp, &sb, td);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * NetBSD lstat. Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+ vput(vp);
+ if (error == 0)
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+chflags(td, uap)
+ struct thread *td;
+ register struct chflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Same as chflags() but doesn't follow symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchflags_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+lchflags(td, uap)
+ struct thread *td;
+ register struct lchflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+fchflags(td, uap)
+ struct thread *td;
+ register struct fchflags_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags));
+ fdrop(fp, td);
+ return (error);
+}
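
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * of the chflags(2) family backed by setfflags() above.  User flags such
 * as UF_IMMUTABLE can typically be set by the file's owner; SF_* system
 * flags additionally require superuser privilege.  The helper name is
 * hypothetical.
 */
#include <sys/stat.h>
#include <err.h>

static void
make_immutable(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		err(1, "stat %s", path);
	if (chflags(path, st.st_flags | UF_IMMUTABLE) != 0)
		err(1, "chflags %s", path);
}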
+
+/*
+ * Common implementation code for chmod(), lchmod() and fchmod().
+ */
+static int
+setfmode(td, vp, mode)
+ struct thread *td;
+ struct vnode *vp;
+ int mode;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+chmod(td, uap)
+ struct thread *td;
+ register struct chmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name (don't follow links.)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+lchmod(td, uap)
+ struct thread *td;
+ register struct lchmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+fchmod(td, uap)
+ struct thread *td;
+ register struct fchmod_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ error = setfmode(td, vp, SCARG(uap, mode));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation for chown(), lchown(), and fchown()
+ */
+static int
+setfown(td, vp, uid, gid)
+ struct thread *td;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+chown(td, uap)
+ struct thread *td;
+ register struct chown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name, do not cross symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+lchown(td, uap)
+ struct thread *td;
+ register struct lchown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(td, uap)
+ struct thread *td;
+ register struct fchown_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ error = setfown(td, vp, SCARG(uap, uid), SCARG(uap, gid));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+getutimes(usrtvp, tsp)
+ const struct timeval *usrtvp;
+ struct timespec *tsp;
+{
+ struct timeval tv[2];
+ int error;
+
+ if (usrtvp == NULL) {
+ microtime(&tv[0]);
+ TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
+ tsp[1] = tsp[0];
+ } else {
+ if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0)
+ return (error);
+ TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
+ TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
+ }
+ return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+setutimes(td, vp, ts, nullflag)
+ struct thread *td;
+ struct vnode *vp;
+ const struct timespec *ts;
+ int nullflag;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_atime = ts[0];
+ vattr.va_mtime = ts[1];
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(td, uap)
+ struct thread *td;
+ register struct utimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct timeval *usrtvp;
+ int error;
+ struct nameidata nd;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
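
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * of the tptr == NULL handling above: a NULL timeval pointer asks the
 * kernel to stamp the current time (the VA_UTIMES_NULL path), while an
 * explicit pair sets the access and modification times.  Helper names
 * are hypothetical.
 */
#include <sys/types.h>
#include <sys/time.h>
#include <err.h>

static void
touch_now(const char *path)
{
	if (utimes(path, NULL) != 0)		/* "set to now" */
		err(1, "utimes %s", path);
}

static void
set_times(const char *path, time_t atime, time_t mtime)
{
	struct timeval tv[2];

	tv[0].tv_sec = atime;			/* access time */
	tv[0].tv_usec = 0;
	tv[1].tv_sec = mtime;			/* modification time */
	tv[1].tv_usec = 0;
	if (utimes(path, tv) != 0)
		err(1, "utimes %s", path);
}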
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(td, uap)
+ struct thread *td;
+ register struct lutimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct timeval *usrtvp;
+ int error;
+ struct nameidata nd;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(td, uap)
+ struct thread *td;
+ register struct futimes_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timespec ts[2];
+ struct file *fp;
+ struct timeval *usrtvp;
+ int error;
+
+ usrtvp = SCARG(uap, tptr);
+ if ((error = getutimes(usrtvp, ts)) != 0)
+ return (error);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+truncate(td, uap)
+ struct thread *td;
+ register struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ if (uap->length < 0)
+ return (EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ }
+ vput(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+ftruncate(td, uap)
+ struct thread *td;
+ register struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (uap->length < 0)
+ return (EINVAL);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FWRITE) == 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ vp = (struct vnode *)fp->f_data;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred, td);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ fdrop(fp, td);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+otruncate(td, uap)
+ struct thread *td;
+ register struct otruncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (truncate(td, &nuap));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+oftruncate(td, uap)
+ struct thread *td;
+ register struct oftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (ftruncate(td, &nuap));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fsync(td, uap)
+ struct thread *td;
+ struct fsync_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct file *fp;
+ vm_object_t obj;
+ int error;
+
+ GIANT_REQUIRED;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (VOP_GETVOBJECT(vp, &obj) == 0) {
+ vm_object_page_clean(obj, 0, 0, 0);
+ }
+ error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td);
+#ifdef SOFTUPDATES
+ if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
+ error = softdep_fsync(vp);
+#endif
+
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories,
+ * or both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+/* ARGSUSED */
+int
+rename(td, uap)
+ struct thread *td;
+ register struct rename_args /* {
+ syscallarg(char *) from;
+ syscallarg(char *) to;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+ bwillwrite();
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+ SCARG(uap, from), td);
+ if ((error = namei(&fromnd)) != 0)
+ return (error);
+ fvp = fromnd.ni_vp;
+ if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, to), td);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&tond)) != 0) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ if (fvp == tdvp)
+ error = EINVAL;
+ /*
+ * If source is the same as the destination (that is the
+ * same inode number with the same name in the same directory),
+ * then there is nothing to do.
+ */
+ if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+ fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+ !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+ fromnd.ni_cnd.cn_namelen))
+ error = -1;
+out:
+ if (!error) {
+ VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE);
+ if (fromnd.ni_dvp != tdvp) {
+ VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ }
+ if (tvp) {
+ VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE);
+ }
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ } else {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename");
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ if (error == -1)
+ return (0);
+ return (error);
+}
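
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * of the "error = -1" convention above, which turns renaming a file onto
 * itself (same vnode, same name, same directory) into a successful no-op.
 * The helper name is hypothetical.
 */
#include <stdio.h>
#include <err.h>

static void
rename_in_place(const char *path)
{
	/* Renaming a path onto itself succeeds without touching the file. */
	if (rename(path, path) != 0)
		err(1, "rename %s", path);
}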
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkdir(td, uap)
+ struct thread *td;
+ register struct mkdir_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+
+ return (vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td));
+}
+
+int
+vn_mkdir(path, mode, segflg, td)
+ char *path;
+ int mode;
+ enum uio_seg segflg;
+ struct thread *td;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ /*
+ * XXX namei called with LOCKPARENT but not LOCKLEAF has
+ * the strange behaviour of leaving the vnode unlocked
+ * if the target is the same vnode as the parent.
+ */
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (!error)
+ vput(nd.ni_vp);
+ vn_finished_write(mp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir");
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+rmdir(td, uap)
+ struct thread *td;
+ struct rmdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_flag & VROOT) {
+ error = EBUSY;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vput(vp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ vn_finished_write(mp);
+out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir");
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(td, uap)
+ struct thread *td;
+ register struct ogetdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ if (SCARG(uap, count) > 64 * 1024)
+ return (EINVAL);
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ loff = auio.uio_offset = fp->f_offset;
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = SCARG(uap, count);
+ MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = SCARG(uap, count) - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ FREE(dirbuf, M_TEMP);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (SCARG(uap, count) == auio.uio_resid) {
+ if (union_dircheckp) {
+ error = union_dircheckp(td, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+ if ((vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ }
+ error = copyout(&loff, SCARG(uap, basep), sizeof(long));
+ fdrop(fp, td);
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+getdirentries(td, uap)
+ struct thread *td;
+ register struct getdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ long loff;
+ int error, eofflag;
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ loff = auio.uio_offset = fp->f_offset;
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ VOP_UNLOCK(vp, 0, td);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ if (SCARG(uap, count) == auio.uio_resid) {
+ if (union_dircheckp) {
+ error = union_dircheckp(td, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+ if ((vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ }
+ if (SCARG(uap, basep) != NULL) {
+ error = copyout(&loff, SCARG(uap, basep), sizeof(long));
+ }
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ fdrop(fp, td);
+ return (error);
+}
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+getdents(td, uap)
+ struct thread *td;
+ register struct getdents_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return (getdirentries(td, &ap));
+}
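
/*
 * Editorial note: illustrative userland sketch (not part of this change)
 * consuming getdirentries(2) output the same way the kernel loops above
 * do -- walking the buffer entry by entry via d_reclen and skipping
 * deleted entries (d_fileno == 0).  The helper name is hypothetical.
 */
#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
list_dir(const char *path)
{
	char buf[4096], *bp;
	struct dirent *dp;
	long base;
	int fd, nbytes;

	if ((fd = open(path, O_RDONLY)) == -1)
		return;
	while ((nbytes = getdirentries(fd, buf, sizeof(buf), &base)) > 0) {
		for (bp = buf; bp < buf + nbytes; ) {
			dp = (struct dirent *)bp;
			if (dp->d_reclen == 0)
				break;			/* corrupt entry */
			if (dp->d_fileno != 0)		/* skip deleted */
				printf("%s\n", dp->d_name);
			bp += dp->d_reclen;
		}
	}
	close(fd);
}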
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ *
+ * MP SAFE
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+umask(td, uap)
+ struct thread *td;
+ struct umask_args /* {
+ syscallarg(int) newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ FILEDESC_LOCK(td->td_proc->p_fd);
+ fdp = td->td_proc->p_fd;
+ td->td_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS;
+ FILEDESC_UNLOCK(td->td_proc->p_fd);
+ return (0);
+}
+
+/*
+ * Void all references to the file by ripping the underlying filesystem
+ * away from the vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+revoke(td, uap)
+ struct thread *td;
+ register struct revoke_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path),
+ td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VCHR) {
+ vput(vp);
+ return (EINVAL);
+ }
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, td);
+ if (td->td_ucred->cr_uid != vattr.va_uid) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ goto out;
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto out;
+ if (vcount(vp) > 1)
+ VOP_REVOKE(vp, REVOKEALL);
+ vn_finished_write(mp);
+out:
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ * A reference on the file entry is held upon returning.
+ */
+int
+getvnode(fdp, fd, fpp)
+ struct filedesc *fdp;
+ int fd;
+ struct file **fpp;
+{
+ int error;
+ struct file *fp;
+
+ fp = NULL;
+ if (fdp == NULL)
+ error = EBADF;
+ else {
+ FILEDESC_LOCK(fdp);
+ if ((u_int)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ error = EBADF;
+ else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
+ fp = NULL;
+ error = EINVAL;
+ } else {
+ fhold(fp);
+ error = 0;
+ }
+ FILEDESC_UNLOCK(fdp);
+ }
+ *fpp = fp;
+ return (error);
+}
+/*
+ * Get (NFS) file handle
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+getfh(td, uap)
+ struct thread *td;
+ register struct getfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VFS_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+/*
+ * syscall for the rpc.lockd to use to translate a NFS file handle into
+ * an open descriptor.
+ *
+ * warning: do not remove the suser() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhopen_args {
+ const struct fhandle *u_fhp;
+ int flags;
+};
+#endif
+int
+fhopen(td, uap)
+ struct thread *td;
+ struct fhopen_args /* {
+ syscallarg(const struct fhandle *) u_fhp;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct proc *p = td->td_proc;
+ struct mount *mp;
+ struct vnode *vp;
+ struct fhandle fhp;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ struct flock lf;
+ struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ int fmode, mode, error, type;
+ struct file *nfp;
+ int indx;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ fmode = FFLAGS(SCARG(uap, flags));
+ /* why not allow a non-read/write open for our lockd? */
+ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
+ return (EINVAL);
+ error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp));
+ if (error)
+ return(error);
+ /* find the mount point */
+ mp = vfs_getvfs(&fhp.fh_fsid);
+ if (mp == NULL)
+ return (ESTALE);
+ /* now give me my vnode, it gets returned to me locked */
+ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
+ if (error)
+ return (error);
+ /*
+	 * From now on we have to make sure not to forget about the vnode.
+	 * Any error that causes an abort must vput(vp); just set
+	 * error = err and 'goto bad;'.
+ */
+
+ /*
+ * from vn_open
+ */
+ if (vp->v_type == VLNK) {
+ error = EMLINK;
+ goto bad;
+ }
+ if (vp->v_type == VSOCK) {
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+ mode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto bad;
+ }
+ error = vn_writechk(vp);
+ if (error)
+ goto bad;
+ mode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ mode |= VREAD;
+ if (mode) {
+ error = VOP_ACCESS(vp, mode, td->td_ucred, td);
+ if (error)
+ goto bad;
+ }
+ if (fmode & O_TRUNC) {
+ VOP_UNLOCK(vp, 0, td); /* XXX */
+ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */
+ VATTR_NULL(vap);
+ vap->va_size = 0;
+ error = VOP_SETATTR(vp, vap, td->td_ucred, td);
+ vn_finished_write(mp);
+ if (error)
+ goto bad;
+ }
+ error = VOP_OPEN(vp, fmode, td->td_ucred, td);
+ if (error)
+ goto bad;
+ /*
+ * Make sure that a VM object is created for VMIO support.
+ */
+ if (vn_canvmio(vp) == TRUE) {
+ if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0)
+ goto bad;
+ }
+ if (fmode & FWRITE)
+ vp->v_writecount++;
+
+ /*
+ * end of vn_open code
+ */
+
+ if ((error = falloc(td, &nfp, &indx)) != 0) {
+ if (fmode & FWRITE)
+ vp->v_writecount--;
+ goto bad;
+ }
+ fp = nfp;
+
+ /*
+ * Hold an extra reference to avoid having fp ripped out
+ * from under us while we block in the lock op
+ */
+ fhold(fp);
+ nfp->f_data = vp;
+ nfp->f_flag = fmode & FMASK;
+ nfp->f_ops = &vnops;
+ nfp->f_type = DTYPE_VNODE;
+ if (fmode & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (fmode & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((fmode & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ VOP_UNLOCK(vp, 0, td);
+ if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ type)) != 0) {
+ /*
+ * The lock request failed. Normally close the
+ * descriptor but handle the case where someone might
+ * have dup()d or close()d it when we weren't looking.
+ */
+ FILEDESC_LOCK(fdp);
+ if (fdp->fd_ofiles[indx] == fp) {
+ fdp->fd_ofiles[indx] = NULL;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_UNLOCK(fdp);
+ /*
+ * release our private reference
+ */
+ fdrop(fp, td);
+ return(error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ fp->f_flag |= FHASLOCK;
+ }
+ if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0))
+ vfs_object_create(vp, td, td->td_ucred);
+
+ VOP_UNLOCK(vp, 0, td);
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+
+bad:
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Stat an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstat_args {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+};
+#endif
+int
+fhstat(td, uap)
+ struct thread *td;
+ register struct fhstat_args /* {
+ syscallarg(struct fhandle *) u_fhp;
+ syscallarg(struct stat *) sb;
+ } */ *uap;
+{
+ struct stat sb;
+ fhandle_t fh;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t));
+ if (error)
+ return (error);
+
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
+ return (error);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout(&sb, SCARG(uap, sb), sizeof(sb));
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+};
+#endif
+int
+fhstatfs(td, uap)
+ struct thread *td;
+ struct fhstatfs_args /* {
+ syscallarg(struct fhandle) *u_fhp;
+ syscallarg(struct statfs) *buf;
+ } */ *uap;
+{
+ struct statfs *sp;
+ struct mount *mp;
+ struct vnode *vp;
+ struct statfs sb;
+ fhandle_t fh;
+ int error;
+
+ /*
+ * Must be super user
+ */
+ error = suser(td);
+ if (error)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0)
+ return (error);
+
+ if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
+ return (error);
+ mp = vp->v_mount;
+ sp = &mp->mnt_stat;
+ vput(vp);
+ if ((error = VFS_STATFS(mp, sp, td)) != 0)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (suser(td)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout(sp, SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Syscall to push extended attribute configuration information into the
+ * VFS. Accepts a path, which it converts to a mountpoint, as well as
+ * a command (int cmd), an attribute name, and miscellaneous data.  For now,
+ * the attribute name is left in userspace for consumption by the VFS_op.
+ * It will probably be changed to be copied into kernel space by the
+ * syscall in the future, once the various consumers of the attribute
+ * code have raised any remaining issues.
+ *
+ * Currently this is used only by UFS Extended Attributes.
+ */
+int
+extattrctl(td, uap)
+ struct thread *td;
+ struct extattrctl_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) cmd;
+ syscallarg(const char *) filename;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * uap->filename is not always defined. If it is, grab a vnode lock,
+ * which VFS_EXTATTRCTL() will later release.
+ */
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ uap->filename, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ return (error);
+ }
+ mp = nd.ni_vp->v_mount;
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, 0);
+ if (error) {
+ if (filename_vp != NULL)
+ vput(filename_vp);
+ return (error);
+ }
+
+ if (uap->attrname != NULL) {
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp,
+ uap->attrnamespace, attrname, td);
+ } else {
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp,
+ uap->attrnamespace, NULL, td);
+ }
+
+ vn_finished_write(mp_writable);
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd,
+ * filename_vp, so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+
+ return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
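+/*
+ * Set a named extended attribute on a file or directory given a pathname.
+ */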
+int
+extattr_set_file(td, uap)
+ struct thread *td;
+ struct extattr_set_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
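+/*
+ * Set a named extended attribute on the file referenced by a descriptor.
+ */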
+int
+extattr_set_fd(td, uap)
+ struct thread *td;
+ struct extattr_set_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
+ return (error);
+
+ error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the
+ * maximum read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+ if (nbytes > INT_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0, td);
+ return (error);
+}
+
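+/*
+ * Get a named extended attribute on a file or directory given a pathname.
+ */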
+int
+extattr_get_file(td, uap)
+ struct thread *td;
+ struct extattr_get_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
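+/*
+ * Get a named extended attribute on the file referenced by a descriptor.
+ */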
+int
+extattr_get_fd(td, uap)
+ struct thread *td;
+ struct extattr_get_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ syscallarg(void *) data;
+ syscallarg(size_t) nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0)
+ return (error);
+
+ error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ *		kernelspace string pointer "attrname", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred,
+ td);
+
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
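+/*
+ * Delete a named extended attribute on a file or directory given a pathname.
+ */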
+int
+extattr_delete_file(td, uap)
+ struct thread *td;
+ struct extattr_delete_file_args /* {
+ syscallarg(const char *) path;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+
+ vrele(nd.ni_vp);
+ return(error);
+}
+
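+/*
+ * Delete a named extended attribute on the file referenced by a descriptor.
+ */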
+int
+extattr_delete_fd(td, uap)
+ struct thread *td;
+ struct extattr_delete_fd_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) attrnamespace;
+ syscallarg(const char *) attrname;
+ } */ *uap;
+{
+ struct file *fp;
+ struct vnode *vp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
+ return (error);
+	vp = (struct vnode *)fp->f_data;
+
+	error = extattr_delete_vp(vp, uap->attrnamespace, attrname, td);
+
+ fdrop(fp, td);
+ return (error);
+}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
new file mode 100644
index 0000000..77568c2
--- /dev/null
+++ b/sys/kern/vfs_vnops.c
@@ -0,0 +1,1056 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/filio.h>
+#include <sys/sx.h>
+#include <sys/ttycom.h>
+#include <sys/conf.h>
+#include <sys/syslog.h>
+
+#include <machine/limits.h>
+
+static int vn_closefile(struct file *fp, struct thread *td);
+static int vn_ioctl(struct file *fp, u_long com, caddr_t data,
+ struct thread *td);
+static int vn_read(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+static int vn_poll(struct file *fp, int events, struct ucred *cred,
+ struct thread *td);
+static int vn_kqfilter(struct file *fp, struct knote *kn);
+static int vn_statfile(struct file *fp, struct stat *sb, struct thread *td);
+static int vn_write(struct file *fp, struct uio *uio,
+ struct ucred *cred, int flags, struct thread *td);
+
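+/* File operations vector used for all vnode-backed files. */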
+struct fileops vnops = {
+ vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
+ vn_statfile, vn_closefile
+};
+
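+/*
+ * Open a file by pathname, authorizing with the credentials of the
+ * calling thread; wrapper around vn_open_cred().
+ */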
+int
+vn_open(ndp, flagp, cmode)
+ register struct nameidata *ndp;
+ int *flagp, cmode;
+{
+ struct thread *td = ndp->ni_cnd.cn_thread;
+
+ return (vn_open_cred(ndp, flagp, cmode, td->td_ucred));
+}
+
+/*
+ * Common code for vnode open operations.
+ * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
+ *
+ * Note that this does NOT free nameidata for the successful case,
+ * due to the NDINIT being done elsewhere.
+ */
+int
+vn_open_cred(ndp, flagp, cmode, cred)
+ register struct nameidata *ndp;
+ int *flagp, cmode;
+ struct ucred *cred;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct thread *td = ndp->ni_cnd.cn_thread;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ int mode, fmode, error;
+#ifdef LOOKUP_SHARED
+ int exclusive; /* The current intended lock state */
+
+ exclusive = 0;
+#endif
+
+restart:
+ fmode = *flagp;
+ if (fmode & O_CREAT) {
+ ndp->ni_cnd.cn_nameiop = CREATE;
+ ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
+ if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
+ ndp->ni_cnd.cn_flags |= FOLLOW;
+ bwillwrite();
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ if (ndp->ni_vp == NULL) {
+ VATTR_NULL(vap);
+ vap->va_type = VREG;
+ vap->va_mode = cmode;
+ if (fmode & O_EXCL)
+ vap->va_vaflags |= VA_EXCLUSIVE;
+ if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(ndp->ni_dvp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
+ error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+ &ndp->ni_cnd, vap);
+ vput(ndp->ni_dvp);
+ vn_finished_write(mp);
+ if (error) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
+ ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
+ fmode &= ~O_TRUNC;
+ vp = ndp->ni_vp;
+#ifdef LOOKUP_SHARED
+ exclusive = 1;
+#endif
+ } else {
+ if (ndp->ni_dvp == ndp->ni_vp)
+ vrele(ndp->ni_dvp);
+ else
+ vput(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ vp = ndp->ni_vp;
+ if (fmode & O_EXCL) {
+ error = EEXIST;
+ goto bad;
+ }
+ fmode &= ~O_CREAT;
+ }
+ } else {
+ ndp->ni_cnd.cn_nameiop = LOOKUP;
+#ifdef LOOKUP_SHARED
+ ndp->ni_cnd.cn_flags =
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
+ LOCKSHARED | LOCKLEAF;
+#else
+ ndp->ni_cnd.cn_flags =
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
+#endif
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ vp = ndp->ni_vp;
+ }
+ if (vp->v_type == VLNK) {
+ error = EMLINK;
+ goto bad;
+ }
+ if (vp->v_type == VSOCK) {
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+ if ((fmode & O_CREAT) == 0) {
+ mode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto bad;
+ }
+ error = vn_writechk(vp);
+ if (error)
+ goto bad;
+ mode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ mode |= VREAD;
+ if (mode) {
+ error = VOP_ACCESS(vp, mode, cred, td);
+ if (error)
+ goto bad;
+ }
+ }
+ if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
+ goto bad;
+ /*
+ * Make sure that a VM object is created for VMIO support.
+ */
+ if (vn_canvmio(vp) == TRUE) {
+#ifdef LOOKUP_SHARED
+ int flock;
+
+ if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
+ VOP_LOCK(vp, LK_UPGRADE, td);
+ /*
+		 * In cases where the object is marked as dead, object_create
+ * will unlock and relock exclusive. It is safe to call in
+ * here with a shared lock because we only examine fields that
+ * the shared lock guarantees will be stable. In the UPGRADE
+ * case it is not likely that anyone has used this vnode yet
+ * so there will be no contention. The logic after this call
+ * restores the requested locking state.
+ */
+#endif
+ if ((error = vfs_object_create(vp, td, cred)) != 0) {
+ VOP_UNLOCK(vp, 0, td);
+ VOP_CLOSE(vp, fmode, cred, td);
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vrele(vp);
+ *flagp = fmode;
+ return (error);
+ }
+#ifdef LOOKUP_SHARED
+ flock = VOP_ISLOCKED(vp, td);
+ if (!exclusive && flock == LK_EXCLUSIVE)
+ VOP_LOCK(vp, LK_DOWNGRADE, td);
+#endif
+ }
+
+ if (fmode & FWRITE)
+ vp->v_writecount++;
+ *flagp = fmode;
+ return (0);
+bad:
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(vp);
+ *flagp = fmode;
+ return (error);
+}
+
+/*
+ * Check for write permissions on the specified vnode.
+ * Prototype text segments cannot be written.
+ */
+int
+vn_writechk(vp)
+ register struct vnode *vp;
+{
+
+ /*
+ * If there's shared text associated with
+ * the vnode, try to free it up once. If
+ * we fail, we can't allow writing.
+ */
+ if (vp->v_flag & VTEXT)
+ return (ETXTBSY);
+ return (0);
+}
+
+/*
+ * Vnode close call
+ */
+int
+vn_close(vp, flags, cred, td)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct thread *td;
+{
+ int error;
+
+ if (flags & FWRITE)
+ vp->v_writecount--;
+ error = VOP_CLOSE(vp, flags, cred, td);
+ /*
+ * XXX - In certain instances VOP_CLOSE has to do the vrele
+ * itself. If the vrele has been done, it will return EAGAIN
+ * to indicate that the vrele should not be done again. When
+ * this happens, we just return success. The correct thing to
+ * do would be to have all VOP_CLOSE instances do the vrele.
+ */
+ if (error == EAGAIN)
+ return (0);
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Sequential heuristic - detect sequential operation
+ */
+static __inline
+int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+
+ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+ uio->uio_offset == fp->f_nextoff) {
+ /*
+ * XXX we assume that the filesystem block size is
+ * the default. Not true, but still gives us a pretty
+ * good indicator of how sequential the read operations
+ * are.
+ */
+ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+ if (fp->f_seqcount >= 127)
+ fp->f_seqcount = 127;
+ return(fp->f_seqcount << 16);
+ }
+
+ /*
+ * Not sequential, quick draw-down of seqcount
+ */
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ return(0);
+}
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it.
+ */
+int
+vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
+ enum uio_rw rw;
+ struct vnode *vp;
+ caddr_t base;
+ int len;
+ off_t offset;
+ enum uio_seg segflg;
+ int ioflg;
+ struct ucred *cred;
+ int *aresid;
+ struct thread *td;
+{
+ struct uio auio;
+ struct iovec aiov;
+ struct mount *mp;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ mp = NULL;
+ if (rw == UIO_WRITE) {
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+ != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ } else {
+ vn_lock(vp, LK_SHARED | LK_RETRY, td);
+ }
+
+ }
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_td = td;
+ if (rw == UIO_READ) {
+ error = VOP_READ(vp, &auio, ioflg, cred);
+ } else {
+ error = VOP_WRITE(vp, &auio, ioflg, cred);
+ }
+ if (aresid)
+ *aresid = auio.uio_resid;
+ else
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if (rw == UIO_WRITE)
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0, td);
+ }
+ return (error);
+}
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it. The I/O
+ * request is split up into smaller chunks and we try to avoid saturating
+ * the buffer cache while potentially holding a vnode locked, so we
+ * check bwillwrite() before calling vn_rdwr(). We also call uio_yield()
+ * to give other processes a chance to lock the vnode (either other processes
+ * core'ing the same binary, or unrelated processes scanning the directory).
+ */
+int
+vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
+ enum uio_rw rw;
+ struct vnode *vp;
+ caddr_t base;
+ int len;
+ off_t offset;
+ enum uio_seg segflg;
+ int ioflg;
+ struct ucred *cred;
+ int *aresid;
+ struct thread *td;
+{
+ int error = 0;
+
+ do {
+ int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
+
+ if (rw != UIO_READ && vp->v_type == VREG)
+ bwillwrite();
+ error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+ ioflg, cred, aresid, td);
+ len -= chunk; /* aresid calc already includes length */
+ if (error)
+ break;
+ offset += chunk;
+ base += chunk;
+ uio_yield();
+ } while (len);
+ if (aresid)
+ *aresid += len;
+ return (error);
+}
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ struct vnode *vp;
+ int error, ioflag;
+
+ mtx_lock(&Giant);
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ vp = (struct vnode *)fp->f_data;
+ ioflag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ VOP_LEASE(vp, td, cred, LEASE_READ);
+ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
+ if ((flags & FOF_OFFSET) == 0)
+ uio->uio_offset = fp->f_offset;
+
+ ioflag |= sequential_heuristic(uio, fp);
+
+ error = VOP_READ(vp, uio, ioflag, cred);
+ if ((flags & FOF_OFFSET) == 0)
+ fp->f_offset = uio->uio_offset;
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0, td);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(fp, uio, cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+ struct thread *td;
+ int flags;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ int error, ioflag;
+
+ mtx_lock(&Giant);
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ ioflag = IO_UNIT;
+ if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+ ioflag |= IO_APPEND;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ if ((fp->f_flag & O_FSYNC) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+ ioflag |= IO_SYNC;
+ mp = NULL;
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ VOP_LEASE(vp, td, cred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if ((flags & FOF_OFFSET) == 0)
+ uio->uio_offset = fp->f_offset;
+ ioflag |= sequential_heuristic(uio, fp);
+ error = VOP_WRITE(vp, uio, ioflag, cred);
+ if ((flags & FOF_OFFSET) == 0)
+ fp->f_offset = uio->uio_offset;
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * File table vnode stat routine.
+ */
+static int
+vn_statfile(fp, sb, td)
+ struct file *fp;
+ struct stat *sb;
+ struct thread *td;
+{
+ struct vnode *vp = (struct vnode *)fp->f_data;
+ int error;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = vn_stat(vp, sb, td);
+ VOP_UNLOCK(vp, 0, td);
+
+ return (error);
+}
+
+/*
+ * Stat a vnode; implementation for the stat syscall
+ */
+int
+vn_stat(vp, sb, td)
+ struct vnode *vp;
+ register struct stat *sb;
+ struct thread *td;
+{
+ struct vattr vattr;
+ register struct vattr *vap;
+ int error;
+ u_short mode;
+
+ vap = &vattr;
+ error = VOP_GETATTR(vp, vap, td->td_ucred, td);
+ if (error)
+ return (error);
+
+ /*
+ * Zero the spare stat fields
+ */
+ bzero(sb, sizeof *sb);
+
+ /*
+ * Copy from vattr table
+ */
+ if (vap->va_fsid != VNOVAL)
+ sb->st_dev = vap->va_fsid;
+ else
+ sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
+ sb->st_ino = vap->va_fileid;
+ mode = vap->va_mode;
+ switch (vap->va_type) {
+ case VREG:
+ mode |= S_IFREG;
+ break;
+ case VDIR:
+ mode |= S_IFDIR;
+ break;
+ case VBLK:
+ mode |= S_IFBLK;
+ break;
+ case VCHR:
+ mode |= S_IFCHR;
+ break;
+ case VLNK:
+ mode |= S_IFLNK;
+ /* This is a cosmetic change, symlinks do not have a mode. */
+ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
+			mode &= ~ACCESSPERMS;	/* 0000 */
+		else
+			mode |= ACCESSPERMS;	/* 0777 */
+ break;
+ case VSOCK:
+ mode |= S_IFSOCK;
+ break;
+ case VFIFO:
+ mode |= S_IFIFO;
+ break;
+ default:
+ return (EBADF);
+	}
+ sb->st_mode = mode;
+ sb->st_nlink = vap->va_nlink;
+ sb->st_uid = vap->va_uid;
+ sb->st_gid = vap->va_gid;
+ sb->st_rdev = vap->va_rdev;
+ if (vap->va_size > OFF_MAX)
+ return (EOVERFLOW);
+ sb->st_size = vap->va_size;
+ sb->st_atimespec = vap->va_atime;
+ sb->st_mtimespec = vap->va_mtime;
+ sb->st_ctimespec = vap->va_ctime;
+ sb->st_createtimespec = vap->va_createtime;
+
+ /*
+ * According to www.opengroup.org, the meaning of st_blksize is
+ * "a filesystem-specific preferred I/O block size for this
+ * object. In some filesystem types, this may vary from file
+ * to file"
+ * Default to PAGE_SIZE after much discussion.
+ */
+
+ if (vap->va_type == VREG) {
+ sb->st_blksize = vap->va_blocksize;
+ } else if (vn_isdisk(vp, NULL)) {
+ sb->st_blksize = vp->v_rdev->si_bsize_best;
+ if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
+ sb->st_blksize = vp->v_rdev->si_bsize_phys;
+ if (sb->st_blksize < BLKDEV_IOSIZE)
+ sb->st_blksize = BLKDEV_IOSIZE;
+ } else {
+ sb->st_blksize = PAGE_SIZE;
+ }
+
+ sb->st_flags = vap->va_flags;
+ if (suser(td))
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+#if (S_BLKSIZE == 512)
+ /* Optimize this case */
+ sb->st_blocks = vap->va_bytes >> 9;
+#else
+ sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+#endif
+ return (0);
+}
+
+/*
+ * File table vnode ioctl routine.
+ */
+static int
+vn_ioctl(fp, com, data, td)
+ struct file *fp;
+ u_long com;
+ caddr_t data;
+ struct thread *td;
+{
+ register struct vnode *vp = ((struct vnode *)fp->f_data);
+ struct vnode *vpold;
+ struct vattr vattr;
+ int error;
+
+ switch (vp->v_type) {
+
+ case VREG:
+ case VDIR:
+ if (com == FIONREAD) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ if (error)
+ return (error);
+ *(int *)data = vattr.va_size - fp->f_offset;
+ return (0);
+ }
+ if (com == FIONBIO || com == FIOASYNC) /* XXX */
+ return (0); /* XXX */
+ /* fall into ... */
+
+ default:
+#if 0
+ return (ENOTTY);
+#endif
+ case VFIFO:
+ case VCHR:
+ case VBLK:
+ if (com == FIODTYPE) {
+ if (vp->v_type != VCHR && vp->v_type != VBLK)
+ return (ENOTTY);
+ *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
+ return (0);
+ }
+ error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_ucred, td);
+ if (error == 0 && com == TIOCSCTTY) {
+
+ /* Do nothing if reassigning same control tty */
+ sx_slock(&proctree_lock);
+ if (td->td_proc->p_session->s_ttyvp == vp) {
+ sx_sunlock(&proctree_lock);
+ return (0);
+ }
+
+ vpold = td->td_proc->p_session->s_ttyvp;
+ VREF(vp);
+ SESS_LOCK(td->td_proc->p_session);
+ td->td_proc->p_session->s_ttyvp = vp;
+ SESS_UNLOCK(td->td_proc->p_session);
+
+ sx_sunlock(&proctree_lock);
+
+ /* Get rid of reference to old control tty */
+ if (vpold)
+ vrele(vpold);
+ }
+ return (error);
+ }
+}
+
+/*
+ * File table vnode poll routine.
+ */
+static int
+vn_poll(fp, events, cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct thread *td;
+{
+
+ return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
+}
+
+/*
+ * Check that the vnode is still valid, and if so
+ * acquire the requested lock.
+ */
+int
+#ifndef DEBUG_LOCKS
+vn_lock(vp, flags, td)
+#else
+debug_vn_lock(vp, flags, td, filename, line)
+#endif
+ struct vnode *vp;
+ int flags;
+ struct thread *td;
+#ifdef DEBUG_LOCKS
+ const char *filename;
+ int line;
+#endif
+{
+ int error;
+
+ do {
+ if ((flags & LK_INTERLOCK) == 0)
+ mtx_lock(&vp->v_interlock);
+ if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) {
+ vp->v_flag |= VXWANT;
+ msleep(vp, &vp->v_interlock, PINOD | PDROP,
+ "vn_lock", 0);
+ error = ENOENT;
+ } else {
+#if 0
+ /* this can now occur in normal operation */
+ if (vp->v_vxproc != NULL)
+ log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n");
+#endif
+#ifdef DEBUG_LOCKS
+ vp->filename = filename;
+ vp->line = line;
+#endif
+ error = VOP_LOCK(vp,
+ flags | LK_NOPAUSE | LK_INTERLOCK, td);
+ if (error == 0)
+ return (error);
+ }
+ flags &= ~LK_INTERLOCK;
+ } while (flags & LK_RETRY);
+ return (error);
+}
+
+/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+
+ fp->f_ops = &badfileops;
+ return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
+ fp->f_cred, td));
+}
+
+/*
+ * Prepare to start a filesystem write operation. If the operation is
+ * permitted, then we bump the count of operations in progress and
+ * proceed. If a suspend request is in progress, we wait until the
+ * suspension is over, and then proceed.
+ */
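+/*
+ * A typical caller (simplified from extattr_set_vp() in vfs_syscalls.c)
+ * brackets the modification as follows:
+ *
+ *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ *		return (error);
+ *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ *	... modify the file ...
+ *	VOP_UNLOCK(vp, 0, td);
+ *	vn_finished_write(mp);
+ */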
+int
+vn_start_write(vp, mpp, flags)
+ struct vnode *vp;
+ struct mount **mpp;
+ int flags;
+{
+ struct mount *mp;
+ int error;
+
+ /*
+	 * If a vnode is provided, get and return the mount point to
+	 * which it will write.
+ */
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ if ((mp = *mpp) == NULL)
+ return (0);
+ /*
+ * Check on status of suspension.
+ */
+ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ if (flags & V_NOWAIT)
+ return (EWOULDBLOCK);
+ error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
+ "suspfs", 0);
+ if (error)
+ return (error);
+ }
+ if (flags & V_XSLEEP)
+ return (0);
+ mp->mnt_writeopcount++;
+ return (0);
+}
+
+/*
+ * Secondary suspension. Used by operations such as vop_inactive
+ * routines that are needed by the higher level functions. These
+ * are allowed to proceed until all the higher level functions have
+ * completed (indicated by mnt_writeopcount dropping to zero). At that
+ * time, these operations are halted until the suspension is over.
+ */
+int
+vn_write_suspend_wait(vp, mp, flags)
+ struct vnode *vp;
+ struct mount *mp;
+ int flags;
+{
+ int error;
+
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ /*
+ * If we are not suspended or have not yet reached suspended
+ * mode, then let the operation proceed.
+ */
+ if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
+ return (0);
+ if (flags & V_NOWAIT)
+ return (EWOULDBLOCK);
+ /*
+ * Wait for the suspension to finish.
+ */
+ return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
+ "suspfs", 0));
+}
+
+/*
+ * Filesystem write operation has completed. If we are suspending and this
+ * operation is the last one, notify the suspender that the suspension is
+ * now in effect.
+ */
+void
+vn_finished_write(mp)
+ struct mount *mp;
+{
+
+ if (mp == NULL)
+ return;
+ mp->mnt_writeopcount--;
+ if (mp->mnt_writeopcount < 0)
+ panic("vn_finished_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_writeopcount <= 0)
+ wakeup(&mp->mnt_writeopcount);
+}
+
+/*
+ * Request a filesystem to suspend write operations.
+ */
+void
+vfs_write_suspend(mp)
+ struct mount *mp;
+{
+ struct thread *td = curthread;
+
+ if (mp->mnt_kern_flag & MNTK_SUSPEND)
+ return;
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ if (mp->mnt_writeopcount > 0)
+ (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
+ VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td);
+ mp->mnt_kern_flag |= MNTK_SUSPENDED;
+}
+
+/*
+ * Request a filesystem to resume write operations.
+ */
+void
+vfs_write_resume(mp)
+ struct mount *mp;
+{
+
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
+ return;
+ mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
+ wakeup(&mp->mnt_writeopcount);
+ wakeup(&mp->mnt_flag);
+}
+
+/*
+ * Implement kqueues for files by translating the request into the
+ * corresponding vnode operation.
+ */
+static int
+vn_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn));
+}
+
+/*
+ * Simplified in-kernel wrapper calls for extended attribute access.
+ * Both calls pass in a NULL credential, authorizing as "kernel" access.
+ * Set IO_NODELOCKED in ioflg if the vnode is already locked.
+ */
+int
+vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int *buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ int error;
+
+ iov.iov_len = *buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = *buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+ /* authorize attribute retrieval as kernel */
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
+ td);
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ VOP_UNLOCK(vp, 0, td);
+
+ if (error == 0) {
+ *buflen = *buflen - auio.uio_resid;
+ }
+
+ return (error);
+}
+
+/*
+ * XXX failure mode if partially written?
+ */
+int
+vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct mount *mp;
+ int error;
+
+ iov.iov_len = buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+
+ /* authorize attribute setting as kernel */
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0, td);
+ }
+
+ return (error);
+}
+
+int
+vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+
+ /* authorize attribute removal as kernel */
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0, td);
+ }
+
+ return (error);
+}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
new file mode 100644
index 0000000..cdeb5e5
--- /dev/null
+++ b/sys/kern/vnode_if.src
@@ -0,0 +1,556 @@
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+# $FreeBSD$
+#
+
+#
+# Above each of the vop descriptors is a specification of the locking
+# protocol used by each vop call. The first column is the name of
+# the variable, the remaining three columns are in, out and error
+# respectively. The "in" column defines the lock state on input,
+# the "out" column defines the state on successful return, and the
+# "error" column defines the locking state on error exit.
+#
+# The locking value can take the following values:
+# L: locked; not converted to type of lock.
+# A: any lock type.
+# S: locked with shared lock.
+# E: locked with exclusive lock for this process.
+# O: locked with exclusive lock for other process.
+# U: unlocked.
+#	-: not applicable.  vnode does not yet (or no longer) exist.
+# =: the same on input and output, may be either L or U.
+# X: locked if not nil.
+#
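+#
+# For example, the "create vpp" line below ("- L -") reads: the vnode
+# returned through vpp does not exist on entry, is locked on successful
+# return, and is not applicable on error exit.
+#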
+
+#
+#% islocked vp = = =
+#
+vop_islocked {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+#
+#% lookup dvp L ? ?
+#% lookup vpp - L -
+#
+# XXX - the lookup locking protocol defies simple description and depends
+# on the flags and operation fields in the (cnp) structure. Note
+# especially that *vpp may equal dvp and both may be locked.
+#
+vop_lookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+#
+#% cachedlookup dvp L ? ?
+#% cachedlookup vpp - L -
+#
+# This must be an exact copy of lookup. See kern/vfs_cache.c for details.
+#
+vop_cachedlookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+#
+#% create dvp L L L
+#% create vpp - L -
+#
+vop_create {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% whiteout dvp L L L
+#
+vop_whiteout {
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int flags;
+};
+
+#
+#% mknod dvp L L L
+#% mknod vpp - L -
+#
+vop_mknod {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% open vp L L L
+#
+vop_open {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% close vp U U U
+#
+vop_close {
+ IN struct vnode *vp;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% access vp L L L
+#
+vop_access {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% getattr vp = = =
+#
+# XXX: This should be A A A
+#
+vop_getattr {
+ IN struct vnode *vp;
+ OUT struct vattr *vap;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% setattr vp L L L
+#
+vop_setattr {
+ IN struct vnode *vp;
+ IN struct vattr *vap;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% read vp L L L
+#
+vop_read {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+#
+#% write vp L L L
+#
+vop_write {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+#
+#% lease vp = = =
+#
+vop_lease {
+ IN struct vnode *vp;
+ IN struct thread *td;
+ IN struct ucred *cred;
+ IN int flag;
+};
+
+#
+#% ioctl vp U U U
+#
+vop_ioctl {
+ IN struct vnode *vp;
+ IN u_long command;
+ IN caddr_t data;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% poll vp U U U
+#
+vop_poll {
+ IN struct vnode *vp;
+ IN int events;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% kqfilter vp U U U
+#
+vop_kqfilter {
+ IN struct vnode *vp;
+ IN struct knote *kn;
+};
+
+#
+#% revoke vp U U U
+#
+vop_revoke {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+#
+#% fsync vp L L L
+#
+vop_fsync {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN int waitfor;
+ IN struct thread *td;
+};
+
+#
+#% remove dvp L L L
+#% remove vp L L L
+#
+vop_remove {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% link tdvp L L L
+#% link vp U U U
+#
+vop_link {
+ IN struct vnode *tdvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% rename fdvp U U U
+#% rename fvp U U U
+#% rename tdvp L U U
+#% rename tvp X U U
+#
+vop_rename {
+ IN WILLRELE struct vnode *fdvp;
+ IN WILLRELE struct vnode *fvp;
+ IN struct componentname *fcnp;
+ IN WILLRELE struct vnode *tdvp;
+ IN WILLRELE struct vnode *tvp;
+ IN struct componentname *tcnp;
+};
+
+#
+#% mkdir dvp L L L
+#% mkdir vpp - L -
+#
+vop_mkdir {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% rmdir dvp L L L
+#% rmdir vp L L L
+#
+vop_rmdir {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% symlink dvp L L L
+#% symlink vpp - L -
+#
+vop_symlink {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+ IN char *target;
+};
+
+#
+#% readdir vp L L L
+#
+vop_readdir {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ INOUT int *eofflag;
+ OUT int *ncookies;
+ INOUT u_long **cookies;
+};
+
+#
+#% readlink vp L L L
+#
+vop_readlink {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+};
+
+#
+#% inactive vp L U U
+#
+vop_inactive {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+#
+#% reclaim vp U U U
+#
+vop_reclaim {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+#
+#% lock vp ? ? ?
+#
+vop_lock {
+ IN struct vnode *vp;
+ IN int flags;
+ IN struct thread *td;
+};
+
+#
+#% unlock vp L U L
+#
+vop_unlock {
+ IN struct vnode *vp;
+ IN int flags;
+ IN struct thread *td;
+};
+
+#
+#% bmap vp L L L
+#% bmap vpp - U -
+#
+vop_bmap {
+ IN struct vnode *vp;
+ IN daddr_t bn;
+ OUT struct vnode **vpp;
+ IN daddr_t *bnp;
+ OUT int *runp;
+ OUT int *runb;
+};
+
+#
+#% strategy vp L L L
+#
+vop_strategy {
+ IN struct vnode *vp;
+ IN struct buf *bp;
+};
+
+#
+#% getwritemount vp = = =
+#
+vop_getwritemount {
+ IN struct vnode *vp;
+ OUT struct mount **mpp;
+};
+
+#
+#% print vp = = =
+#
+vop_print {
+ IN struct vnode *vp;
+};
+
+#
+#% pathconf vp L L L
+#
+vop_pathconf {
+ IN struct vnode *vp;
+ IN int name;
+ OUT register_t *retval;
+};
+
+#
+#% advlock vp U U U
+#
+vop_advlock {
+ IN struct vnode *vp;
+ IN caddr_t id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+};
+
+#
+#% reallocblks vp L L L
+#
+vop_reallocblks {
+ IN struct vnode *vp;
+ IN struct cluster_save *buflist;
+};
+
+#
+#% getpages vp L L L
+#
+vop_getpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int reqpage;
+ IN vm_ooffset_t offset;
+};
+
+#
+#% putpages vp L L L
+#
+vop_putpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int sync;
+ IN int *rtvals;
+ IN vm_ooffset_t offset;
+};
+
+#
+#% freeblks vp - - -
+#
+# This call is used by the filesystem to release blocks back to
+# the device driver.  This is useful if the driver has lengthy
+# erase handling or similar.
+#
+
+vop_freeblks {
+ IN struct vnode *vp;
+ IN daddr_t addr;
+ IN daddr_t length;
+};
+
+#
+#% getacl vp L L L
+#
+vop_getacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ OUT struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% setacl vp L L L
+#
+vop_setacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% aclcheck vp = = =
+#
+vop_aclcheck {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% getextattr vp L L L
+#
+vop_getextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ OUT size_t *size;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% setextattr vp L L L
+#
+vop_setextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% createvobject vp L L L
+#
+vop_createvobject {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+#
+#% destroyvobject vp L L L
+#
+vop_destroyvobject {
+ IN struct vnode *vp;
+};
+
+#
+#% getvobject vp L L L
+#
+vop_getvobject {
+ IN struct vnode *vp;
+ OUT struct vm_object **objpp;
+};