Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc | 19
-rw-r--r--  sys/kern/Makefile | 53
-rw-r--r--  sys/kern/bus_if.m | 141
-rw-r--r--  sys/kern/device_if.m | 83
-rw-r--r--  sys/kern/imgact_aout.c | 304
-rw-r--r--  sys/kern/imgact_elf.c | 992
-rw-r--r--  sys/kern/imgact_gzip.c | 378
-rw-r--r--  sys/kern/imgact_shell.c | 138
-rw-r--r--  sys/kern/inflate.c | 1078
-rw-r--r--  sys/kern/init_main.c | 698
-rw-r--r--  sys/kern/init_sysent.c | 360
-rw-r--r--  sys/kern/kern_acct.c | 330
-rw-r--r--  sys/kern/kern_clock.c | 870
-rw-r--r--  sys/kern/kern_conf.c | 220
-rw-r--r--  sys/kern/kern_descrip.c | 1313
-rw-r--r--  sys/kern/kern_environment.c | 137
-rw-r--r--  sys/kern/kern_exec.c | 778
-rw-r--r--  sys/kern/kern_exit.c | 647
-rw-r--r--  sys/kern/kern_fork.c | 546
-rw-r--r--  sys/kern/kern_intr.c | 535
-rw-r--r--  sys/kern/kern_ktrace.c | 529
-rw-r--r--  sys/kern/kern_linker.c | 1016
-rw-r--r--  sys/kern/kern_lkm.c | 838
-rw-r--r--  sys/kern/kern_lock.c | 613
-rw-r--r--  sys/kern/kern_lockf.c | 806
-rw-r--r--  sys/kern/kern_malloc.c | 501
-rw-r--r--  sys/kern/kern_mib.c | 182
-rw-r--r--  sys/kern/kern_module.c | 330
-rw-r--r--  sys/kern/kern_ntptime.c | 856
-rw-r--r--  sys/kern/kern_physio.c | 215
-rw-r--r--  sys/kern/kern_proc.c | 608
-rw-r--r--  sys/kern/kern_prot.c | 898
-rw-r--r--  sys/kern/kern_random.c | 379
-rw-r--r--  sys/kern/kern_resource.c | 623
-rw-r--r--  sys/kern/kern_shutdown.c | 530
-rw-r--r--  sys/kern/kern_sig.c | 1455
-rw-r--r--  sys/kern/kern_subr.c | 391
-rw-r--r--  sys/kern/kern_synch.c | 923
-rw-r--r--  sys/kern/kern_syscalls.c | 109
-rw-r--r--  sys/kern/kern_sysctl.c | 1122
-rw-r--r--  sys/kern/kern_tc.c | 870
-rw-r--r--  sys/kern/kern_threads.c | 154
-rw-r--r--  sys/kern/kern_time.c | 644
-rw-r--r--  sys/kern/kern_timeout.c | 286
-rw-r--r--  sys/kern/kern_xxx.c | 256
-rw-r--r--  sys/kern/ksched.c | 262
-rw-r--r--  sys/kern/link_aout.c | 585
-rw-r--r--  sys/kern/link_elf.c | 981
-rw-r--r--  sys/kern/link_elf_obj.c | 981
-rw-r--r--  sys/kern/makedevops.pl | 394
-rw-r--r--  sys/kern/makedevops.sh | 232
-rw-r--r--  sys/kern/makesyscalls.sh | 394
-rw-r--r--  sys/kern/md5c.c | 342
-rw-r--r--  sys/kern/p1003_1b.c | 260
-rw-r--r--  sys/kern/posix4_mib.c | 94
-rw-r--r--  sys/kern/subr_autoconf.c | 420
-rw-r--r--  sys/kern/subr_blist.c | 928
-rw-r--r--  sys/kern/subr_bus.c | 1572
-rw-r--r--  sys/kern/subr_clist.c | 694
-rw-r--r--  sys/kern/subr_devstat.c | 248
-rw-r--r--  sys/kern/subr_disklabel.c | 410
-rw-r--r--  sys/kern/subr_diskmbr.c | 445
-rw-r--r--  sys/kern/subr_diskslice.c | 1192
-rw-r--r--  sys/kern/subr_dkbad.c | 160
-rw-r--r--  sys/kern/subr_log.c | 274
-rw-r--r--  sys/kern/subr_module.c | 267
-rw-r--r--  sys/kern/subr_param.c | 189
-rw-r--r--  sys/kern/subr_prf.c | 716
-rw-r--r--  sys/kern/subr_prof.c | 457
-rw-r--r--  sys/kern/subr_rlist.c | 313
-rw-r--r--  sys/kern/subr_rman.c | 591
-rw-r--r--  sys/kern/subr_scanf.c | 793
-rw-r--r--  sys/kern/subr_smp.c | 2663
-rw-r--r--  sys/kern/subr_trap.c | 1441
-rw-r--r--  sys/kern/subr_xxx.c | 210
-rw-r--r--  sys/kern/sys_generic.c | 872
-rw-r--r--  sys/kern/sys_pipe.c | 1102
-rw-r--r--  sys/kern/sys_process.c | 534
-rw-r--r--  sys/kern/sys_socket.c | 182
-rw-r--r--  sys/kern/syscalls.c | 347
-rw-r--r--  sys/kern/syscalls.master | 473
-rw-r--r--  sys/kern/sysv_ipc.c | 283
-rw-r--r--  sys/kern/sysv_msg.c | 1027
-rw-r--r--  sys/kern/sysv_sem.c | 977
-rw-r--r--  sys/kern/sysv_shm.c | 617
-rw-r--r--  sys/kern/tty.c | 2437
-rw-r--r--  sys/kern/tty_compat.c | 490
-rw-r--r--  sys/kern/tty_conf.c | 210
-rw-r--r--  sys/kern/tty_cons.c | 375
-rw-r--r--  sys/kern/tty_pty.c | 832
-rw-r--r--  sys/kern/tty_snoop.c | 541
-rw-r--r--  sys/kern/tty_subr.c | 694
-rw-r--r--  sys/kern/tty_tb.c | 367
-rw-r--r--  sys/kern/tty_tty.c | 206
-rw-r--r--  sys/kern/uipc_domain.c | 294
-rw-r--r--  sys/kern/uipc_mbuf.c | 945
-rw-r--r--  sys/kern/uipc_proto.c | 79
-rw-r--r--  sys/kern/uipc_sockbuf.c | 954
-rw-r--r--  sys/kern/uipc_socket.c | 1216
-rw-r--r--  sys/kern/uipc_socket2.c | 954
-rw-r--r--  sys/kern/uipc_syscalls.c | 1701
-rw-r--r--  sys/kern/uipc_usrreq.c | 1186
-rw-r--r--  sys/kern/vfs_aio.c | 2046
-rw-r--r--  sys/kern/vfs_bio.c | 2443
-rw-r--r--  sys/kern/vfs_cache.c | 450
-rw-r--r--  sys/kern/vfs_cluster.c | 840
-rw-r--r--  sys/kern/vfs_conf.c | 190
-rw-r--r--  sys/kern/vfs_default.c | 477
-rw-r--r--  sys/kern/vfs_export.c | 2872
-rw-r--r--  sys/kern/vfs_extattr.c | 3034
-rw-r--r--  sys/kern/vfs_init.c | 461
-rw-r--r--  sys/kern/vfs_lookup.c | 706
-rw-r--r--  sys/kern/vfs_mount.c | 190
-rw-r--r--  sys/kern/vfs_subr.c | 2872
-rw-r--r--  sys/kern/vfs_syscalls.c | 3034
-rw-r--r--  sys/kern/vfs_vnops.c | 562
-rw-r--r--  sys/kern/vnode_if.pl | 402
-rw-r--r--  sys/kern/vnode_if.sh | 402
-rw-r--r--  sys/kern/vnode_if.src | 488
119 files changed, 84052 insertions, 574 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
new file mode 100644
index 0000000..a09e484
--- /dev/null
+++ b/sys/kern/Make.tags.inc
@@ -0,0 +1,19 @@
+# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93
+# $Id$
+
+# Common files for "make tags".
+# Included by the Makefile for each architecture.
+
+# Put the ../sys stuff near the end so that subroutine definitions win when
+# there is a struct tag with the same name (e.g., vmmeter). The real
+# solution would probably be for ctags to generate "struct vmmeter" tags.
+
+COMM= /sys/conf/*.[ch] \
+ /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \
+ /sys/kern/*.[ch] /sys/libkern/*.[ch] \
+ /sys/miscfs/*/*.[ch] \
+ /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \
+ /sys/netiso/*.[ch] /sys/netns/*.[ch] \
+ /sys/nfs/*.[ch] /sys/sys/*.[ch] \
+ /sys/ufs/*/*.[ch] \
+ /sys/vm/*.[ch]
diff --git a/sys/kern/Makefile b/sys/kern/Makefile
new file mode 100644
index 0000000..f42a44e
--- /dev/null
+++ b/sys/kern/Makefile
@@ -0,0 +1,53 @@
+# @(#)Makefile 8.2 (Berkeley) 3/21/94
+
+# Makefile for kernel tags files, init_sysent, etc.
+
+ARCH= i386 # luna68k news3400 pmax sparc tahoe vax
+
+all:
+ @echo "make tags, make links or init_sysent.c only"
+
+init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \
+../sys/sysproto.h: makesyscalls.sh syscalls.master
+ -mv -f init_sysent.c init_sysent.c.bak
+ -mv -f syscalls.c syscalls.c.bak
+ -mv -f ../sys/syscall.h ../sys/syscall.h.bak
+ -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak
+ -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
+ sh makesyscalls.sh syscalls.master
+
+# Kernel tags:
+# Tags files are built in the top-level directory for each architecture,
+# with a makefile listing the architecture-dependent files, etc. The list
+# of common files is in ./Make.tags.inc. Links to the correct tags file
+# are placed in each source directory. We need to have links to tags files
+# from the generic directories that are relative to the machine type, even
+# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at
+# ${SYSDIR}/${MACHINE}/tags.
+
+SYSTAGS=/var/db/sys_tags
+SYSDIR=/sys
+
+# Directories in which to place tags links (other than machine-dependent)
+DGEN= conf \
+ dev dev/scsi \
+ hp hp/dev hp/hpux \
+ kern libkern \
+ miscfs miscfs/deadfs miscfs/fdesc miscfs/fifofs miscfs/kernfs \
+ miscfs/lofs miscfs/nullfs miscfs/portal miscfs/procfs \
+ miscfs/specfs miscfs/umapfs miscfs/union \
+ net netccitt netinet netiso netns nfs scripts sys \
+ ufs ufs/ffs ufs/lfs ufs/mfs ufs/ufs \
+ vm
+
+tags::
+ -for i in ${ARCH}; do \
+ (cd ../$$i && make ${MFLAGS} tags); done
+
+links::
+ rm -f ${SYSTAGS}
+ ln -s ${SYSDIR}/${MACHINE}/tags ${SYSTAGS}
+ -for i in ${DGEN}; do \
+ (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done
+ -for i in ${ARCH}; do \
+ (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done
diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m
new file mode 100644
index 0000000..fd4f648
--- /dev/null
+++ b/sys/kern/bus_if.m
@@ -0,0 +1,141 @@
+#
+# Copyright (c) 1998 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $Id: bus_if.m,v 1.4 1998/11/08 18:51:38 nsouch Exp $
+#
+
+INTERFACE bus;
+
+#
+# This is called from system code which prints out a description of a
+# device. It should describe the attachment that the child has with
+# the parent. For instance the TurboLaser bus prints which node the
+# device is attached to.
+#
+METHOD void print_child {
+ device_t dev;
+ device_t child;
+};
+
+#
+# These two methods manage a bus specific set of instance variables of
+# a child device. The intention is that each different type of bus
+# defines a set of appropriate instance variables (such as ports and
+# irqs for ISA bus etc.)
+#
+# This information could be given to the child device as a struct, but
+# that makes it hard for a bus to add or remove variables without
+# forcing an edit and recompile of all drivers, which may not be
+# possible for vendor-supplied binary drivers.
+
+#
+# Read an instance variable. Return 0 on success.
+#
+METHOD int read_ivar {
+ device_t dev;
+ device_t child;
+ int index;
+ uintptr_t *result;
+};
+
+#
+# Write an instance variable. Return 0 on success.
+#
+METHOD int write_ivar {
+ device_t dev;
+ device_t child;
+ int index;
+ uintptr_t value;
+};
+
+#
+# Allocate a system resource attached to `dev' on behalf of `child'.
+# The types are defined in <machine/resource.h>; the meaning of the
+# resource-ID field varies from bus to bus (but *rid == 0 is always
+# valid if the resource type is). start and end reflect the allowable
+# range, and should be passed as `0UL' and `~0UL', respectively, if
+# the client has no range restriction. count is the number of consecutive
+# indices in the resource required. flags is a set of sharing flags
+# as defined in <sys/rman.h>.
+#
+# Returns a resource or a null pointer on failure. The caller is
+# responsible for calling rman_activate_resource() when it actually
+# uses the resource.
+#
+METHOD struct resource * alloc_resource {
+ device_t dev;
+ device_t child;
+ int type;
+ int *rid;
+ u_long start;
+ u_long end;
+ u_long count;
+ u_int flags;
+};
+
+METHOD int activate_resource {
+ device_t dev;
+ device_t child;
+ int type;
+ int rid;
+ struct resource *r;
+};
+
+METHOD int deactivate_resource {
+ device_t dev;
+ device_t child;
+ int type;
+ int rid;
+ struct resource *r;
+};
+
+#
+# Free a resource allocated by the preceding method. The `rid' value
+# must be the same as the one returned by BUS_ALLOC_RESOURCE (which
+# is not necessarily the same as the one the client passed).
+#
+METHOD int release_resource {
+ device_t dev;
+ device_t child;
+ int type;
+ int rid;
+ struct resource *res;
+};
+
+METHOD int setup_intr {
+ device_t dev;
+ device_t child;
+ struct resource *irq;
+ driver_intr_t *intr;
+ void *arg;
+ void **cookiep;
+};
+
+METHOD int teardown_intr {
+ device_t dev;
+ device_t child;
+ struct resource *irq;
+ void *cookie;
+};
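
Child drivers normally reach these methods through wrapper functions
(BUS_ALLOC_RESOURCE() and friends, generated from this file). A minimal
attach-time sketch, assuming the conventional wrappers of this vintage;
the foo_* names and softc pointer are placeholders, not part of the
interface:

	int rid = 0;
	struct resource *irq;
	void *cookie;

	/* Ask the parent bus for any IRQ, active and shareable. */
	irq = bus_alloc_resource(dev, SYS_RES_IRQ, &rid,
	    0UL, ~0UL, 1, RF_ACTIVE | RF_SHAREABLE);
	if (irq == NULL)
		return (ENXIO);
	/* Wire the hypothetical foo_intr handler to it. */
	if (bus_setup_intr(dev, irq, foo_intr, sc, &cookie) != 0) {
		bus_release_resource(dev, SYS_RES_IRQ, rid, irq);
		return (ENXIO);
	}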
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m
new file mode 100644
index 0000000..f429e67
--- /dev/null
+++ b/sys/kern/device_if.m
@@ -0,0 +1,83 @@
+#
+# Copyright (c) 1998 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $Id: device_if.m,v 1.2 1998/11/08 18:35:53 nsouch Exp $
+#
+
+INTERFACE device;
+
+#
+# Probe to see if the device is present. Return 0 if the device exists,
+# ENXIO if it cannot be found.
+#
+# Devices which implement busses should use this method to probe for
+# the existence of devices attached to the bus and add them as
+# children. If this is combined with the use of bus_generic_attach,
+# the child devices will be automatically probed and attached.
+#
+METHOD int probe {
+ device_t dev;
+};
+
+#
+# Attach a device to the system. The probe method will have been
+# called and will have indicated that the device exists. This routine
+# should initialise the hardware and allocate other system resources
+# (such as devfs entries). Returns 0 on success.
+#
+METHOD int attach {
+ device_t dev;
+};
+
+#
+# Detach a device. This can be called if the user is replacing the
+# driver software or if a device is about to be physically removed
+# from the system (e.g. for pccard devices). Returns 0 on success.
+#
+METHOD int detach {
+ device_t dev;
+};
+
+#
+# This is called during system shutdown to allow the driver to put the
+# hardware into a consistent state for rebooting the computer.
+#
+METHOD int shutdown {
+ device_t dev;
+};
+
+#
+# This is called by the power-management subsystem when a suspend has been
+# requested by the user or by some automatic mechanism. This gives
+# drivers a chance to veto the suspend or save their configuration before
+# power is removed.
+#
+METHOD int suspend {
+ device_t dev;
+};
+
+METHOD int resume {
+ device_t dev;
+};
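
A driver supplies implementations of these methods through a method
table; a sketch of the usual glue, with all foo_* names hypothetical
(the table is then wrapped in a driver_t and registered with
DRIVER_MODULE(); only the methods a driver actually implements need
entries):

	static device_method_t foo_methods[] = {
		/* Device interface */
		DEVMETHOD(device_probe,		foo_probe),
		DEVMETHOD(device_attach,	foo_attach),
		DEVMETHOD(device_detach,	foo_detach),
		DEVMETHOD(device_shutdown,	foo_shutdown),
		{ 0, 0 }
	};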
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
new file mode 100644
index 0000000..9fbd203
--- /dev/null
+++ b/sys/kern/imgact_aout.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: imgact_aout.c,v 1.43 1998/10/16 03:55:00 peter Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/resourcevar.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sysent.h>
+#include <sys/syscall.h>
+#include <sys/vnode.h>
+#include <sys/systm.h>
+#include <machine/md_var.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <sys/user.h>
+
+static int exec_aout_imgact __P((struct image_params *imgp));
+
+struct sysentvec aout_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD a.out",
+ aout_coredump
+};
+
+static int
+exec_aout_imgact(imgp)
+ struct image_params *imgp;
+{
+ const struct exec *a_out = (const struct exec *) imgp->image_header;
+ struct vmspace *vmspace;
+ struct vnode *vp;
+ vm_object_t object;
+ vm_offset_t text_end, data_end;
+ unsigned long virtual_offset;
+ unsigned long file_offset;
+ unsigned long bss_size;
+ int error;
+
+ /*
+ * Linux and *BSD binaries look very much alike,
+ * only the machine id is different:
+ * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
+ * NetBSD is in network byte order.. ugh.
+ */
+ if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
+ ((a_out->a_magic >> 16) & 0xff) != 0 &&
+ ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
+ return -1;
+
+ /*
+ * Set file/virtual offset based on a.out variant.
+ * We do two cases: host byte order and network byte order
+ * (for NetBSD compatibility)
+ */
+ switch ((int)(a_out->a_magic & 0xffff)) {
+ case ZMAGIC:
+ virtual_offset = 0;
+ if (a_out->a_text) {
+ file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ return (-1);
+ }
+ }
+
+ bss_size = roundup(a_out->a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if (/* entry point must lie within the text region */
+ a_out->a_entry < virtual_offset ||
+ a_out->a_entry >= virtual_offset + a_out->a_text ||
+
+ /* text and data size must each be page rounded */
+ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
+ return (-1);
+
+ /* text + data can't exceed file size */
+ if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
+ return (EFAULT);
+
+ /*
+ * text/data/bss must not exceed limits
+ */
+ if (/* text can't exceed maximum text size */
+ a_out->a_text > MAXTSIZ ||
+
+ /* data + bss can't exceed rlimit */
+ a_out->a_data + bss_size >
+ imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
+ return (ENOMEM);
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(imgp);
+ if (error)
+ return (error);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(imgp);
+
+ /*
+ * The vm space can be changed by exec_new_vmspace
+ */
+ vmspace = imgp->proc->p_vmspace;
+
+ vp = imgp->vp;
+ object = vp->v_object;
+ vm_object_reference(object);
+
+ text_end = virtual_offset + a_out->a_text;
+ error = vm_map_insert(&vmspace->vm_map, object,
+ file_offset,
+ virtual_offset, text_end,
+ VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
+ MAP_COPY_NEEDED | MAP_COPY_ON_WRITE);
+ if (error)
+ return (error);
+
+ data_end = text_end + a_out->a_data;
+ if (a_out->a_data) {
+ vm_object_reference(object);
+ error = vm_map_insert(&vmspace->vm_map, object,
+ file_offset + a_out->a_text,
+ text_end, data_end,
+ VM_PROT_ALL, VM_PROT_ALL,
+ MAP_COPY_NEEDED | MAP_COPY_ON_WRITE);
+ if (error)
+ return (error);
+ }
+
+ pmap_object_init_pt(&vmspace->vm_pmap, virtual_offset,
+ object, (vm_pindex_t) OFF_TO_IDX(file_offset),
+ a_out->a_text + a_out->a_data, 0);
+
+ if (bss_size) {
+ error = vm_map_insert(&vmspace->vm_map, NULL, 0,
+ data_end, data_end + bss_size,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return (error);
+ }
+
+ /* Fill in process VM information */
+ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (virtual_offset + a_out->a_text);
+
+ /* Fill in image_params */
+ imgp->interpreted = 0;
+ imgp->entry_addr = a_out->a_entry;
+
+ imgp->proc->p_sysent = &aout_sysvec;
+
+ /* Indicate that this file should not be modified */
+ imgp->vp->v_flag |= VTEXT;
+
+ return (0);
+}
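+
+/*
+ * Resulting address space, for reference (a sketch): text mapped
+ * copy-on-write from the vnode at virtual_offset, data immediately
+ * above it, bss as anonymous zero-fill above that; the stack was
+ * rebuilt separately by exec_new_vmspace().
+ */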
+
+/*
+ * Dump core, into a file named as described in the comments for
+ * expand_name(), unless the process was setuid/setgid.
+ */
+int
+aout_coredump(p)
+ register struct proc *p;
+{
+ register struct vnode *vp;
+ register struct ucred *cred = p->p_cred->pc_ucred;
+ register struct vmspace *vm = p->p_vmspace;
+ struct nameidata nd;
+ struct vattr vattr;
+ int error, error1;
+ char *name; /* name of corefile */
+
+ STOPEVENT(p, S_CORE, 0);
+ if (sugid_coredump == 0 && p->p_flag & P_SUGID)
+ return (EFAULT);
+ if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >=
+ p->p_rlimit[RLIMIT_CORE].rlim_cur)
+ return (EFAULT);
+ name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid);
+ if (name == NULL)
+ return (EFAULT); /* XXX -- not the best error */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p);
+ error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR);
+ free(name, M_TEMP);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+
+ /* Don't dump to non-regular files or files with links. */
+ if (vp->v_type != VREG ||
+ VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
+ error = EFAULT;
+ goto out;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ VOP_LEASE(vp, p, cred, LEASE_WRITE);
+ VOP_SETATTR(vp, &vattr, cred, p);
+ p->p_acflag |= ACORE;
+ bcopy(p, &p->p_addr->u_kproc.kp_proc, sizeof(struct proc));
+ fill_eproc(p, &p->p_addr->u_kproc.kp_eproc);
+ error = cpu_coredump(p, vp, cred);
+ if (error == 0)
+ error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+ (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE,
+ IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+ if (error == 0)
+ error = vn_rdwr(UIO_WRITE, vp,
+ (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
+ round_page(ctob(vm->vm_ssize)),
+ (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
+ IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+out:
+ VOP_UNLOCK(vp, 0, p);
+ error1 = vn_close(vp, FWRITE, cred, p);
+ if (error == 0)
+ error = error1;
+ return (error);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
+EXEC_SET(aout, aout_execsw);
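
The magic-number test at the top of exec_aout_imgact() folds two checks
together: bits 16-23 of a_magic carry the machine id (0x86 for *BSD,
0x64 for Linux, 0x00 for BSDI), and the low 16 bits carry the variant,
retried after ntohl() because NetBSD stores the word in network byte
order. The same classification as a stand-alone sketch, illustrative
only and assuming the struct exec layout used above:

	static int
	aout_variant(const struct exec *a)
	{
		/* Machine id: native *BSD, BSDI, or byte-swapped *BSD. */
		if (((a->a_magic >> 16) & 0xff) != 0x86 &&
		    ((a->a_magic >> 16) & 0xff) != 0 &&
		    ((((int)ntohl(a->a_magic)) >> 16) & 0xff) != 0x86)
			return (-1);
		switch ((int)(a->a_magic & 0xffff)) {
		case ZMAGIC:		/* text starts at file offset PAGE_SIZE */
		case QMAGIC:		/* header mapped into the first page */
			return (0);	/* host byte order */
		}
		switch ((int)(ntohl(a->a_magic) & 0xffff)) {
		case ZMAGIC:
		case QMAGIC:
			return (1);	/* NetBSD byte order */
		}
		return (-1);		/* not an a.out we recognize */
	}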
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
new file mode 100644
index 0000000..a0a2284
--- /dev/null
+++ b/sys/kern/imgact_elf.c
@@ -0,0 +1,992 @@
+/*-
+ * Copyright (c) 1995-1996 Søren Schmidt
+ * Copyright (c) 1996 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: imgact_elf.c,v 1.43 1998/12/04 22:54:51 archie Exp $
+ */
+
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <sys/lock.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_extern.h>
+
+#include <machine/elf.h>
+#include <machine/md_var.h>
+
+__ElfType(Brandinfo);
+__ElfType(Auxargs);
+
+static int elf_check_header __P((const Elf_Ehdr *hdr, int type));
+static int elf_freebsd_fixup __P((long **stack_base,
+ struct image_params *imgp));
+static int elf_load_file __P((struct proc *p, char *file, u_long *addr,
+ u_long *entry));
+static int elf_load_section __P((struct proc *p,
+ struct vmspace *vmspace, struct vnode *vp,
+ vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
+ vm_prot_t prot));
+static int exec_elf_imgact __P((struct image_params *imgp));
+
+static int elf_trace = 0;
+SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, "");
+
+static struct sysentvec elf_freebsd_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ elf_freebsd_fixup,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD ELF",
+ elf_coredump
+};
+
+static Elf_Brandinfo freebsd_brand_info = {
+ "FreeBSD",
+ "",
+ "/usr/libexec/ld-elf.so.1",
+ &elf_freebsd_sysvec
+ };
+static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = {
+ &freebsd_brand_info,
+ NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL
+ };
+
+int
+elf_insert_brand_entry(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == NULL) {
+ elf_brand_list[i] = entry;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+int
+elf_remove_brand_entry(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == entry) {
+ elf_brand_list[i] = NULL;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+static int
+elf_check_header(const Elf_Ehdr *hdr, int type)
+{
+ if (!IS_ELF(*hdr) ||
+ hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT)
+ return ENOEXEC;
+
+ if (!ELF_MACHINE_OK(hdr->e_machine))
+ return ENOEXEC;
+
+ if (hdr->e_type != type || hdr->e_version != ELF_TARG_VER)
+ return ENOEXEC;
+
+ return 0;
+}
+
+static int
+elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
+{
+ size_t map_len;
+ vm_offset_t map_addr;
+ int error, rv;
+ size_t copy_len;
+ vm_object_t object;
+ vm_offset_t file_addr;
+ vm_offset_t data_buf = 0;
+
+ object = vp->v_object;
+ error = 0;
+
+ map_addr = trunc_page((vm_offset_t)vmaddr);
+ file_addr = trunc_page(offset);
+
+ /*
+ * We have two choices. We can either clear the data in the last page
+ * of an oversized mapping, or we can start the anon mapping a page
+ * early and copy the initialized data into that first page. We
+ * choose the second..
+ */
+ if (memsz > filsz)
+ map_len = trunc_page(offset+filsz) - file_addr;
+ else
+ map_len = round_page(offset+filsz) - file_addr;
+
+ if (map_len != 0) {
+ vm_object_reference(object);
+ vm_map_lock(&vmspace->vm_map);
+ rv = vm_map_insert(&vmspace->vm_map,
+ object,
+ file_addr, /* file offset */
+ map_addr, /* virtual start */
+ map_addr + map_len,/* virtual end */
+ prot,
+ VM_PROT_ALL,
+ MAP_COPY_NEEDED | MAP_COPY_ON_WRITE);
+ vm_map_unlock(&vmspace->vm_map);
+ if (rv != KERN_SUCCESS)
+ return EINVAL;
+
+ /* prefault the page tables */
+ pmap_object_init_pt(&vmspace->vm_pmap,
+ map_addr,
+ object,
+ (vm_pindex_t) OFF_TO_IDX(file_addr),
+ map_len,
+ 0);
+
+ /* we can stop now if we've covered it all */
+ if (memsz == filsz)
+ return 0;
+ }
+
+
+ /*
+ * We have to get the remaining bit of the file into the first part
+ * of the oversized map segment. This is normally because the .data
+ * segment in the file is extended to provide bss. It's a neat idea
+ * to try and save a page, but it's a pain in the behind to implement.
+ */
+ copy_len = (offset + filsz) - trunc_page(offset + filsz);
+ map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
+ map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
+
+ /* This had damn well better be true! */
+ if (map_len != 0) {
+ vm_map_lock(&vmspace->vm_map);
+ rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
+ map_addr, map_addr + map_len,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ vm_map_unlock(&vmspace->vm_map);
+ if (rv != KERN_SUCCESS)
+ return EINVAL;
+ }
+
+ if (copy_len != 0) {
+ vm_object_reference(object);
+ rv = vm_map_find(exec_map,
+ object,
+ trunc_page(offset + filsz),
+ &data_buf,
+ PAGE_SIZE,
+ TRUE,
+ VM_PROT_READ,
+ VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_COPY_NEEDED);
+ if (rv != KERN_SUCCESS) {
+ vm_object_deallocate(object);
+ return EINVAL;
+ }
+ pmap_object_init_pt(exec_map->pmap, data_buf, object,
+ (vm_pindex_t) OFF_TO_IDX(trunc_page(offset + filsz)),
+ PAGE_SIZE, 1);
+
+ /* send the page fragment to user space */
+ error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len);
+ vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * set it to the specified protection
+ */
+ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot,
+ FALSE);
+
+ return error;
+}
+
+static int
+elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry)
+{
+ Elf_Ehdr *hdr = NULL;
+ Elf_Phdr *phdr = NULL;
+ struct nameidata nd;
+ struct vmspace *vmspace = p->p_vmspace;
+ struct vattr attr;
+ struct image_params image_params, *imgp;
+ vm_prot_t prot;
+ unsigned long text_size = 0, data_size = 0;
+ unsigned long text_addr = 0, data_addr = 0;
+ int error, i;
+
+ imgp = &image_params;
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->uap = NULL;
+ imgp->attr = &attr;
+ imgp->firstpage = NULL;
+ imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
+
+ if (imgp->image_header == NULL) {
+ nd.ni_vp = NULL;
+ error = ENOMEM;
+ goto fail;
+ }
+
+ NDINIT(&nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p);
+
+ if (error = namei(&nd)) {
+ nd.ni_vp = NULL;
+ goto fail;
+ }
+
+ imgp->vp = nd.ni_vp;
+
+ /*
+ * Check permissions, modes, uid, etc on the file, and "open" it.
+ */
+ error = exec_check_permissions(imgp);
+ if (error) {
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ goto fail;
+ }
+
+ error = exec_map_first_page(imgp);
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ if (error)
+ goto fail;
+
+ hdr = (Elf_Ehdr *)imgp->image_header;
+ if (error = elf_check_header(hdr, ET_DYN))
+ goto fail;
+
+ /* Only support headers that fit within first page for now */
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ phdr = (Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */
+ prot = 0;
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if (error = elf_load_section(p, vmspace, nd.ni_vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr +
+ (*addr),
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot))
+ goto fail;
+
+ /*
+ * Is this .text or .data ??
+ *
+ * We only handle one each of those yet XXX
+ */
+ if (hdr->e_entry >= phdr[i].p_vaddr &&
+ hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) {
+ text_addr = trunc_page(phdr[i].p_vaddr+(*addr));
+ text_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ trunc_page(phdr[i].p_vaddr));
+ *entry=(unsigned long)hdr->e_entry+(*addr);
+ } else {
+ data_addr = trunc_page(phdr[i].p_vaddr+(*addr));
+ data_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ trunc_page(phdr[i].p_vaddr));
+ }
+ }
+ }
+
+fail:
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+ if (imgp->image_header)
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
+ PAGE_SIZE);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+
+ return error;
+}
+
+static int
+exec_elf_imgact(struct image_params *imgp)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header;
+ const Elf_Phdr *phdr;
+ Elf_Auxargs *elf_auxargs = NULL;
+ struct vmspace *vmspace;
+ vm_prot_t prot;
+ u_long text_size = 0, data_size = 0;
+ u_long text_addr = 0, data_addr = 0;
+ u_long addr, entry = 0, proghdr = 0;
+ int error, i;
+ const char *interp = NULL;
+ Elf_Brandinfo *brand_info;
+ char *brand;
+ char path[MAXPATHLEN];
+
+ /*
+ * Do we have a valid ELF header ?
+ */
+ if (elf_check_header(hdr, ET_EXEC))
+ return -1;
+
+ /*
+ * From here on down, we return an errno, not -1, as we've
+ * detected an ELF file.
+ */
+
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
+ /* Only support headers in first page for now */
+ return ENOEXEC;
+ }
+ phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff);
+
+ /*
+ * From this point on, we may have resources that need to be freed.
+ */
+ if (error = exec_extract_strings(imgp))
+ goto fail;
+
+ exec_new_vmspace(imgp);
+
+ vmspace = imgp->proc->p_vmspace;
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch(phdr[i].p_type) {
+
+ case PT_LOAD: /* Loadable segment */
+ prot = 0;
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if (error = elf_load_section(imgp->proc,
+ vmspace, imgp->vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr,
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot))
+ goto fail;
+
+ /*
+ * Is this .text or .data ??
+ *
+ * We only handle one each of those yet XXX
+ */
+ if (hdr->e_entry >= phdr[i].p_vaddr &&
+ hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) {
+ text_addr = trunc_page(phdr[i].p_vaddr);
+ text_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ text_addr);
+ entry = (u_long)hdr->e_entry;
+ } else {
+ data_addr = trunc_page(phdr[i].p_vaddr);
+ data_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ data_addr);
+ }
+ break;
+ case PT_INTERP: /* Path to interpreter */
+ if (phdr[i].p_filesz > MAXPATHLEN ||
+ phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
+ error = ENOEXEC;
+ goto fail;
+ }
+ interp = imgp->image_header + phdr[i].p_offset;
+ break;
+ case PT_PHDR: /* Program header table info */
+ proghdr = phdr[i].p_vaddr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ vmspace->vm_tsize = text_size >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
+ vmspace->vm_dsize = data_size >> PAGE_SHIFT;
+ vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
+
+ addr = 2L*MAXDSIZ; /* May depend on OS type XXX */
+
+ imgp->entry_addr = entry;
+
+ /* If the executable has a brand, search for it in the brand list. */
+ brand_info = NULL;
+ brand = (char *)&hdr->e_ident[EI_BRAND];
+ if (brand[0] != '\0') {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ Elf_Brandinfo *bi = elf_brand_list[i];
+
+ if (bi != NULL && strcmp(brand, bi->brand) == 0) {
+ brand_info = bi;
+ break;
+ }
+ }
+ }
+
+ /* Lacking a known brand, search for a recognized interpreter. */
+ if (brand_info == NULL && interp != NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ Elf_Brandinfo *bi = elf_brand_list[i];
+
+ if (bi != NULL &&
+ strcmp(interp, bi->interp_path) == 0) {
+ brand_info = bi;
+ break;
+ }
+ }
+ }
+
+#ifdef __alpha__
+ /* XXX - Assume FreeBSD on the alpha. */
+ if (brand_info == NULL)
+ brand_info = &freebsd_brand_info;
+#endif
+
+ if (brand_info == NULL) {
+ if (brand[0] == 0)
+ uprintf("ELF binary type not known."
+ " Use \"brandelf\" to brand it.\n");
+ else
+ uprintf("ELF binary type \"%.*s\" not known.\n",
+ EI_NIDENT - EI_BRAND, brand);
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ imgp->proc->p_sysent = brand_info->sysvec;
+ if (interp != NULL) {
+ snprintf(path, sizeof(path), "%s%s",
+ brand_info->emul_path, interp);
+ if ((error = elf_load_file(imgp->proc, path, &addr,
+ &imgp->entry_addr)) != 0) {
+ uprintf("ELF interpreter %s not found\n", path);
+ goto fail;
+ }
+ }
+
+ /*
+ * Construct auxargs table (used by the fixup routine)
+ */
+ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
+ elf_auxargs->execfd = -1;
+ elf_auxargs->phdr = proghdr;
+ elf_auxargs->phent = hdr->e_phentsize;
+ elf_auxargs->phnum = hdr->e_phnum;
+ elf_auxargs->pagesz = PAGE_SIZE;
+ elf_auxargs->base = addr;
+ elf_auxargs->flags = 0;
+ elf_auxargs->entry = entry;
+ elf_auxargs->trace = elf_trace;
+
+ imgp->auxargs = elf_auxargs;
+ imgp->interpreted = 0;
+
+ /* don't allow modifying the file while we run it */
+ imgp->vp->v_flag |= VTEXT;
+
+fail:
+ return error;
+}
+
+static int
+elf_freebsd_fixup(long **stack_base, struct image_params *imgp)
+{
+ Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
+ long *pos;
+
+ pos = *stack_base + (imgp->argc + imgp->envc + 2);
+
+ if (args->trace) {
+ AUXARGS_ENTRY(pos, AT_DEBUG, 1);
+ }
+ if (args->execfd != -1) {
+ AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
+ }
+ AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY(pos, AT_BASE, args->base);
+ AUXARGS_ENTRY(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ (*stack_base)--;
+ suword(*stack_base, (long) imgp->argc);
+ return 0;
+}
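+
+/*
+ * For reference, a sketch of the stack image this fixup completes,
+ * from the final *stack_base upward: argc; argv[0..argc-1] and a NULL;
+ * envp[0..envc-1] and a NULL; then the AT_* entries written above,
+ * terminated by AT_NULL.  The dynamic linker finds the auxargs by
+ * walking past the two NULLs.
+ */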
+
+/*
+ * Code for generating ELF core dumps.
+ */
+
+typedef void (*segment_callback) __P((vm_map_entry_t, void *));
+
+/* Closure for cb_put_phdr(). */
+struct phdr_closure {
+ Elf_Phdr *phdr; /* Program header to fill in */
+ Elf_Off offset; /* Offset of segment in core file */
+};
+
+/* Closure for cb_size_segment(). */
+struct sseg_closure {
+ int count; /* Count of writable segments. */
+ size_t size; /* Total size of all writable segments. */
+};
+
+static void cb_put_phdr __P((vm_map_entry_t, void *));
+static void cb_size_segment __P((vm_map_entry_t, void *));
+static void each_writable_segment __P((struct proc *, segment_callback,
+ void *));
+static int elf_corehdr __P((struct proc *, struct vnode *, struct ucred *,
+ int, void *, size_t));
+static void elf_puthdr __P((struct proc *, void *, size_t *,
+ const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int));
+static void elf_putnote __P((void *, size_t *, const char *, int,
+ const void *, size_t));
+
+extern int osreldate;
+
+int
+elf_coredump(p)
+ register struct proc *p;
+{
+ register struct vnode *vp;
+ register struct ucred *cred = p->p_cred->pc_ucred;
+ struct nameidata nd;
+ struct vattr vattr;
+ int error, error1;
+ char *name; /* name of corefile */
+ struct sseg_closure seginfo;
+ void *hdr;
+ size_t hdrsize;
+
+ STOPEVENT(p, S_CORE, 0);
+
+ if (sugid_coredump == 0 && p->p_flag & P_SUGID)
+ return (EFAULT);
+
+ /* Size the program segments. */
+ seginfo.count = 0;
+ seginfo.size = 0;
+ each_writable_segment(p, cb_size_segment, &seginfo);
+
+ /*
+ * Calculate the size of the core file header area by making
+ * a dry run of generating it. Nothing is written, but the
+ * size is calculated.
+ */
+ hdrsize = 0;
+ elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize,
+ (const prstatus_t *)NULL, (const prfpregset_t *)NULL,
+ (const prpsinfo_t *)NULL, seginfo.count);
+
+ if (hdrsize + seginfo.size >= p->p_rlimit[RLIMIT_CORE].rlim_cur)
+ return (EFAULT);
+ name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid);
+ if (name == NULL)
+ return (EFAULT); /* XXX -- not the best error */
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p);
+ error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR);
+ free(name, M_TEMP);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+
+ /* Don't dump to non-regular files or files with links. */
+ if (vp->v_type != VREG ||
+ VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
+ error = EFAULT;
+ goto out;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ VOP_LEASE(vp, p, cred, LEASE_WRITE);
+ VOP_SETATTR(vp, &vattr, cred, p);
+ p->p_acflag |= ACORE;
+
+
+ /*
+ * Allocate memory for building the header, fill it up,
+ * and write it out.
+ */
+ hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
+ if (hdr == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ error = elf_corehdr(p, vp, cred, seginfo.count, hdr, hdrsize);
+
+ /* Write the contents of all of the writable segments. */
+ if (error == 0) {
+ Elf_Phdr *php;
+ off_t offset;
+ int i;
+
+ php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
+ offset = hdrsize;
+ for (i = 0; i < seginfo.count; i++) {
+ error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr,
+ php->p_filesz, offset, UIO_USERSPACE,
+ IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p);
+ if (error != 0)
+ break;
+ offset += php->p_filesz;
+ php++;
+ }
+ }
+ free(hdr, M_TEMP);
+
+out:
+ VOP_UNLOCK(vp, 0, p);
+ error1 = vn_close(vp, FWRITE, cred, p);
+ if (error == 0)
+ error = error1;
+ return (error);
+}
+
+/*
+ * A callback for each_writable_segment() to write out the segment's
+ * program header entry.
+ */
+static void
+cb_put_phdr(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct phdr_closure *phc = (struct phdr_closure *)closure;
+ Elf_Phdr *phdr = phc->phdr;
+
+ phc->offset = round_page(phc->offset);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_offset = phc->offset;
+ phdr->p_vaddr = entry->start;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
+ phdr->p_align = PAGE_SIZE;
+ phdr->p_flags = 0;
+ if (entry->protection & VM_PROT_READ)
+ phdr->p_flags |= PF_R;
+ if (entry->protection & VM_PROT_WRITE)
+ phdr->p_flags |= PF_W;
+ if (entry->protection & VM_PROT_EXECUTE)
+ phdr->p_flags |= PF_X;
+
+ phc->offset += phdr->p_filesz;
+ phc->phdr++;
+}
+
+/*
+ * A callback for each_writable_segment() to gather information about
+ * the number of segments and their total size.
+ */
+static void
+cb_size_segment(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct sseg_closure *ssc = (struct sseg_closure *)closure;
+
+ ssc->count++;
+ ssc->size += entry->end - entry->start;
+}
+
+/*
+ * For each writable segment in the process's memory map, call the given
+ * function with a pointer to the map entry and some arbitrary
+ * caller-supplied data.
+ */
+static void
+each_writable_segment(p, func, closure)
+ struct proc *p;
+ segment_callback func;
+ void *closure;
+{
+ vm_map_t map = &p->p_vmspace->vm_map;
+ vm_map_entry_t entry;
+
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj;
+
+ if (entry->eflags & (MAP_ENTRY_IS_A_MAP|MAP_ENTRY_IS_SUB_MAP) ||
+ (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) !=
+ (VM_PROT_READ|VM_PROT_WRITE))
+ continue;
+
+ if ((obj = entry->object.vm_object) == NULL)
+ continue;
+
+ /* Find the deepest backing object. */
+ while (obj->backing_object != NULL)
+ obj = obj->backing_object;
+
+ /* Ignore memory-mapped devices and such things. */
+ if (obj->type != OBJT_DEFAULT &&
+ obj->type != OBJT_SWAP &&
+ obj->type != OBJT_VNODE)
+ continue;
+
+ (*func)(entry, closure);
+ }
+}
+
+/*
+ * Write the core file header to the file, including padding up to
+ * the page boundary.
+ */
+static int
+elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize)
+ struct proc *p;
+ struct vnode *vp;
+ struct ucred *cred;
+ int numsegs;
+	void *hdr;
+	size_t hdrsize;
+{
+ size_t off;
+ prstatus_t status;
+ prfpregset_t fpregset;
+ prpsinfo_t psinfo;
+
+ /* Gather the information for the header. */
+ bzero(&status, sizeof status);
+ status.pr_version = PRSTATUS_VERSION;
+ status.pr_statussz = sizeof(prstatus_t);
+ status.pr_gregsetsz = sizeof(gregset_t);
+ status.pr_fpregsetsz = sizeof(fpregset_t);
+ status.pr_osreldate = osreldate;
+#ifndef COMPAT_LINUX_THREADS
+ status.pr_cursig = p->p_sigacts->ps_sig;
+#else
+ status.pr_cursig = p->p_sig;
+#endif /* COMPAT_LINUX_THREADS */
+ status.pr_pid = p->p_pid;
+ fill_regs(p, &status.pr_reg);
+
+ fill_fpregs(p, &fpregset);
+
+ bzero(&psinfo, sizeof psinfo);
+ psinfo.pr_version = PRPSINFO_VERSION;
+ psinfo.pr_psinfosz = sizeof(prpsinfo_t);
+ strncpy(psinfo.pr_fname, p->p_comm, MAXCOMLEN);
+ /* XXX - We don't fill in the command line arguments properly yet. */
+ strncpy(psinfo.pr_psargs, p->p_comm, PRARGSZ);
+
+ /* Fill in the header. */
+ bzero(hdr, hdrsize);
+ off = 0;
+ elf_puthdr(p, hdr, &off, &status, &fpregset, &psinfo, numsegs);
+
+ /* Write it to the core file. */
+ return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p);
+}
+
+static void
+elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
+ const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
+{
+ size_t ehoff;
+ size_t phoff;
+ size_t noteoff;
+ size_t notesz;
+
+ ehoff = *off;
+ *off += sizeof(Elf_Ehdr);
+
+ phoff = *off;
+ *off += (numsegs + 1) * sizeof(Elf_Phdr);
+
+ noteoff = *off;
+ elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status,
+ sizeof *status);
+ elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
+ sizeof *fpregset);
+ elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
+ sizeof *psinfo);
+ notesz = *off - noteoff;
+
+ /* Align up to a page boundary for the program segments. */
+ *off = round_page(*off);
+
+ if (dst != NULL) {
+ Elf_Ehdr *ehdr;
+ Elf_Phdr *phdr;
+ struct phdr_closure phc;
+
+ /*
+ * Fill in the ELF header.
+ */
+ ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
+ ehdr->e_ident[EI_MAG0] = ELFMAG0;
+ ehdr->e_ident[EI_MAG1] = ELFMAG1;
+ ehdr->e_ident[EI_MAG2] = ELFMAG2;
+ ehdr->e_ident[EI_MAG3] = ELFMAG3;
+ ehdr->e_ident[EI_CLASS] = ELF_CLASS;
+ ehdr->e_ident[EI_DATA] = ELF_DATA;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_PAD] = 0;
+ strncpy(ehdr->e_ident + EI_BRAND, "FreeBSD",
+ EI_NIDENT - EI_BRAND);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_entry = 0;
+ ehdr->e_phoff = phoff;
+ ehdr->e_flags = 0;
+ ehdr->e_ehsize = sizeof(Elf_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf_Phdr);
+ ehdr->e_phnum = numsegs + 1;
+ ehdr->e_shentsize = sizeof(Elf_Shdr);
+ ehdr->e_shnum = 0;
+ ehdr->e_shstrndx = SHN_UNDEF;
+
+ /*
+ * Fill in the program header entries.
+ */
+ phdr = (Elf_Phdr *)((char *)dst + phoff);
+
+		/* The note segment. */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = noteoff;
+ phdr->p_vaddr = 0;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = notesz;
+ phdr->p_memsz = 0;
+ phdr->p_flags = 0;
+ phdr->p_align = 0;
+ phdr++;
+
+ /* All the writable segments from the program. */
+ phc.phdr = phdr;
+ phc.offset = *off;
+ each_writable_segment(p, cb_put_phdr, &phc);
+ }
+}
+
+static void
+elf_putnote(void *dst, size_t *off, const char *name, int type,
+ const void *desc, size_t descsz)
+{
+ Elf_Note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = descsz;
+ note.n_type = type;
+ if (dst != NULL)
+ bcopy(&note, (char *)dst + *off, sizeof note);
+ *off += sizeof note;
+ if (dst != NULL)
+ bcopy(name, (char *)dst + *off, note.n_namesz);
+ *off += roundup2(note.n_namesz, sizeof(Elf_Size));
+ if (dst != NULL)
+ bcopy(desc, (char *)dst + *off, note.n_descsz);
+ *off += roundup2(note.n_descsz, sizeof(Elf_Size));
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+static const struct execsw elf_execsw = {exec_elf_imgact, "ELF"};
+EXEC_SET(elf, elf_execsw);
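
The brand table above is the hook by which emulation modules claim ELF
binaries at run time. A sketch of how a hypothetical emulator might
register itself, mirroring the shape of the freebsd_brand_info
initializer (all someos_* names are illustrative):

	static Elf_Brandinfo someos_brand_info = {
		"SomeOS",		/* matched against e_ident[EI_BRAND] */
		"/compat/someos",	/* emul_path, prefixed to the interpreter */
		"/lib/ld-someos.so.1",	/* interp_path, matched as a fallback */
		&someos_sysvec		/* sysentvec installed at exec time */
	};

	if (elf_insert_brand_entry(&someos_brand_info) < 0)
		printf("could not register SomeOS ELF brand\n");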
diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c
new file mode 100644
index 0000000..d666a87
--- /dev/null
+++ b/sys/kern/imgact_gzip.c
@@ -0,0 +1,378 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $Id: imgact_gzip.c,v 1.34 1998/07/15 05:00:26 bde Exp $
+ *
+ * This module handles execution of a.out files which have been run through
+ * "gzip". This saves diskspace, but wastes cpu-cycles and VM.
+ *
+ * TODO:
+ * text-segments should be made R/O after being filled
+ * is the vm-stuff safe ?
+ * should handle the entire header of gzip'ed stuff.
+ * inflate isn't quite reentrant yet...
+ * error-handling is a mess...
+ * so is the rest...
+ * tidy up unnecessary includes
+ */
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/mman.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/inflate.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+struct imgact_gzip {
+ struct image_params *ip;
+ struct exec a_out;
+ int error;
+ int where;
+ u_char *inbuf;
+ u_long offset;
+ u_long output;
+ u_long len;
+ int idx;
+ u_long virtual_offset, file_offset, file_end, bss_size;
+};
+
+static int exec_gzip_imgact __P((struct image_params *imgp));
+static int NextByte __P((void *vp));
+static int do_aout_hdr __P((struct imgact_gzip *));
+static int Flush __P((void *vp, u_char *, u_long siz));
+
+static int
+exec_gzip_imgact(imgp)
+ struct image_params *imgp;
+{
+ int error, error2 = 0;
+ const u_char *p = (const u_char *) imgp->image_header;
+ struct imgact_gzip igz;
+ struct inflate infl;
+ struct vmspace *vmspace;
+
+ /* If these four are not OK, it isn't a gzip file */
+ if (p[0] != 0x1f)
+ return -1; /* 0 Simply magic */
+ if (p[1] != 0x8b)
+ return -1; /* 1 Simply magic */
+ if (p[2] != 0x08)
+ return -1; /* 2 Compression method */
+ if (p[9] != 0x03)
+ return -1; /* 9 OS compressed on */
+
+ /*
+ * If this one contains anything but a comment or a filename marker,
+ * we don't want to chew on it
+ */
+ if (p[3] & ~(0x18))
+ return ENOEXEC; /* 3 Flags */
+
+ /* These are of no use to us */
+ /* 4-7 Timestamp */
+ /* 8 Extra flags */
+
+ bzero(&igz, sizeof igz);
+ bzero(&infl, sizeof infl);
+ infl.gz_private = (void *) &igz;
+ infl.gz_input = NextByte;
+ infl.gz_output = Flush;
+
+ igz.ip = imgp;
+ igz.idx = 10;
+
+ if (p[3] & 0x08) { /* skip a filename */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ if (p[3] & 0x10) { /* skip a comment */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ igz.len = imgp->attr->va_size;
+
+ error = inflate(&infl);
+
+ if ( !error ) {
+ vmspace = imgp->proc->p_vmspace;
+ error = vm_map_protect(&vmspace->vm_map,
+ (vm_offset_t) vmspace->vm_taddr,
+ (vm_offset_t) (vmspace->vm_taddr +
+ (vmspace->vm_tsize << PAGE_SHIFT)) ,
+ VM_PROT_READ|VM_PROT_EXECUTE,0);
+ }
+
+ if (igz.inbuf) {
+ error2 =
+ vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
+ (vm_offset_t) igz.inbuf + PAGE_SIZE);
+ }
+ if (igz.error || error || error2) {
+ printf("Output=%lu ", igz.output);
+ printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
+ error, igz.error, error2, igz.where);
+ }
+ if (igz.error)
+ return igz.error;
+ if (error)
+ return ENOEXEC;
+ if (error2)
+ return error2;
+ return 0;
+}
+
+static int
+do_aout_hdr(struct imgact_gzip * gz)
+{
+ int error;
+ struct vmspace *vmspace;
+ vm_offset_t vmaddr;
+
+ /*
+ * Set file/virtual offset based on a.out variant. We do two cases:
+ * host byte order and network byte order (for NetBSD compatibility)
+ */
+ switch ((int) (gz->a_out.a_magic & 0xffff)) {
+ case ZMAGIC:
+ gz->virtual_offset = 0;
+ if (gz->a_out.a_text) {
+ gz->file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ gz->file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ gz->where = __LINE__;
+ return (-1);
+ }
+ }
+
+ gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if ( /* entry point must lie within the text region */
+ gz->a_out.a_entry < gz->virtual_offset ||
+ gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
+
+ /* text and data size must each be page rounded */
+ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
+ gz->where = __LINE__;
+ return (-1);
+ }
+ /*
+ * text/data/bss must not exceed limits
+ */
+ if ( /* text can't exceed maximum text size */
+ gz->a_out.a_text > MAXTSIZ ||
+
+ /* data + bss can't exceed rlimit */
+ gz->a_out.a_data + gz->bss_size >
+ gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) {
+ gz->where = __LINE__;
+ return (ENOMEM);
+ }
+ /* Find out how far we should go */
+ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(gz->ip);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(gz->ip);
+
+ vmspace = gz->ip->proc->p_vmspace;
+
+ vmaddr = gz->virtual_offset;
+
+ error = vm_mmap(&vmspace->vm_map,
+ &vmaddr,
+ gz->a_out.a_text + gz->a_out.a_data,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
+ 0,
+ 0);
+
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ if (gz->bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data.
+ * "bss" = 'block started by symbol' - named after the
+ * IBM 7090 instruction of the same name.
+ */
+ vmaddr = gz->virtual_offset + gz->a_out.a_text +
+ gz->a_out.a_data;
+ error = vm_map_find(&vmspace->vm_map,
+ NULL,
+ 0,
+ &vmaddr,
+ gz->bss_size,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ }
+ /* Fill in process VM information */
+ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (gz->virtual_offset + gz->a_out.a_text);
+
+ /* Fill in image_params */
+ gz->ip->interpreted = 0;
+ gz->ip->entry_addr = gz->a_out.a_entry;
+
+ gz->ip->proc->p_sysent = &aout_sysvec;
+
+ return 0;
+}
+
+static int
+NextByte(void *vp)
+{
+ int error;
+ struct imgact_gzip *igz = (struct imgact_gzip *) vp;
+
+ if (igz->idx >= igz->len) {
+ igz->where = __LINE__;
+ return GZ_EOF;
+ }
+ if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
+ return igz->inbuf[(igz->idx++) - igz->offset];
+ }
+ if (igz->inbuf) {
+ error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
+ (vm_offset_t) igz->inbuf + PAGE_SIZE);
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ }
+ igz->offset = igz->idx & ~PAGE_MASK;
+
+ error = vm_mmap(kernel_map, /* map */
+ (vm_offset_t *) & igz->inbuf, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ (caddr_t) igz->ip->vp, /* vnode */
+ igz->offset); /* offset */
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ return igz->inbuf[(igz->idx++) - igz->offset];
+}
+
+static int
+Flush(void *vp, u_char * ptr, u_long siz)
+{
+ struct imgact_gzip *gz = (struct imgact_gzip *) vp;
+ u_char *p = ptr, *q;
+ int i;
+
+ /* First, find an a.out header */
+ if (gz->output < sizeof gz->a_out) {
+ q = (u_char *) &gz->a_out;
+ i = min(siz, sizeof gz->a_out - gz->output);
+ bcopy(p, q + gz->output, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ if (gz->output == sizeof gz->a_out) {
+ i = do_aout_hdr(gz);
+ if (i == -1) {
+ if (!gz->where)
+ gz->where = __LINE__;
+ gz->error = ENOEXEC;
+ return ENOEXEC;
+ } else if (i) {
+ gz->where = __LINE__;
+ gz->error = i;
+ return ENOEXEC;
+ }
+ if (gz->file_offset == 0) {
+ q = (u_char *) (uintptr_t) gz->virtual_offset;
+ copyout(&gz->a_out, q, sizeof gz->a_out);
+ }
+ }
+ }
+ /* Skip over zero-padded first PAGE if needed */
+ if (gz->output < gz->file_offset &&
+ gz->output + siz > gz->file_offset) {
+ i = min(siz, gz->file_offset - gz->output);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
+ i = min(siz, gz->file_end - gz->output);
+ q = (u_char *) (uintptr_t)
+ (gz->virtual_offset + gz->output - gz->file_offset);
+ copyout(p, q, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ gz->output += siz;
+ return 0;
+}
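+
+/*
+ * Added summary (not in the original): Flush() first assembles
+ * sizeof(gz->a_out) bytes of decompressed output into the a.out
+ * header and validates it via do_aout_hdr().  For a QMAGIC image
+ * (file_offset == 0) the header itself belongs at the start of the
+ * text segment, so it is copied out explicitly; for a ZMAGIC image
+ * (file_offset == PAGE_SIZE) the zero-padded first page of the
+ * stream is skipped instead.  Bytes between file_offset and file_end
+ * are copied to their virtual addresses, and anything past file_end
+ * (presumably symbols) is counted but discarded.
+ */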
+
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+
+static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
+EXEC_SET(execgzip, gzip_execsw);
diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c
new file mode 100644
index 0000000..e72b86d
--- /dev/null
+++ b/sys/kern/imgact_shell.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: imgact_shell.c,v 1.16 1997/08/02 14:31:23 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
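+
+/*
+ * Illustrative sketch (not part of the original): the two values
+ * above are just the bytes '#' (0x23) and '!' (0x21) seen through a
+ * 16-bit load:
+ *
+ *	const char hdr[2] = { '#', '!' };
+ *	u_short magic;
+ *
+ *	bcopy(hdr, &magic, sizeof(magic));
+ *
+ * leaves magic == 0x2123 on a little-endian machine and 0x2321 on a
+ * big-endian one.
+ */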
+
+#define MAXSHELLCMDLEN 64
+
+static int exec_shell_imgact __P((struct image_params *imgp));
+
+/*
+ * Shell interpreter image activator. An interpreter name beginning
+ * at imgp->stringbase is the minimal successful exit requirement.
+ */
+static int
+exec_shell_imgact(imgp)
+ struct image_params *imgp;
+{
+ const char *image_header = imgp->image_header;
+ const char *ihp, *line_endp;
+ char *interp;
+
+ /* a shell script? */
+ if (((const short *) image_header)[0] != SHELLMAGIC)
+ return(-1);
+
+ /*
+ * Don't allow a shell script to be the shell for a shell
+ * script. :-)
+ */
+ if (imgp->interpreted)
+ return(ENOEXEC);
+
+ imgp->interpreted = 1;
+
+ /*
+ * Copy shell name and arguments from image_header into string
+ * buffer.
+ */
+
+ /*
+ * Find end of line; return if the line is longer than MAXSHELLCMDLEN.
+ */
+ for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) {
+ if (ihp >= &image_header[MAXSHELLCMDLEN])
+ return(ENOEXEC);
+ }
+ line_endp = ihp;
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* Skip over leading spaces - until the interpreter name */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ /* copy the interpreter name */
+ interp = imgp->interpreter_name;
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t'))
+ *interp++ = *ihp++;
+ *interp = '\0';
+
+ /* Disallow a null interpreter filename */
+ if (*imgp->interpreter_name == '\0')
+ return(ENOEXEC);
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* copy the interpreter name and arguments */
+ while (ihp < line_endp) {
+ /* Skip over leading spaces */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ if (ihp < line_endp) {
+ /*
+ * Copy to end of token. No need to watch stringspace
+ * because this is at the front of the string buffer
+ * and the maximum shell command length is tiny.
+ */
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) {
+ *imgp->stringp++ = *ihp++;
+ imgp->stringspace--;
+ }
+
+ *imgp->stringp++ = 0;
+ imgp->stringspace--;
+
+ imgp->argc++;
+ }
+ }
+
+ imgp->argv0 = imgp->uap->fname;
+
+ return(0);
+}
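+
+/*
+ * Worked example (illustrative only): for a script beginning with
+ *
+ *	#! /bin/sh -x
+ *
+ * the first pass above copies "/bin/sh" into imgp->interpreter_name,
+ * and the second pass pushes the tokens "/bin/sh" and "-x" into the
+ * string buffer, incrementing imgp->argc once per token.
+ */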
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+static const struct execsw shell_execsw = { exec_shell_imgact, "#!" };
+EXEC_SET(shell, shell_execsw);
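+
+/*
+ * Added note: kern_execve.c tries each image activator registered
+ * this way in turn; returning -1 (as exec_shell_imgact() does above
+ * when SHELLMAGIC is absent) signals "not my format" so the next
+ * activator can be attempted.
+ */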
diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c
new file mode 100644
index 0000000..1db9b2c
--- /dev/null
+++ b/sys/kern/inflate.c
@@ -0,0 +1,1078 @@
+/*
+ * Most parts of this file are not covered by:
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $Id: inflate.c,v 1.11 1997/10/12 20:23:40 phk Exp $
+ *
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/inflate.h>
+#ifdef KERNEL
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#endif
+#include <sys/malloc.h>
+
+#ifdef KERNEL
+static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees");
+#endif
+
+/* needed to make inflate() work */
+#define uch u_char
+#define ush u_short
+#define ulg u_long
+
+/* Stuff to make inflate() work */
+#ifdef KERNEL
+#define memzero(dest,len) bzero(dest,len)
+#endif
+#define NOMEMCPY
+#ifdef KERNEL
+#define FPRINTF printf
+#else
+extern void putstr (char *);
+#define FPRINTF putstr
+#endif
+
+#define FLUSH(x,y) { \
+ int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \
+ if (foo) \
+ return foo; \
+ }
+
+static const int qflag = 0;
+
+#ifndef KERNEL /* want to use this file in kzip also */
+extern unsigned char *kzipmalloc (int);
+extern void kzipfree (void*);
+#define malloc(x, y, z) kzipmalloc((x))
+#define free(x, y) kzipfree((x))
+#endif
+
+/*
+ * This came from unzip-5.12. I have changed the flow to pass
+ * a structure pointer around, thus hopefully making it re-entrant.
+ * Poul-Henning
+ */
+
+/* inflate.c -- put in the public domain by Mark Adler
+ version c14o, 23 August 1994 */
+
+/* You can do whatever you like with this source file, though I would
+ prefer that if you modify it and redistribute it that you include
+ comments to that effect with your name and the date. Thank you.
+
+ History:
+ vers date who what
+ ---- --------- -------------- ------------------------------------
+ a ~~ Feb 92 M. Adler used full (large, one-step) lookup table
+ b1 21 Mar 92 M. Adler first version with partial lookup tables
+ b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks
+ b3 22 Mar 92 M. Adler sped up match copies, cleaned up some
+ b4 25 Mar 92 M. Adler added prototypes; removed window[] (now
+ is the responsibility of unzip.h--also
+ changed name to slide[]), so needs diffs
+ for unzip.c and unzip.h (this allows
+ compiling in the small model on MSDOS);
+ fixed cast of q in huft_build();
+ b5 26 Mar 92 M. Adler got rid of unintended macro recursion.
+ b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed
+ bug in inflate_fixed().
+ c1 30 Mar 92 M. Adler removed lbits, dbits environment variables.
+ changed BMAX to 16 for explode. Removed
+ OUTB usage, and replaced it with flush()--
+ this was a 20% speed improvement! Added
+ an explode.c (to replace unimplod.c) that
+ uses the huft routines here. Removed
+ register union.
+ c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
+ c3 10 Apr 92 M. Adler reduced memory of code tables made by
+ huft_build significantly (factor of two to
+ three).
+ c4 15 Apr 92 M. Adler added NOMEMCPY to kill use of memcpy().
+ worked around a Turbo C optimization bug.
+ c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing
+ the 32K window size for specialized
+ applications.
+ c6 31 May 92 M. Adler added some typecasts to eliminate warnings
+ c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug).
+ c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug.
+ c9 9 Oct 92 M. Adler removed a memory error message (~line 416).
+ c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch,
+ removed old inflate, renamed inflate_entry
+ to inflate, added Mark's fix to a comment.
+ c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees.
+ c11 2 Jan 93 M. Adler fixed bug in detection of incomplete
+ tables, and removed assumption that EOB is
+ the longest code (bad assumption).
+ c12 3 Jan 93 M. Adler make tables for fixed blocks only once.
+ c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c
+ outputs one zero length code for an empty
+ distance tree).
+ c14 12 Mar 93 M. Adler made inflate.c standalone with the
+ introduction of inflate.h.
+ c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470.
+ c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays
+ to static for Amiga.
+ c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing.
+ c14e 8 Oct 93 G. Roelofs changed memset() to memzero().
+ c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace()
+ conditional; added inflate_free().
+ c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug)
+ c14h 7 Dec 93 C. Ghisler huft_build() optimizations.
+ c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing;
+ G. Roelofs check NEXTBYTE macro for GZ_EOF.
+ c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd
+ GZ_EOF check.
+ c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings.
+ c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
+ to avoid bug in Encore compiler.
+ c14m 7 Jul 94 P. Kienitz modified to allow assembler version of
+ inflate_codes() (define ASM_INFLATECODES)
+ c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions
+ c14o 23 Aug 94 C. Spieler added a newline to a debug statement;
+ G. Roelofs added another typecast to avoid MSC warning
+ */
+
+
+/*
+ Inflate deflated (PKZIP's method 8 compressed) data. The compression
+ method searches for as much of the current string of bytes (up to a
+ length of 258) in the previous 32K bytes. If it doesn't find any
+ matches (of at least length 3), it codes the next byte. Otherwise, it
+ codes the length of the matched string and its distance backwards from
+ the current position. There is a single Huffman code that codes both
+ single bytes (called "literals") and match lengths. A second Huffman
+ code codes the distance information, which follows a length code. Each
+ length or distance code actually represents a base value and a number
+ of "extra" (sometimes zero) bits to get to add to the base value. At
+ the end of each deflated block is a special end-of-block (EOB) literal/
+ length code. The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+ The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+ defined for them. Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type are really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6). Therefore, to output three times the length, you output
+ three codes (1+1+1), whereas to output four times the same length,
+ you only need two codes (1+3). Hmm.
+ 10. In the tree reconstruction algorithm, Code = Code + Increment
+ only if BitLength(i) is not zero. (Pretty obvious.)
+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)
+ 12. Note: length code 284 can represent 227-258, but length code 285
+ really is 258. The last length deserves its own, short code
+ since it gets used a lot in very redundant files. The length
+ 258 is special since 258 - 3 (the min match length) is 255.
+ 13. The literal/length and distance code bit lengths are read as a
+ single stream of lengths. It is possible (and advantageous) for
+ a repeat code (16, 17, or 18) to go across the boundary between
+ the two sets of lengths.
+ */
+
+
+#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */
+
+/*
+ inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE,
+ FLUSH() and memzero macros. If the window size is not 32K, it
+ should also define GZ_WSIZE. If INFMOD is defined, it can include
+ compiled functions to support the NEXTBYTE and/or FLUSH() macros.
+ There are defaults for NEXTBYTE and FLUSH() below for use as
+ examples of what those functions need to do. Normally, you would
+ also want FLUSH() to compute a crc on the data. inflate.h also
+ needs to provide these typedefs:
+
+ typedef unsigned char uch;
+ typedef unsigned short ush;
+ typedef unsigned long ulg;
+
+ This module uses the external functions malloc() and free() (and
+ probably memset() or bzero() in the memzero() macro). Their
+ prototypes are normally found in <string.h> and <stdlib.h>.
+ */
+#define INFMOD /* tell inflate.h to include code to be
+ * compiled */
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+ that have 16-bit pointers (e.g. PC's in the small or medium model).
+ Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16
+ means that v is a literal, 16 < e < 32 means that v is a pointer to
+ the next table, which codes e - 16 bits, and lastly e == 99 indicates
+ an unused code. If a code with e == 99 is looked up, this implies an
+ error in the data. */
+struct huft {
+ uch e; /* number of extra bits or operation */
+ uch b; /* number of bits in this code or subcode */
+ union {
+ ush n; /* literal, length base, or distance
+ * base */
+ struct huft *t; /* pointer to next level of table */
+ } v;
+};
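+
+/*
+ * Lookup sketch (illustrative, mirroring inflate_codes() below): with
+ * the low bits of the bit buffer b in hand,
+ *
+ *	t = tl + ((unsigned) b & mask[bl]);
+ *
+ * then t->e == 16 emits the literal t->v.n, t->e == 15 ends the
+ * block, 16 < t->e < 32 descends into the subtable t->v.t (decoding
+ * t->e - 16 further bits), and t->e <= 13 means t->v.n is a base
+ * value with t->e extra bits still to fetch.
+ */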
+
+
+/* Function prototypes */
+static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *));
+static int huft_free __P((struct inflate *, struct huft *));
+static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int));
+static int inflate_stored __P((struct inflate *));
+static int xinflate __P((struct inflate *));
+static int inflate_fixed __P((struct inflate *));
+static int inflate_dynamic __P((struct inflate *));
+static int inflate_block __P((struct inflate *, int *));
+
+/* The inflate algorithm uses a sliding 32K byte window on the uncompressed
+ stream to find repeated byte strings. This is implemented here as a
+ circular buffer. The index is updated simply by incrementing and then
+ and'ing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32K area. It is assumed
+ to be usable as if it were declared "uch slide[32768];" or as just
+ "uch *slide;" and then malloc'ed in the latter case. The definition
+ must be in unzip.h, included above. */
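+
+/*
+ * Index sketch (illustrative): the circular update described above
+ * amounts to
+ *
+ *	glbl->gz_slide[w++] = byte;
+ *	w &= GZ_WSIZE - 1;
+ *
+ * though the routines below instead FLUSH() and reset w to zero each
+ * time it reaches GZ_WSIZE, which wraps the window the same way.
+ */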
+
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+
+/* Order of the bit length code lengths */
+static const unsigned border[] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ /* note: see note #13 above about the 258 in this list. */
+
+static const ush cplext[] = { /* Extra bits for literal codes 257..285 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+
+static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577};
+
+static const ush cpdext[] = { /* Extra bits for distance codes */
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 12, 13, 13};
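+
+/*
+ * Worked example (illustrative): a match of length 10 at distance 100
+ * is sent as literal/length code 264 (cplens[264 - 257] == 10, zero
+ * extra bits per cplext) followed by distance code 13 (cpdist[13] ==
+ * 97, cpdext[13] == 5), the five extra bits holding 100 - 97 == 3.
+ */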
+
+/* And'ing with mask[n] masks the lower n bits */
+static const ush mask[] = {
+ 0x0000,
+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* Macros for inflate() bit peeking and grabbing.
+ The usage is:
+
+ NEEDBITS(glbl,j)
+ x = b & mask[j];
+ DUMPBITS(j)
+
+ where NEEDBITS makes sure that b has at least j bits in it, and
+ DUMPBITS removes the bits from b. The macros use the variable k
+ for the number of bits in b. Normally, b and k are register
+ variables for speed, and are initialized at the begining of a
+ routine that uses these macros from a global bit buffer and count.
+
+ In order to not ask for more bits than there are in the compressed
+ stream, the Huffman tables are constructed to only ask for just
+ enough bits to make up the end-of-block code (value 256). Then no
+ bytes need to be "returned" to the buffer at the end of the last
+ block. See the huft_build() routine.
+ */
+
+/*
+ * The following 2 were global variables.
+ * They are now fields of the inflate structure.
+ */
+
+#define NEEDBITS(glbl,n) { \
+ while(k<(n)) { \
+ int c=(*glbl->gz_input)(glbl->gz_private); \
+ if(c==GZ_EOF) \
+ return 1; \
+ b|=((ulg)c)<<k; \
+ k+=8; \
+ } \
+ }
+
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
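+
+/*
+ * Usage sketch (illustrative): to read a 3-bit field into t, with b
+ * and k as the local bit buffer and bit count,
+ *
+ *	NEEDBITS(glbl, 3)
+ *	t = (unsigned) b & mask[3];
+ *	DUMPBITS(3)
+ *
+ * much as inflate_dynamic() below reads each 3-bit code length.
+ */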
+
+/*
+ Huffman code decoding is performed using a multi-level table lookup.
+ The fastest way to decode is to simply build a lookup table whose
+ size is determined by the longest code. However, the time it takes
+ to build this table can also be a factor if the data being decoded
+ is not very long. The most common codes are necessarily the
+ shortest codes, so those codes dominate the decoding time, and hence
+ the speed. The idea is you can have a shorter table that decodes the
+ shorter, more probable codes, and then point to subsidiary tables for
+ the longer codes. The time it costs to decode the longer codes is
+ then traded against the time it takes to make longer tables.
+
+ The results of this trade-off are in the variables lbits and dbits
+ below. lbits is the number of bits the first level table for literal/
+ length codes can decode in one step, and dbits is the same thing for
+ the distance codes. Subsequent tables are also less than or equal to
+ those sizes. These values may be adjusted either when all of the
+ codes are shorter than that, in which case the longest code length in
+ bits is used, or when the shortest code is *longer* than the requested
+ table size, in which case the length of the shortest code in bits is
+ used.
+
+ There are two different values for the two tables, since they code a
+ different number of possibilities each. The literal/length table
+ codes 286 possible values, or in a flat code, a little over eight
+ bits. The distance table codes 30 possible values, or a little less
+ than five bits, flat. The optimum values for speed end up being
+ about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+ The optimum values may differ though from machine to machine, and
+ possibly even between compilers. Your mileage may vary.
+ */
+
+static const int lbits = 9; /* bits in base literal/length lookup table */
+static const int dbits = 6; /* bits in base distance lookup table */
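+
+/*
+ * Worked check (illustrative): a flat code over the 286 literal/length
+ * values needs log2(286) ~ 8.2 bits and one over the 30 distance
+ * values log2(30) ~ 4.9 bits; adding the "one bit more" suggested
+ * above yields the 9 and 6 chosen here.
+ */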
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16 /* maximum bit length of any code (16 for
+ * explode) */
+#define N_MAX 288 /* maximum number of codes in any set */
+
+/* Given a list of code lengths and a maximum table size, make a set of
+ tables to decode that set of codes. Return zero on success, one if
+ the given code set is incomplete (the tables are still built in this
+ case), two if the input is invalid (all zero length codes or an
+ oversubscribed set of lengths), and three if not enough memory.
+ The code with value 256 is special, and the tables are constructed
+ so that no bits beyond that code are fetched when that code is
+ decoded. */
+static int
+huft_build(glbl, b, n, s, d, e, t, m)
+ struct inflate *glbl;
+ unsigned *b; /* code lengths in bits (all assumed <= BMAX) */
+ unsigned n; /* number of codes (assumed <= N_MAX) */
+ unsigned s; /* number of simple-valued codes (0..s-1) */
+ const ush *d; /* list of base values for non-simple codes */
+ const ush *e; /* list of extra bits for non-simple codes */
+ struct huft **t; /* result: starting table */
+ int *m; /* maximum lookup bits, returns actual */
+{
+ unsigned a; /* counter for codes of length k */
+ unsigned c[BMAX + 1]; /* bit length count table */
+ unsigned el; /* length of EOB code (value 256) */
+ unsigned f; /* i repeats in table every f entries */
+ int g; /* maximum code length */
+ int h; /* table level */
+ register unsigned i; /* counter, current code */
+ register unsigned j; /* counter */
+ register int k; /* number of bits in current code */
+ int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */
+ int *l = lx + 1; /* stack of bits per table */
+ register unsigned *p; /* pointer into c[], b[], or v[] */
+ register struct huft *q;/* points to current table */
+ struct huft r; /* table entry for structure assignment */
+ struct huft *u[BMAX];/* table stack */
+ unsigned v[N_MAX]; /* values in order of bit length */
+ register int w; /* bits before this table == (l * h) */
+ unsigned x[BMAX + 1]; /* bit offsets, then code stack */
+ unsigned *xp; /* pointer into x */
+ int y; /* number of dummy codes added */
+ unsigned z; /* number of entries in current table */
+
+ /* Generate counts for each bit length */
+ el = n > 256 ? b[256] : BMAX; /* set length of EOB code, if any */
+#ifdef KERNEL
+ memzero((char *) c, sizeof(c));
+#else
+ for (i = 0; i < BMAX+1; i++)
+ c [i] = 0;
+#endif
+ p = b;
+ i = n;
+ do {
+ c[*p]++;
+ p++; /* assume all entries <= BMAX */
+ } while (--i);
+ if (c[0] == n) { /* null input--all zero length codes */
+ *t = (struct huft *) NULL;
+ *m = 0;
+ return 0;
+ }
+ /* Find minimum and maximum length, bound *m by those */
+ for (j = 1; j <= BMAX; j++)
+ if (c[j])
+ break;
+ k = j; /* minimum code length */
+ if ((unsigned) *m < j)
+ *m = j;
+ for (i = BMAX; i; i--)
+ if (c[i])
+ break;
+ g = i; /* maximum code length */
+ if ((unsigned) *m > i)
+ *m = i;
+
+ /* Adjust last length count to fill out codes, if needed */
+ for (y = 1 << j; j < i; j++, y <<= 1)
+ if ((y -= c[j]) < 0)
+ return 2; /* bad input: more codes than bits */
+ if ((y -= c[i]) < 0)
+ return 2;
+ c[i] += y;
+
+ /* Generate starting offsets into the value table for each length */
+ x[1] = j = 0;
+ p = c + 1;
+ xp = x + 2;
+ while (--i) { /* note that i == g from above */
+ *xp++ = (j += *p++);
+ }
+
+ /* Make a table of values in order of bit lengths */
+ p = b;
+ i = 0;
+ do {
+ if ((j = *p++) != 0)
+ v[x[j]++] = i;
+ } while (++i < n);
+
+ /* Generate the Huffman codes and for each, make the table entries */
+ x[0] = i = 0; /* first Huffman code is zero */
+ p = v; /* grab values in bit order */
+ h = -1; /* no tables yet--level -1 */
+ w = l[-1] = 0; /* no bits decoded yet */
+ u[0] = (struct huft *) NULL; /* just to keep compilers happy */
+ q = (struct huft *) NULL; /* ditto */
+ z = 0; /* ditto */
+
+ /* go through the bit lengths (k already is bits in shortest code) */
+ for (; k <= g; k++) {
+ a = c[k];
+ while (a--) {
+ /*
+ * here i is the Huffman code of length k bits for
+ * value *p
+ */
+ /* make tables up to required level */
+ while (k > w + l[h]) {
+ w += l[h++]; /* add bits already decoded */
+
+ /*
+ * compute minimum size table less than or
+ * equal to *m bits
+ */
+ z = (z = g - w) > (unsigned) *m ? *m : z; /* upper limit */
+ if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table;
+ * too few codes for
+ * k-w bit table */
+ f -= a + 1; /* deduct codes from
+ * patterns left */
+ xp = c + k;
+ while (++j < z) { /* try smaller tables up
+ * to z bits */
+ if ((f <<= 1) <= *++xp)
+ break; /* enough codes to use
+ * up j bits */
+ f -= *xp; /* else deduct codes
+ * from patterns */
+ }
+ }
+ if ((unsigned) w + j > el && (unsigned) w < el)
+ j = el - w; /* make EOB code end at
+ * table */
+ z = 1 << j; /* table entries for j-bit
+ * table */
+ l[h] = j; /* set table size in stack */
+
+ /* allocate and link in new table */
+ if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) ==
+ (struct huft *) NULL) {
+ if (h)
+ huft_free(glbl, u[0]);
+ return 3; /* not enough memory */
+ }
+ glbl->gz_hufts += z + 1; /* track memory usage */
+ *t = q + 1; /* link to list for
+ * huft_free() */
+ *(t = &(q->v.t)) = (struct huft *) NULL;
+ u[h] = ++q; /* table starts after link */
+
+ /* connect to last table, if there is one */
+ if (h) {
+ x[h] = i; /* save pattern for
+ * backing up */
+ r.b = (uch) l[h - 1]; /* bits to dump before
+ * this table */
+ r.e = (uch) (16 + j); /* bits in this table */
+ r.v.t = q; /* pointer to this table */
+ j = (i & ((1 << w) - 1)) >> (w - l[h - 1]);
+ u[h - 1][j] = r; /* connect to last table */
+ }
+ }
+
+ /* set up table entry in r */
+ r.b = (uch) (k - w);
+ if (p >= v + n)
+ r.e = 99; /* out of values--invalid
+ * code */
+ else if (*p < s) {
+ r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block
+ * code */
+ r.v.n = *p++; /* simple code is just the
+ * value */
+ } else {
+ r.e = (uch) e[*p - s]; /* non-simple--look up
+ * in lists */
+ r.v.n = d[*p++ - s];
+ }
+
+ /* fill code-like entries with r */
+ f = 1 << (k - w);
+ for (j = i >> w; j < z; j += f)
+ q[j] = r;
+
+ /* backwards increment the k-bit code i */
+ for (j = 1 << (k - 1); i & j; j >>= 1)
+ i ^= j;
+ i ^= j;
+
+ /* backup over finished tables */
+ while ((i & ((1 << w) - 1)) != x[h])
+ w -= l[--h]; /* don't need to update q */
+ }
+ }
+
+ /* return actual size of base table */
+ *m = l[0];
+
+ /* Return true (1) if we were given an incomplete table */
+ return y != 0 && g != 1;
+}
+
+static int
+huft_free(glbl, t)
+ struct inflate *glbl;
+ struct huft *t; /* table to free */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+ list of the tables it made, with the links in a dummy first entry of
+ each table. */
+{
+ register struct huft *p, *q;
+
+ /* Go through linked list, freeing from the malloced (t[-1]) address. */
+ p = t;
+ while (p != (struct huft *) NULL) {
+ q = (--p)->v.t;
+ free(p, M_GZIP);
+ p = q;
+ }
+ return 0;
+}
+
+/* inflate (decompress) the codes in a deflated (compressed) block.
+ Return an error code or zero if it all goes ok. */
+static int
+inflate_codes(glbl, tl, td, bl, bd)
+ struct inflate *glbl;
+ struct huft *tl, *td;/* literal/length and distance decoder tables */
+ int bl, bd; /* number of bits decoded by tl[] and td[] */
+{
+ register unsigned e; /* table entry flag/number of extra bits */
+ unsigned n, d; /* length and index for copy */
+ unsigned w; /* current window position */
+ struct huft *t; /* pointer to table entry */
+ unsigned ml, md; /* masks for bl and bd bits */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* inflate the coded data */
+ ml = mask[bl]; /* precompute masks for speed */
+ md = mask[bd];
+ while (1) { /* do until end of block */
+ NEEDBITS(glbl, (unsigned) bl)
+ if ((e = (t = tl + ((unsigned) b & ml))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ if (e == 16) { /* then it's a literal */
+ glbl->gz_slide[w++] = (uch) t->v.n;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } else { /* it's an EOB or a length */
+ /* exit if end of block */
+ if (e == 15)
+ break;
+
+ /* get length of block to copy */
+ NEEDBITS(glbl, e)
+ n = t->v.n + ((unsigned) b & mask[e]);
+ DUMPBITS(e);
+
+ /* decode distance of block to copy */
+ NEEDBITS(glbl, (unsigned) bd)
+ if ((e = (t = td + ((unsigned) b & md))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ NEEDBITS(glbl, e)
+ d = w - t->v.n - ((unsigned) b & mask[e]);
+ DUMPBITS(e)
+ /* do the copy */
+ do {
+ n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e);
+#ifndef NOMEMCPY
+ if (w - d >= e) { /* (this test assumes
+ * unsigned comparison) */
+ memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e);
+ w += e;
+ d += e;
+ } else /* do it slow to avoid memcpy()
+ * overlap */
+#endif /* !NOMEMCPY */
+ do {
+ glbl->gz_slide[w++] = glbl->gz_slide[d++];
+ } while (--e);
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } while (n);
+ }
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+
+ /* done */
+ return 0;
+}
+
+/* "decompress" an inflated type 0 (stored) block. */
+static int
+inflate_stored(glbl)
+ struct inflate *glbl;
+{
+ unsigned n; /* number of bytes in block */
+ unsigned w; /* current window position */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* go to byte boundary */
+ n = k & 7;
+ DUMPBITS(n);
+
+ /* get the length and its complement */
+ NEEDBITS(glbl, 16)
+ n = ((unsigned) b & 0xffff);
+ DUMPBITS(16)
+ NEEDBITS(glbl, 16)
+ if (n != (unsigned) ((~b) & 0xffff))
+ return 1; /* error in compressed data */
+ DUMPBITS(16)
+ /* read and output the compressed data */
+ while (n--) {
+ NEEDBITS(glbl, 8)
+ glbl->gz_slide[w++] = (uch) b;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ DUMPBITS(8)
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+ return 0;
+}
+
+/* decompress an inflated type 1 (fixed Huffman codes) block. We should
+ either replace this with a custom decoder, or at least precompute the
+ Huffman tables. */
+static int
+inflate_fixed(glbl)
+ struct inflate *glbl;
+{
+ /* if first time, set up tables for fixed blocks */
+ if (glbl->gz_fixed_tl == (struct huft *) NULL) {
+ int i; /* temporary variable */
+ static unsigned l[288]; /* length list for huft_build */
+
+ /* literal table */
+ for (i = 0; i < 144; i++)
+ l[i] = 8;
+ for (; i < 256; i++)
+ l[i] = 9;
+ for (; i < 280; i++)
+ l[i] = 7;
+ for (; i < 288; i++) /* make a complete, but wrong code
+ * set */
+ l[i] = 8;
+ glbl->gz_fixed_bl = 7;
+ if ((i = huft_build(glbl, l, 288, 257, cplens, cplext,
+ &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) {
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ /* distance table */
+ for (i = 0; i < 30; i++) /* make an incomplete code
+ * set */
+ l[i] = 5;
+ glbl->gz_fixed_bd = 5;
+ if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext,
+ &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ }
+ /* decompress until an end-of-block code */
+ return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0;
+}
+
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+static int
+inflate_dynamic(glbl)
+ struct inflate *glbl;
+{
+ int i; /* temporary variables */
+ unsigned j;
+ unsigned l; /* last length */
+ unsigned m; /* mask for bit lengths table */
+ unsigned n; /* number of lengths to get */
+ struct huft *tl; /* literal/length code table */
+ struct huft *td; /* distance code table */
+ int bl; /* lookup bits for tl */
+ int bd; /* lookup bits for td */
+ unsigned nb; /* number of bit length codes */
+ unsigned nl; /* number of literal/length codes */
+ unsigned nd; /* number of distance codes */
+#ifdef PKZIP_BUG_WORKAROUND
+ unsigned ll[288 + 32]; /* literal/length and distance code
+ * lengths */
+#else
+ unsigned ll[286 + 30]; /* literal/length and distance code
+ * lengths */
+#endif
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in table lengths */
+ NEEDBITS(glbl, 5)
+ nl = 257 + ((unsigned) b & 0x1f); /* number of
+ * literal/length codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 5)
+ nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 4)
+ nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */
+ DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+ if (nl > 288 || nd > 32)
+#else
+ if (nl > 286 || nd > 30)
+#endif
+ return 1; /* bad lengths */
+ /* read in bit-length-code lengths */
+ for (j = 0; j < nb; j++) {
+ NEEDBITS(glbl, 3)
+ ll[border[j]] = (unsigned) b & 7;
+ DUMPBITS(3)
+ }
+ for (; j < 19; j++)
+ ll[border[j]] = 0;
+
+ /* build decoding table for trees--single level, 7 bit lookup */
+ bl = 7;
+ if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) {
+ if (i == 1)
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+ }
+ /* read in literal and distance code lengths */
+ n = nl + nd;
+ m = mask[bl];
+ i = l = 0;
+ while ((unsigned) i < n) {
+ NEEDBITS(glbl, (unsigned) bl)
+ j = (td = tl + ((unsigned) b & m))->b;
+ DUMPBITS(j)
+ j = td->v.n;
+ if (j < 16) /* length of code in bits (0..15) */
+ ll[i++] = l = j; /* save last length in l */
+ else if (j == 16) { /* repeat last length 3 to 6 times */
+ NEEDBITS(glbl, 2)
+ j = 3 + ((unsigned) b & 3);
+ DUMPBITS(2)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = l;
+ } else if (j == 17) { /* 3 to 10 zero length codes */
+ NEEDBITS(glbl, 3)
+ j = 3 + ((unsigned) b & 7);
+ DUMPBITS(3)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ } else { /* j == 18: 11 to 138 zero length codes */
+ NEEDBITS(glbl, 7)
+ j = 11 + ((unsigned) b & 0x7f);
+ DUMPBITS(7)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ }
+ }
+
+ /* free decoding table for trees */
+ huft_free(glbl, tl);
+
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* build the decoding tables for literal/length and distance codes */
+ bl = lbits;
+ i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete l-tree) ");
+ huft_free(glbl, tl);
+ }
+ return i; /* incomplete code set */
+ }
+ bd = dbits;
+ i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete d-tree) ");
+#ifdef PKZIP_BUG_WORKAROUND
+ i = 0;
+ }
+#else
+ huft_free(glbl, td);
+ }
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+#endif
+ }
+ /* decompress until an end-of-block code */
+ if (inflate_codes(glbl, tl, td, bl, bd))
+ return 1;
+
+ /* free the decoding tables, return */
+ huft_free(glbl, tl);
+ huft_free(glbl, td);
+ return 0;
+}
+
+/* decompress an inflated block */
+static int
+inflate_block(glbl, e)
+ struct inflate *glbl;
+ int *e; /* last block flag */
+{
+ unsigned t; /* block type */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in last block bit */
+ NEEDBITS(glbl, 1)
+ *e = (int) b & 1;
+ DUMPBITS(1)
+ /* read in block type */
+ NEEDBITS(glbl, 2)
+ t = (unsigned) b & 3;
+ DUMPBITS(2)
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* inflate that block type */
+ if (t == 2)
+ return inflate_dynamic(glbl);
+ if (t == 0)
+ return inflate_stored(glbl);
+ if (t == 1)
+ return inflate_fixed(glbl);
+ /* bad block type */
+ return 2;
+}
+
+
+
+/* decompress an inflated entry */
+static int
+xinflate(glbl)
+ struct inflate *glbl;
+{
+ int e; /* last block flag */
+ int r; /* result code */
+ unsigned h; /* maximum struct huft's malloc'ed */
+
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+
+ /* initialize window, bit buffer */
+ glbl->gz_wp = 0;
+ glbl->gz_bk = 0;
+ glbl->gz_bb = 0;
+
+ /* decompress until the last block */
+ h = 0;
+ do {
+ glbl->gz_hufts = 0;
+ if ((r = inflate_block(glbl, &e)) != 0)
+ return r;
+ if (glbl->gz_hufts > h)
+ h = glbl->gz_hufts;
+ } while (!e);
+
+ /* flush out slide */
+ FLUSH(glbl, glbl->gz_wp);
+
+ /* return success */
+ return 0;
+}
+
+/* Nobody uses this - why not? */
+int
+inflate(glbl)
+ struct inflate *glbl;
+{
+ int i;
+#ifdef KERNEL
+ u_char *p = NULL;
+
+ if (!glbl->gz_slide)
+ p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK);
+#endif
+ if (!glbl->gz_slide)
+#ifdef KERNEL
+ return(ENOMEM);
+#else
+ return 3; /* kzip expects 3 */
+#endif
+ i = xinflate(glbl);
+
+ if (glbl->gz_fixed_td != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_td);
+ glbl->gz_fixed_td = (struct huft *) NULL;
+ }
+ if (glbl->gz_fixed_tl != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ }
+#ifdef KERNEL
+ if (p == glbl->gz_slide) {
+ free(glbl->gz_slide, M_GZIP);
+ glbl->gz_slide = NULL;
+ }
+#endif
+ return i;
+}
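+
+/*
+ * Usage sketch (illustrative, using only fields visible in this
+ * file): a caller fills in a struct inflate with gz_input (returns
+ * the next compressed byte or GZ_EOF), gz_output (consumes a run of
+ * decompressed bytes from gz_slide), gz_private (opaque state handed
+ * to both callbacks) and optionally a preallocated gz_slide window,
+ * then calls inflate().  The NextByte()/Flush() pair in imgact_gzip.c
+ * above is one such client.
+ */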
+/* ----------------------- END INFLATE.C */
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
new file mode 100644
index 0000000..246684f
--- /dev/null
+++ b/sys/kern/init_main.c
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 1995 Terrence R. Lambert
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ * $Id: init_main.c,v 1.102 1998/12/30 10:38:58 dfr Exp $
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+
+#include <machine/cpu.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <sys/copyright.h>
+
+extern struct linker_set sysinit_set; /* XXX */
+
+extern void __main __P((void));
+extern void main __P((void *framep));
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+static struct pcred cred0;
+#ifdef COMPAT_LINUX_THREADS
+static struct procsig procsig0;
+#endif /* COMPAT_LINUX_THREADS */
+static struct filedesc0 filedesc0;
+static struct plimit limit0;
+static struct vmspace vmspace0;
+struct proc *initproc;
+
+int cmask = CMASK;
+extern struct user *proc0paddr;
+
+struct vnode *rootvp;
+int boothowto = 0; /* initialized so that it can be patched */
+
+struct timeval boottime;
+SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime,
+ CTLFLAG_RD, &boottime, timeval, "");
+
+static int shutdowntimeout = 120;
+SYSCTL_INT(_kern, OID_AUTO, shutdown_timeout,
+ CTLFLAG_RW, &shutdowntimeout, 0, "");
+
+/*
+ * Promiscuous argument pass for start_init()
+ *
+ * This is a kludge because we use a return from main() rather than a call
+ * to a new routine in locore.s to kick the kernel alive from locore.s.
+ */
+static void *init_framep;
+
+
+#if __GNUC__ >= 2
+void __main() {}
+#endif
+
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined. An entry whose subsystem ID is
+ * SI_SUB_DUMMY is never executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL)
+
+/*
+ * The sysinit table itself. Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit.
+ */
+struct sysinit **sysinit = (struct sysinit **)sysinit_set.ls_items;
+struct sysinit **newsysinit;
+
+/*
+ * Merge a new sysinit set into the current set, reallocating it if
+ * necessary. This can only be called after malloc is running.
+ */
+void
+sysinit_add(set)
+ struct sysinit **set;
+{
+ struct sysinit **newset;
+ struct sysinit **sipp;
+ struct sysinit **xipp;
+ int count = 0;
+
+ if (newsysinit)
+ for (sipp = newsysinit; *sipp; sipp++)
+ count++;
+ else
+ for (sipp = sysinit; *sipp; sipp++)
+ count++;
+ for (sipp = set; *sipp; sipp++)
+ count++;
+ count++; /* Trailing NULL */
+ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
+ if (newset == NULL)
+ panic("cannot malloc for sysinit");
+ xipp = newset;
+ if (newsysinit)
+ for (sipp = newsysinit; *sipp; sipp++)
+ *xipp++ = *sipp;
+ else
+ for (sipp = sysinit; *sipp; sipp++)
+ *xipp++ = *sipp;
+ for (sipp = set; *sipp; sipp++)
+ *xipp++ = *sipp;
+ *xipp = NULL;
+ if (newsysinit)
+ free(newsysinit, M_TEMP);
+ newsysinit = newset;
+}
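+
+/*
+ * Added note: sysinit_add() is how newly loaded kernel modules
+ * contribute SYSINITs after boot; main() below notices a non-NULL
+ * newsysinit after finishing each task, swaps it in and restarts its
+ * scan, relying on the SI_SUB_DONE marking to skip entries that have
+ * already run.
+ */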
+
+/*
+ * System startup; initialize the world, create process 0, mount root
+ * filesystem, and fork to create init and pagedaemon. Most of the
+ * hard work is done in the lower-level initialization routines including
+ * startup(), which does memory initialization and autoconfiguration.
+ *
+ * This allows simple addition of new kernel subsystems that require
+ * boot time initialization. It also allows substitution of subsystem
+ * (for instance, a scheduler, kernel profiler, or VM system) by object
+ * module. Finally, it allows for optional "kernel threads".
+ */
+void
+main(framep)
+ void *framep;
+{
+
+ register struct sysinit **sipp; /* system initialization*/
+ register struct sysinit **xipp; /* interior loop of sort*/
+ register struct sysinit *save; /* bubble*/
+
+ /*
+ * Copy the locore.s frame pointer for proc0, this is forked into
+ * all other processes.
+ */
+ init_framep = framep;
+
+restart:
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ */
+ for (sipp = sysinit; *sipp; sipp++) {
+ for (xipp = sipp + 1; *xipp; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order < (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ *
+ * The last item on the list is expected to be the scheduler,
+ * which will not return.
+ */
+ for (sipp = sysinit; *sipp; sipp++) {
+
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ if ((*sipp)->subsystem == SI_SUB_DONE)
+ continue;
+
+ switch( (*sipp)->type) {
+ case SI_TYPE_DEFAULT:
+ /* no special processing*/
+ (*((*sipp)->func))((*sipp)->udata);
+ break;
+
+ case SI_TYPE_KTHREAD:
+#if !defined(SMP)
+ /* kernel thread*/
+ if (fork1(&proc0, RFMEM|RFFDG|RFPROC))
+ panic("fork kernel thread");
+ cpu_set_fork_handler(pfind(proc0.p_retval[0]),
+ (*sipp)->func, (*sipp)->udata);
+ break;
+#endif
+
+ case SI_TYPE_KPROCESS:
+ if (fork1(&proc0, RFFDG|RFPROC))
+ panic("fork kernel process");
+ cpu_set_fork_handler(pfind(proc0.p_retval[0]),
+ (*sipp)->func, (*sipp)->udata);
+ break;
+
+ default:
+ panic("init_main: unrecognized init type");
+ }
+
+ /* Check off the one we've just done */
+ (*sipp)->subsystem = SI_SUB_DONE;
+
+ /* Check if we've installed more sysinit items via KLD */
+ if (newsysinit != NULL) {
+ if (sysinit != (struct sysinit **)sysinit_set.ls_items)
+ free(sysinit, M_TEMP);
+ sysinit = newsysinit;
+ newsysinit = NULL;
+ goto restart;
+ }
+ }
+
+ panic("Shouldn't get here!");
+ /* NOTREACHED*/
+}
+
+
+/*
+ * Start a kernel process. This is called after a fork() call in
+ * main() in the file kern/init_main.c.
+ *
+ * This function is used to start "internal" daemons.
+ */
+/* ARGSUSED*/
+void
+kproc_start(udata)
+ void *udata;
+{
+ struct kproc_desc *kp = udata;
+ struct proc *p = curproc;
+
+#ifdef DIAGNOSTIC
+ printf("Start pid=%d <%s>\n",p->p_pid, kp->arg0);
+#endif
+
+ /* save a global descriptor, if desired*/
+ if( kp->global_procpp != NULL)
+ *kp->global_procpp = p;
+
+ /* this is a non-swapped system process*/
+ p->p_flag |= P_INMEM | P_SYSTEM;
+
+ /* set up arg0 for 'ps', et al*/
+ strcpy( p->p_comm, kp->arg0);
+
+ /* call the process's main()...*/
+ (*kp->func)();
+
+ /* NOTREACHED */
+ panic("kproc_start: %s", kp->arg0);
+}
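+
+/*
+ * Usage sketch (hypothetical example; assumes struct kproc_desc lays
+ * out arg0, func and global_procpp as used above, and that SYSINIT_KT
+ * registers an SI_TYPE_KTHREAD entry):
+ *
+ *	static struct proc *examplep;
+ *	static struct kproc_desc example_kp = {
+ *		"example",
+ *		example_main,
+ *		&examplep,
+ *	};
+ *	SYSINIT_KT(example, SI_SUB_KTHREAD_IDLE, SI_ORDER_FIRST,
+ *	    kproc_start, &example_kp)
+ *
+ * so that kproc_start() runs example_main() in a forked kernel thread.
+ */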
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's belong elsewhere, but have not yet
+ **** been moved.
+ ****
+ ***************************************************************************
+ */
+#ifdef OMIT
+/*
+ * Handled by vfs_mountroot (bad idea) at this time... should be
+ * done the same as 4.4Lite2.
+ */
+SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL)
+#endif /* OMIT*/
+
+static void print_caddr_t __P((void *data));
+static void
+print_caddr_t(data)
+ void *data;
+{
+ printf("%s", (char *)data);
+}
+SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The two following SYSINIT's are proc0-specific glue code. I am not
+ **** convinced that they cannot be safely combined, but their order of
+ **** operation has been maintained as the same as the original init_main.c
+ **** for right now.
+ ****
+ **** These probably belong in init_proc.c or kern_proc.c, since they
+ **** deal with proc0 (the fork template process).
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void proc0_init __P((void *dummy));
+static void
+proc0_init(dummy)
+ void *dummy;
+{
+ register struct proc *p;
+ register struct filedesc0 *fdp;
+ register unsigned i;
+
+ /*
+ * Initialize the current process pointer (curproc) before
+ * any possible traps/probes to simplify trap processing.
+ */
+ p = &proc0;
+ curproc = p; /* XXX redundant*/
+
+ /*
+ * Initialize process and pgrp structures.
+ */
+ procinit();
+
+ /*
+ * Initialize sleep queue hash table
+ */
+ sleepinit();
+
+ /*
+ * additional VM structures
+ */
+ vm_init2();
+
+ /*
+ * Create process 0 (the swapper).
+ */
+ LIST_INSERT_HEAD(&allproc, p, p_list);
+ p->p_pgrp = &pgrp0;
+ LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
+ LIST_INIT(&pgrp0.pg_members);
+ LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
+
+ pgrp0.pg_session = &session0;
+ session0.s_count = 1;
+ session0.s_leader = p;
+
+ p->p_sysent = &aout_sysvec;
+
+ p->p_flag = P_INMEM | P_SYSTEM;
+ p->p_stat = SRUN;
+ p->p_nice = NZERO;
+ p->p_rtprio.type = RTP_PRIO_NORMAL;
+ p->p_rtprio.prio = 0;
+
+/*
+ * Link for kernel based threads
+ */
+ p->p_peers = 0;
+ p->p_leader = p;
+
+ bcopy("swapper", p->p_comm, sizeof ("swapper"));
+
+ /* Create credentials. */
+ cred0.p_refcnt = 1;
+ p->p_cred = &cred0;
+ p->p_ucred = crget();
+ p->p_ucred->cr_ngroups = 1; /* group 0 */
+
+#ifdef COMPAT_LINUX_THREADS
+ /* Create procsig. */
+ p->p_procsig = &procsig0;
+ p->p_procsig->ps_refcnt = 2;
+
+#endif /* COMPAT_LINUX_THREADS */
+ /* Create the file descriptor table. */
+ fdp = &filedesc0;
+ p->p_fd = &fdp->fd_fd;
+ fdp->fd_fd.fd_refcnt = 1;
+ fdp->fd_fd.fd_cmask = cmask;
+ fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
+ fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
+ fdp->fd_fd.fd_nfiles = NDFILE;
+
+ /* Create the limits structures. */
+ p->p_limit = &limit0;
+ for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++)
+ limit0.pl_rlimit[i].rlim_cur =
+ limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY;
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
+ i = ptoa(cnt.v_free_count);
+ limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i;
+ limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
+ limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
+ limit0.p_cpulimit = RLIM_INFINITY;
+ limit0.p_refcnt = 1;
+
+
+ /* Allocate a prototype map so we have something to fork. */
+ pmap_pinit0(&vmspace0.vm_pmap);
+ p->p_vmspace = &vmspace0;
+ vmspace0.vm_refcnt = 1;
+ vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS),
+ trunc_page(VM_MAXUSER_ADDRESS));
+ vmspace0.vm_map.pmap = &vmspace0.vm_pmap;
+ p->p_addr = proc0paddr; /* XXX */
+
+#ifndef __alpha__ /* XXX what is this? */
+#define INCOMPAT_LITES2
+#ifdef INCOMPAT_LITES2
+ /*
+ * proc0 needs to have a coherent frame base in its stack.
+ */
+ cpu_set_init_frame(p, init_framep); /* XXX! */
+#endif /* INCOMPAT_LITES2*/
+#endif
+
+ /*
+ * We continue to place resource usage info and signal
+ * actions in the user struct so they're pageable.
+ */
+ p->p_stats = &p->p_addr->u_stats;
+ p->p_sigacts = &p->p_addr->u_sigacts;
+
+ /*
+ * Charge root for one process.
+ */
+ (void)chgproccnt(0, 1);
+
+ /*
+ * Initialize the procfs flags (to 0, of course)
+ */
+ p->p_stops = p->p_stype = p->p_step = 0;
+
+}
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
+
+/* ARGSUSED*/
+static void proc0_post __P((void *dummy));
+static void
+proc0_post(dummy)
+ void *dummy;
+{
+ struct timespec ts;
+
+ /*
+ * Now can look at time, having had a chance to verify the time
+ * from the file system. Reset p->p_runtime as it may have been
+ * munched in mi_switch() after the time got set. Set
+ * p->p_switchtime to be consistent with this unmunching.
+ */
+ microtime(&proc0.p_stats->p_start);
+ proc0.p_runtime = 0;
+ microuptime(&proc0.p_switchtime);
+
+ /*
+ * Give the ``random'' number generator a thump.
+ * XXX: Does read_random() contain enough bits to be used here?
+ */
+ nanotime(&ts);
+ srandom(ts.tv_sec ^ ts.tv_nsec);
+
+ /* Initialize signal state for process 0. */
+ siginit(&proc0);
+}
+SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
+
+
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's and glue code should be moved to the
+ **** respective files on a per subsystem basis.
+ ****
+ ***************************************************************************
+ */
+
+/* ARGSUSED */
+static void root_conf __P((void *dummy));
+static void
+root_conf(dummy)
+ void *dummy;
+{
+ cpu_rootconf();
+}
+SYSINIT(root_conf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, root_conf, NULL)
+
+/* ARGSUSED*/
+static void xxx_vfs_root_fdtab __P((void *dummy));
+static void
+xxx_vfs_root_fdtab(dummy)
+ void *dummy;
+{
+ register struct filedesc0 *fdp = &filedesc0;
+
+ /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */
+ if (VFS_ROOT(mountlist.cqh_first, &rootvnode))
+ panic("cannot find root vnode");
+ fdp->fd_fd.fd_cdir = rootvnode;
+ VREF(fdp->fd_fd.fd_cdir);
+ VOP_UNLOCK(rootvnode, 0, &proc0);
+ fdp->fd_fd.fd_rdir = rootvnode;
+}
+SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL)
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following code probably belongs in another file, like
+ **** kern/init_init.c. It is here for two reasons only:
+ ****
+ ****  1) This code returns to the startup code in order to start
+ ****     the system; this is abnormal for a kernel thread.
+ ****  2) This code promiscuously uses init_frame.
+ ****
+ ***************************************************************************
+ */
+
+static void kthread_init __P((void *dummy));
+SYSINIT_KP(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL)
+
+
+extern void prepare_usermode __P((void));
+static void start_init __P((struct proc *p));
+
+/* ARGSUSED*/
+static void
+kthread_init(dummy)
+ void *dummy;
+{
+ /* Create process 1 (init(8)). */
+ start_init(curproc);
+
+ prepare_usermode();
+
+ /*
+ * This returns to the fork trampoline, then to user mode.
+ */
+ return;
+}
+
+
+/*
+ * List of paths to try when searching for "init".
+ */
+static char *initpaths[] = {
+ "/sbin/init",
+ "/sbin/oinit",
+ "/sbin/init.bak",
+ "/stand/sysinstall",
+ NULL,
+};
+
+/*
+ * Start the initial user process; try exec'ing each pathname in "initpaths".
+ * The program is invoked with one argument containing the boot flags.
+ */
+static void
+start_init(p)
+ struct proc *p;
+{
+ vm_offset_t addr;
+ struct execve_args args;
+ int options, i, error;
+ char **pathp, *path, *ucp, **uap, *arg0, *arg1;
+
+ initproc = p;
+
+ /*
+ * Need just enough stack to hold the faked-up "execve()" arguments.
+ */
+ addr = trunc_page(USRSTACK - PAGE_SIZE);
+	if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
+	    FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
+ panic("init: couldn't allocate argument space");
+ p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
+ p->p_vmspace->vm_ssize = 1;
+
+ for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) {
+ /*
+ * Move out the boot flag argument.
+ */
+ options = 0;
+ ucp = (char *)USRSTACK;
+ (void)subyte(--ucp, 0); /* trailing zero */
+ if (boothowto & RB_SINGLE) {
+ (void)subyte(--ucp, 's');
+ options = 1;
+ }
+#ifdef notyet
+ if (boothowto & RB_FASTBOOT) {
+ (void)subyte(--ucp, 'f');
+ options = 1;
+ }
+#endif
+
+#ifdef BOOTCDROM
+ (void)subyte(--ucp, 'C');
+ options = 1;
+#endif
+ if (options == 0)
+ (void)subyte(--ucp, '-');
+ (void)subyte(--ucp, '-'); /* leading hyphen */
+ arg1 = ucp;
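+		/*
+		 * E.g. a default boot leaves the string "--" here, while
+		 * booting with RB_SINGLE set leaves "-s".
+		 */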
+
+ /*
+ * Move out the file name (also arg 0).
+ */
+		for (i = strlen(path); i >= 0; i--)
+			(void)subyte(--ucp, path[i]);
+ arg0 = ucp;
+
+ /*
+ * Move out the arg pointers.
+ */
+ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
+ (void)suword((caddr_t)--uap, (long)0); /* terminator */
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
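+		/*
+		 * The faked-up stack now looks like this (illustrative;
+		 * addresses grow downward from USRSTACK):
+		 *
+		 *	uap[0] -> arg0 (the pathname, e.g. "/sbin/init")
+		 *	uap[1] -> arg1 (the flag string, e.g. "-s" or "--")
+		 *	uap[2] =  NULL (argv terminator)
+		 *	then the strings themselves, ending at USRSTACK
+		 */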
+
+ /*
+ * Point at the arguments.
+ */
+ args.fname = arg0;
+ args.argv = uap;
+ args.envv = NULL;
+
+ /*
+		 * Now try to exec the program.  If we can't, for any reason
+		 * other than that it doesn't exist, complain.
+		 *
+		 * Otherwise we return to main(), which returns to btext,
+		 * which completes the system startup.
+ */
+ if ((error = execve(p, &args)) == 0)
+ return;
+ if (error != ENOENT)
+ printf("exec %s: error %d\n", path, error);
+ }
+ printf("init: not found\n");
+ panic("no init");
+}
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
new file mode 100644
index 0000000..c31ed46
--- /dev/null
+++ b/sys/kern/init_sysent.c
@@ -0,0 +1,360 @@
+/*
+ * System call switch table.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+#ifdef COMPAT_43
+#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
+#else
+#define compat(n, name) 0, (sy_call_t *)nosys
+#endif
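+
+/*
+ * For example, with COMPAT_43 the entry "{ compat(2,creat) }" below
+ * expands to "{ 2, (sy_call_t *)ocreat }"; without it the slot
+ * degenerates to "{ 0, (sy_call_t *)nosys }".
+ */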
+
+/* The casts are bogus but will do for now. */
+struct sysent sysent[] = {
+ { 0, (sy_call_t *)nosys }, /* 0 = syscall */
+ { 1, (sy_call_t *)exit }, /* 1 = exit */
+ { 0, (sy_call_t *)fork }, /* 2 = fork */
+ { 3, (sy_call_t *)read }, /* 3 = read */
+ { 3, (sy_call_t *)write }, /* 4 = write */
+ { 3, (sy_call_t *)open }, /* 5 = open */
+ { 1, (sy_call_t *)close }, /* 6 = close */
+ { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */
+ { compat(2,creat) }, /* 8 = old creat */
+ { 2, (sy_call_t *)link }, /* 9 = link */
+ { 1, (sy_call_t *)unlink }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */
+ { 1, (sy_call_t *)chdir }, /* 12 = chdir */
+ { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */
+ { 3, (sy_call_t *)mknod }, /* 14 = mknod */
+ { 2, (sy_call_t *)chmod }, /* 15 = chmod */
+ { 3, (sy_call_t *)chown }, /* 16 = chown */
+ { 1, (sy_call_t *)obreak }, /* 17 = break */
+ { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */
+ { compat(3,lseek) }, /* 19 = old lseek */
+ { 0, (sy_call_t *)getpid }, /* 20 = getpid */
+ { 4, (sy_call_t *)mount }, /* 21 = mount */
+ { 2, (sy_call_t *)unmount }, /* 22 = unmount */
+ { 1, (sy_call_t *)setuid }, /* 23 = setuid */
+ { 0, (sy_call_t *)getuid }, /* 24 = getuid */
+ { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */
+ { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */
+ { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */
+ { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */
+ { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */
+ { 3, (sy_call_t *)accept }, /* 30 = accept */
+ { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */
+ { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */
+ { 2, (sy_call_t *)access }, /* 33 = access */
+ { 2, (sy_call_t *)chflags }, /* 34 = chflags */
+ { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sync }, /* 36 = sync */
+ { 2, (sy_call_t *)kill }, /* 37 = kill */
+ { compat(2,stat) }, /* 38 = old stat */
+ { 0, (sy_call_t *)getppid }, /* 39 = getppid */
+ { compat(2,lstat) }, /* 40 = old lstat */
+ { 1, (sy_call_t *)dup }, /* 41 = dup */
+ { 0, (sy_call_t *)pipe }, /* 42 = pipe */
+ { 0, (sy_call_t *)getegid }, /* 43 = getegid */
+ { 4, (sy_call_t *)profil }, /* 44 = profil */
+ { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */
+ { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */
+ { 0, (sy_call_t *)getgid }, /* 47 = getgid */
+ { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */
+ { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */
+ { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */
+ { 1, (sy_call_t *)acct }, /* 51 = acct */
+ { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */
+ { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */
+ { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */
+ { 1, (sy_call_t *)reboot }, /* 55 = reboot */
+ { 1, (sy_call_t *)revoke }, /* 56 = revoke */
+ { 2, (sy_call_t *)symlink }, /* 57 = symlink */
+ { 3, (sy_call_t *)readlink }, /* 58 = readlink */
+ { 3, (sy_call_t *)execve }, /* 59 = execve */
+ { 1, (sy_call_t *)umask }, /* 60 = umask */
+ { 1, (sy_call_t *)chroot }, /* 61 = chroot */
+ { compat(2,fstat) }, /* 62 = old fstat */
+ { compat(4,getkerninfo) }, /* 63 = old getkerninfo */
+ { compat(0,getpagesize) }, /* 64 = old getpagesize */
+ { 3, (sy_call_t *)msync }, /* 65 = msync */
+ { 0, (sy_call_t *)vfork }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */
+ { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */
+ { 1, (sy_call_t *)sstk }, /* 70 = sstk */
+ { compat(6,mmap) }, /* 71 = old mmap */
+ { 1, (sy_call_t *)ovadvise }, /* 72 = vadvise */
+ { 2, (sy_call_t *)munmap }, /* 73 = munmap */
+ { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */
+ { 3, (sy_call_t *)madvise }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */
+ { 3, (sy_call_t *)mincore }, /* 78 = mincore */
+ { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */
+ { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */
+ { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */
+ { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */
+ { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */
+ { compat(0,wait) }, /* 84 = old wait */
+ { 1, (sy_call_t *)swapon }, /* 85 = swapon */
+ { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */
+ { compat(2,gethostname) }, /* 87 = old gethostname */
+ { compat(2,sethostname) }, /* 88 = old sethostname */
+ { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */
+ { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys }, /* 91 = getdopt */
+ { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */
+ { 5, (sy_call_t *)select }, /* 93 = select */
+ { 0, (sy_call_t *)nosys }, /* 94 = setdopt */
+ { 1, (sy_call_t *)fsync }, /* 95 = fsync */
+ { 3, (sy_call_t *)setpriority }, /* 96 = setpriority */
+ { 3, (sy_call_t *)socket }, /* 97 = socket */
+ { 3, (sy_call_t *)connect }, /* 98 = connect */
+ { compat(3,accept) }, /* 99 = old accept */
+ { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */
+ { compat(4,send) }, /* 101 = old send */
+ { compat(4,recv) }, /* 102 = old recv */
+ { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */
+ { 3, (sy_call_t *)bind }, /* 104 = bind */
+ { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */
+ { 2, (sy_call_t *)listen }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */
+ { compat(3,sigvec) }, /* 108 = old sigvec */
+ { compat(1,sigblock) }, /* 109 = old sigblock */
+ { compat(1,sigsetmask) }, /* 110 = old sigsetmask */
+ { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */
+ { compat(2,sigstack) }, /* 112 = old sigstack */
+ { compat(3,recvmsg) }, /* 113 = old recvmsg */
+ { compat(3,sendmsg) }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */
+ { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */
+ { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */
+ { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys }, /* 119 = resuba */
+ { 3, (sy_call_t *)readv }, /* 120 = readv */
+ { 3, (sy_call_t *)writev }, /* 121 = writev */
+ { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */
+ { 3, (sy_call_t *)fchown }, /* 123 = fchown */
+ { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */
+ { compat(6,recvfrom) }, /* 125 = old recvfrom */
+ { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */
+ { 2, (sy_call_t *)setregid }, /* 127 = setregid */
+ { 2, (sy_call_t *)rename }, /* 128 = rename */
+ { compat(2,truncate) }, /* 129 = old truncate */
+ { compat(2,ftruncate) }, /* 130 = old ftruncate */
+ { 2, (sy_call_t *)flock }, /* 131 = flock */
+ { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */
+ { 6, (sy_call_t *)sendto }, /* 133 = sendto */
+ { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */
+ { 4, (sy_call_t *)socketpair }, /* 135 = socketpair */
+ { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */
+ { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */
+ { 2, (sy_call_t *)utimes }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */
+ { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */
+ { compat(3,getpeername) }, /* 141 = old getpeername */
+ { compat(0,gethostid) }, /* 142 = old gethostid */
+ { compat(1,sethostid) }, /* 143 = old sethostid */
+ { compat(2,getrlimit) }, /* 144 = old getrlimit */
+ { compat(2,setrlimit) }, /* 145 = old setrlimit */
+ { compat(2,killpg) }, /* 146 = old killpg */
+ { 0, (sy_call_t *)setsid }, /* 147 = setsid */
+ { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */
+ { compat(0,quota) }, /* 149 = old quota */
+ { compat(3,getsockname) }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */
+ { 0, (sy_call_t *)nosys }, /* 154 = nosys */
+ { 2, (sy_call_t *)nosys }, /* 155 = nfssvc */
+ { compat(4,getdirentries) }, /* 156 = old getdirentries */
+ { 2, (sy_call_t *)statfs }, /* 157 = statfs */
+ { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */
+ { 0, (sy_call_t *)nosys }, /* 159 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 160 = nosys */
+ { 2, (sy_call_t *)nosys }, /* 161 = getfh */
+ { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */
+ { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */
+ { 1, (sy_call_t *)uname }, /* 164 = uname */
+ { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */
+ { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 168 = nosys */
+ { 5, (sy_call_t *)semsys }, /* 169 = semsys */
+ { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */
+ { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys }, /* 172 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 173 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 174 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 175 = nosys */
+ { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 180 = nosys */
+ { 1, (sy_call_t *)setgid }, /* 181 = setgid */
+ { 1, (sy_call_t *)setegid }, /* 182 = setegid */
+ { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */
+ { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */
+ { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */
+ { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */
+ { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */
+ { 2, (sy_call_t *)stat }, /* 188 = stat */
+ { 2, (sy_call_t *)fstat }, /* 189 = fstat */
+ { 2, (sy_call_t *)lstat }, /* 190 = lstat */
+ { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */
+ { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys }, /* 193 = nosys */
+ { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */
+ { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */
+ { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */
+ { 8, (sy_call_t *)mmap }, /* 197 = mmap */
+ { 0, (sy_call_t *)nosys }, /* 198 = __syscall */
+ { 5, (sy_call_t *)lseek }, /* 199 = lseek */
+ { 4, (sy_call_t *)truncate }, /* 200 = truncate */
+ { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */
+ { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */
+ { 2, (sy_call_t *)mlock }, /* 203 = mlock */
+ { 2, (sy_call_t *)munlock }, /* 204 = munlock */
+ { 1, (sy_call_t *)undelete }, /* 205 = undelete */
+ { 2, (sy_call_t *)futimes }, /* 206 = futimes */
+ { 1, (sy_call_t *)getpgid }, /* 207 = getpgid */
+ { 0, (sy_call_t *)nosys }, /* 208 = newreboot */
+ { 3, (sy_call_t *)poll }, /* 209 = poll */
+ { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */
+ { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */
+ { 3, (sy_call_t *)semget }, /* 221 = semget */
+ { 3, (sy_call_t *)semop }, /* 222 = semop */
+ { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */
+ { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */
+ { 2, (sy_call_t *)msgget }, /* 225 = msgget */
+ { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */
+ { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */
+ { 3, (sy_call_t *)shmat }, /* 228 = shmat */
+ { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */
+ { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */
+ { 3, (sy_call_t *)shmget }, /* 231 = shmget */
+ { 2, (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */
+ { 2, (sy_call_t *)clock_settime }, /* 233 = clock_settime */
+ { 2, (sy_call_t *)clock_getres }, /* 234 = clock_getres */
+ { 0, (sy_call_t *)nosys }, /* 235 = timer_create */
+ { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */
+ { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */
+ { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */
+ { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */
+ { 2, (sy_call_t *)nanosleep }, /* 240 = nanosleep */
+ { 0, (sy_call_t *)nosys }, /* 241 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 242 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 243 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 246 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 247 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 248 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 249 = nosys */
+ { 3, (sy_call_t *)minherit }, /* 250 = minherit */
+ { 1, (sy_call_t *)rfork }, /* 251 = rfork */
+ { 3, (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */
+ { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */
+ { 3, (sy_call_t *)lchown }, /* 254 = lchown */
+ { 0, (sy_call_t *)nosys }, /* 255 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 256 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 257 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 258 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 259 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 260 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 261 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 262 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 263 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 264 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 265 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 266 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 267 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 268 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 269 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 270 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 271 = nosys */
+ { 3, (sy_call_t *)getdents }, /* 272 = getdents */
+ { 0, (sy_call_t *)nosys }, /* 273 = nosys */
+ { 2, (sy_call_t *)lchmod }, /* 274 = lchmod */
+ { 3, (sy_call_t *)lchown }, /* 275 = netbsd_lchown */
+ { 2, (sy_call_t *)lutimes }, /* 276 = lutimes */
+ { 3, (sy_call_t *)msync }, /* 277 = netbsd_msync */
+ { 2, (sy_call_t *)nstat }, /* 278 = nstat */
+ { 2, (sy_call_t *)nfstat }, /* 279 = nfstat */
+ { 2, (sy_call_t *)nlstat }, /* 280 = nlstat */
+ { 0, (sy_call_t *)nosys }, /* 281 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 282 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 283 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 284 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 285 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 286 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 287 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 288 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 289 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 290 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 291 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 292 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 293 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 294 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 295 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 296 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 297 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 298 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 299 = nosys */
+ { 1, (sy_call_t *)modnext }, /* 300 = modnext */
+ { 2, (sy_call_t *)modstat }, /* 301 = modstat */
+ { 1, (sy_call_t *)modfnext }, /* 302 = modfnext */
+ { 1, (sy_call_t *)modfind }, /* 303 = modfind */
+ { 1, (sy_call_t *)kldload }, /* 304 = kldload */
+ { 1, (sy_call_t *)kldunload }, /* 305 = kldunload */
+ { 1, (sy_call_t *)kldfind }, /* 306 = kldfind */
+ { 1, (sy_call_t *)kldnext }, /* 307 = kldnext */
+ { 2, (sy_call_t *)kldstat }, /* 308 = kldstat */
+ { 1, (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */
+ { 1, (sy_call_t *)getsid }, /* 310 = getsid */
+ { 0, (sy_call_t *)nosys }, /* 311 = setresuid */
+ { 0, (sy_call_t *)nosys }, /* 312 = setresgid */
+ { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */
+ { 1, (sy_call_t *)aio_return }, /* 314 = aio_return */
+ { 3, (sy_call_t *)aio_suspend }, /* 315 = aio_suspend */
+ { 2, (sy_call_t *)aio_cancel }, /* 316 = aio_cancel */
+ { 1, (sy_call_t *)aio_error }, /* 317 = aio_error */
+ { 1, (sy_call_t *)aio_read }, /* 318 = aio_read */
+ { 1, (sy_call_t *)aio_write }, /* 319 = aio_write */
+ { 4, (sy_call_t *)lio_listio }, /* 320 = lio_listio */
+ { 0, (sy_call_t *)yield }, /* 321 = yield */
+ { 1, (sy_call_t *)thr_sleep }, /* 322 = thr_sleep */
+ { 1, (sy_call_t *)thr_wakeup }, /* 323 = thr_wakeup */
+ { 1, (sy_call_t *)mlockall }, /* 324 = mlockall */
+ { 0, (sy_call_t *)munlockall }, /* 325 = munlockall */
+ { 2, (sy_call_t *)__getcwd }, /* 326 = __getcwd */
+ { 2, (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */
+ { 2, (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */
+ { 3, (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */
+ { 1, (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */
+ { 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */
+ { 1, (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */
+ { 1, (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */
+ { 2, (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */
+ { 2, (sy_call_t *)utrace }, /* 335 = utrace */
+ { 8, (sy_call_t *)sendfile }, /* 336 = sendfile */
+ { 3, (sy_call_t *)kldsym }, /* 337 = kldsym */
+};
diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c
new file mode 100644
index 0000000..11db4e9
--- /dev/null
+++ b/sys/kern/kern_acct.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 1994 Christopher G. Demetriou
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
+ * $Id: kern_acct.c,v 1.18 1997/11/06 19:29:07 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/syslog.h>
+#include <sys/kernel.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/namei.h>
+#include <sys/acct.h>
+#include <sys/resourcevar.h>
+#include <sys/tty.h>
+
+/*
+ * The routines implemented in this file are described in:
+ * Leffler, et al.: The Design and Implementation of the 4.3BSD
+ *	    UNIX Operating System (Addison-Wesley, 1989)
+ * on pages 62-63.
+ *
+ * Arguably, to simplify accounting operations, this mechanism should
+ * be replaced by one in which an accounting log file (similar to /dev/klog)
+ * is read by a user process, etc. However, that has its own problems.
+ */
+
+/*
+ * Internal accounting functions.
+ * The former's (encode_comp_t's) operation is described in Leffler,
+ * et al., and the latter (acctwatch) was provided by UCB with the
+ * 4.4BSD-Lite release.
+ */
+static comp_t encode_comp_t __P((u_long, u_long));
+static void acctwatch __P((void *));
+
+/*
+ * Accounting callout handle used for periodic scheduling of
+ * acctwatch.
+ */
+static struct callout_handle acctwatch_handle
+ = CALLOUT_HANDLE_INITIALIZER(&acctwatch_handle);
+
+/*
+ * Accounting vnode pointer, and saved vnode pointer.
+ */
+static struct vnode *acctp;
+static struct vnode *savacctp;
+
+/*
+ * Values associated with enabling and disabling accounting
+ */
+static int acctsuspend = 2; /* stop accounting when < 2% free space left */
+SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
+ &acctsuspend, 0, "");
+
+static int acctresume = 4; /* resume when free space risen to > 4% */
+SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
+ &acctresume, 0, "");
+
+static int acctchkfreq = 15; /* frequency (in seconds) to check space */
+SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW,
+ &acctchkfreq, 0, "");
+
+/*
+ * Accounting system call. Written based on the specification and
+ * previous implementation done by Mark Tinguely.
+ */
+int
+acct(a1, uap)
+ struct proc *a1;
+ struct acct_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ struct proc *p = curproc; /* XXX */
+ struct nameidata nd;
+ int error;
+
+ /* Make sure that the caller is root. */
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+
+ /*
+ * If accounting is to be started to a file, open that file for
+	 * writing and make sure it's a 'normal' (VREG) file.
+ */
+ if (SCARG(uap, path) != NULL) {
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path),
+ p);
+ error = vn_open(&nd, FWRITE, 0);
+ if (error)
+ return (error);
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ if (nd.ni_vp->v_type != VREG) {
+ vn_close(nd.ni_vp, FWRITE, p->p_ucred, p);
+ return (EACCES);
+ }
+ }
+
+ /*
+ * If accounting was previously enabled, kill the old space-watcher,
+	 * close the file, and, if no new file was specified, leave.
+ */
+ if (acctp != NULLVP || savacctp != NULLVP) {
+ untimeout(acctwatch, NULL, acctwatch_handle);
+ error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE,
+ p->p_ucred, p);
+ acctp = savacctp = NULLVP;
+ }
+ if (SCARG(uap, path) == NULL)
+ return (error);
+
+ /*
+ * Save the new accounting file vnode, and schedule the new
+ * free space watcher.
+ */
+ acctp = nd.ni_vp;
+ acctwatch(NULL);
+ return (error);
+}
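+
+/*
+ * Sketch of the userland view (privileged process; illustrative only):
+ *
+ *	acct("/var/account/acct");	enables accounting to that file
+ *	acct(NULL);			disables accounting again
+ */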
+
+/*
+ * Write out process accounting information on process exit.
+ * The data to be written out are specified in Leffler, et al.,
+ * and are enumerated below.  (They're also noted in the system
+ * "acct.h" header file.)
+ */
+
+int
+acct_process(p)
+ struct proc *p;
+{
+ struct acct acct;
+ struct rusage *r;
+ struct timeval ut, st, tmp;
+ int t;
+ struct vnode *vp;
+
+ /* If accounting isn't enabled, don't bother */
+ vp = acctp;
+ if (vp == NULLVP)
+ return (0);
+
+ /*
+ * Get process accounting information.
+ */
+
+ /* (1) The name of the command that ran */
+ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
+
+ /* (2) The amount of user and system time that was used */
+ calcru(p, &ut, &st, NULL);
+ acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
+ acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
+
+	/* (3) The elapsed time the command ran (and its starting time) */
+ acct.ac_btime = p->p_stats->p_start.tv_sec;
+ microtime(&tmp);
+ timevalsub(&tmp, &p->p_stats->p_start);
+ acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
+
+ /* (4) The average amount of memory used */
+ r = &p->p_stats->p_ru;
+ tmp = ut;
+ timevaladd(&tmp, &st);
+ t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+ if (t)
+ acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
+ else
+ acct.ac_mem = 0;
+
+ /* (5) The number of disk I/O operations done */
+ acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
+
+ /* (6) The UID and GID of the process */
+ acct.ac_uid = p->p_cred->p_ruid;
+ acct.ac_gid = p->p_cred->p_rgid;
+
+ /* (7) The terminal from which the process was started */
+ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
+ acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev;
+ else
+ acct.ac_tty = NODEV;
+
+ /* (8) The boolean flags that tell how the process terminated, etc. */
+ acct.ac_flag = p->p_acflag;
+
+ /*
+ * Eliminate any file size rlimit.
+ */
+ if (p->p_limit->p_refcnt > 1 &&
+ (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
+ p->p_limit->p_refcnt--;
+ p->p_limit = limcopy(p->p_limit);
+ }
+ p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+
+ /*
+ * Write the accounting information to the file.
+ */
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct),
+ (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred,
+ (int *)0, p));
+}
+
+/*
+ * Encode_comp_t converts from ticks in seconds and microseconds
+ * to ticks in 1/AHZ seconds. The encoding is described in
+ * Leffler, et al., on page 63.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t
+encode_comp_t(s, us)
+ u_long s, us;
+{
+ int exp, rnd;
+
+ exp = 0;
+ rnd = 0;
+ s *= AHZ;
+ s += us / (1000000 / AHZ); /* Maximize precision. */
+
+ while (s > MAXFRACT) {
+ rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
+ s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /* If we need to round up, do it (and handle overflow correctly). */
+ if (rnd && (++s > MAXFRACT)) {
+ s >>= EXPSIZE;
+ exp++;
+ }
+
+ /* Clean it up and polish it off. */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += s; /* and add on the mantissa. */
+ return (exp);
+}
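+
+/*
+ * A minimal sketch of the inverse transform, useful when reading the
+ * accounting file back.  It assumes only the MANTSIZE/EXPSIZE/MAXFRACT
+ * constants above and is illustrative, hence kept under #if 0.
+ */
+#if 0
+static u_long
+decode_comp_t(comp_t c)
+{
+	u_long t;
+
+	t = c & MAXFRACT;	/* extract the 13 bit mantissa */
+	c >>= MANTSIZE;		/* leaving the base 8 exponent */
+	while (c-- > 0)
+		t <<= EXPSIZE;	/* undo the encoding shifts */
+	return (t);		/* ticks in 1/AHZ seconds */
+}
+#endif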
+
+/*
+ * Periodically check the file system to see if accounting
+ * should be turned on or off. Beware the case where the vnode
+ * has been vgone()'d out from underneath us, e.g. when the file
+ * system containing the accounting file has been forcibly unmounted.
+ */
+/* ARGSUSED */
+static void
+acctwatch(a)
+ void *a;
+{
+ struct statfs sb;
+
+ if (savacctp != NULLVP) {
+ if (savacctp->v_type == VBAD) {
+ (void) vn_close(savacctp, FWRITE, NOCRED, NULL);
+ savacctp = NULLVP;
+ return;
+ }
+ (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0);
+ if (sb.f_bavail > acctresume * sb.f_blocks / 100) {
+ acctp = savacctp;
+ savacctp = NULLVP;
+ log(LOG_NOTICE, "Accounting resumed\n");
+ }
+ } else {
+ if (acctp == NULLVP)
+ return;
+ if (acctp->v_type == VBAD) {
+ (void) vn_close(acctp, FWRITE, NOCRED, NULL);
+ acctp = NULLVP;
+ return;
+ }
+ (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0);
+ if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) {
+ savacctp = acctp;
+ acctp = NULLVP;
+ log(LOG_NOTICE, "Accounting suspended\n");
+ }
+ }
+ acctwatch_handle = timeout(acctwatch, NULL, acctchkfreq * hz);
+}
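+
+/*
+ * For example (illustrative numbers): on a file system with
+ * sb.f_blocks = 100000, accounting is suspended once sb.f_bavail drops
+ * to 2000 blocks or less (acctsuspend = 2%) and resumed once it rises
+ * above 4000 blocks (acctresume = 4%).
+ */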
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
new file mode 100644
index 0000000..2ea378f
--- /dev/null
+++ b/sys/kern/kern_clock.c
@@ -0,0 +1,870 @@
+/*-
+ * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dkstat.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/timex.h>
+#include <vm/vm.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/limits.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+#if defined(SMP) && defined(BETTER_CLOCK)
+#include <machine/smp.h>
+#endif
+
+/* This is where the NTIMECOUNTER option hangs out */
+#include "opt_ntp.h"
+
+/*
+ * Number of timecounters used to implement stable storage
+ */
+#ifndef NTIMECOUNTER
+#define NTIMECOUNTER 5
+#endif
+
+static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter",
+ "Timecounter stable storage");
+
+static void initclocks __P((void *dummy));
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+static void tco_forward __P((int force));
+static void tco_setscales __P((struct timecounter *tc));
+static __inline unsigned tco_delta __P((struct timecounter *tc));
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+#if defined(SMP) && defined(BETTER_CLOCK)
+long cp_time[CPUSTATES];
+#else
+static long cp_time[CPUSTATES];
+#endif
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
+time_t time_second;
+
+/*
+ * Which update policy to use.
+ * 0 - every tick, bad hardware may fail with "calcru negative..."
+ * 1 - more resistant to the above hardware, but less efficient.
+ */
+static int tco_method;
+
+/*
+ * Implement a dummy timecounter which we can use until we get a real one
+ * in the air. This allows the console and other early stuff to use
+ * time services.
+ */
+
+static unsigned
+dummy_get_timecount(struct timecounter *tc)
+{
+ static unsigned now;
+ return (++now);
+}
+
+static struct timecounter dummy_timecounter = {
+ dummy_get_timecount,
+ 0,
+ ~0u,
+ 1000000,
+ "dummy"
+};
+
+struct timecounter *timecounter = &dummy_timecounter;
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other.
+ *
+ * The main timer, running hz times per second, is used to trigger interval
+ * timers, timeouts and rescheduling as needed.
+ *
+ * The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ *
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
+ */
+
+int stathz;
+int profhz;
+static int profprocs;
+int ticks;
+static int psdiv, pscnt; /* prof => stat divider */
+int psratio; /* ratio: prof / stat */
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ psdiv = pscnt = 1;
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+}
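+
+/*
+ * For example, with typical (illustrative) i386 values of hz = 100,
+ * stathz = 128 and profhz = 1024, this yields psratio = 8: while
+ * profiling, statistics are only gathered on every 8th statclock tick.
+ */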
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(frame)
+ register struct clockframe *frame;
+{
+ register struct proc *p;
+
+ p = curproc;
+ if (p) {
+ register struct pstats *pstats;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ if (CLKF_USERMODE(frame) &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ psignal(p, SIGVTALRM);
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ psignal(p, SIGPROF);
+ }
+
+#if defined(SMP) && defined(BETTER_CLOCK)
+ forward_hardclock(pscnt);
+#endif
+
+ /*
+ * If no separate statistics clock is available, run it from here.
+ */
+ if (stathz == 0)
+ statclock(frame);
+
+ tco_forward(0);
+ ticks++;
+
+ /*
+ * Process callouts at a very low cpu priority, so we don't keep the
+ * relatively high clock interrupt priority any longer than necessary.
+ */
+ if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
+ if (CLKF_BASEPRI(frame)) {
+ /*
+ * Save the overhead of a software interrupt;
+ * it will happen as soon as we return, so do it now.
+ */
+ (void)splsoftclock();
+ softclock();
+ } else
+ setsoftclock();
+ } else if (softticks + 1 == ticks)
+ ++softticks;
+}
+
+/*
+ * Compute number of ticks in the specified amount of time.
+ */
+int
+tvtohz(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
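+	/*
+	 * Worked example (assuming hz = 100, i.e. tick = 10000 usec):
+	 * tv = { 1 sec, 5000 usec } takes the first branch, giving
+	 * (1 * 1000000 + 5000 + 9999) / 10000 + 1 = 102 ticks.
+	 */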
+ sec = tv->tv_sec;
+ usec = tv->tv_usec;
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ if (usec > 0) {
+ sec++;
+ usec -= 1000000;
+ }
+		printf("tvtohz: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return ((int)ticks);
+}
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ if (++profprocs == 1 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = psratio;
+ setstatclockrate(profhz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if (p->p_flag & P_PROFIL) {
+ p->p_flag &= ~P_PROFIL;
+ if (--profprocs == 0 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = 1;
+ setstatclockrate(stathz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Statistics clock. Grab profile sample, and if divider reaches 0,
+ * do process and kernel statistics.
+ */
+void
+statclock(frame)
+ register struct clockframe *frame;
+{
+#ifdef GPROF
+ register struct gmonparam *g;
+ int i;
+#endif
+ register struct proc *p;
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+
+ if (curproc != NULL && CLKF_USERMODE(frame)) {
+ p = curproc;
+ if (p->p_flag & P_PROFIL)
+ addupc_intr(p, CLKF_PC(frame), 1);
+#if defined(SMP) && defined(BETTER_CLOCK)
+ if (stathz != 0)
+ forward_statclock(pscnt);
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled record the tick.
+ */
+ p->p_uticks++;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ } else {
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = CLKF_PC(frame) - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+#if defined(SMP) && defined(BETTER_CLOCK)
+ if (stathz != 0)
+ forward_statclock(pscnt);
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ p = curproc;
+ if (CLKF_INTR(frame)) {
+ if (p != NULL)
+ p->p_iticks++;
+ cp_time[CP_INTR]++;
+ } else if (p != NULL) {
+ p->p_sticks++;
+ cp_time[CP_SYS]++;
+ } else
+ cp_time[CP_IDLE]++;
+ }
+ pscnt = psdiv;
+
+ /*
+ * We maintain statistics shown by user-level statistics
+ * programs: the amount of time in each cpu state.
+ */
+
+ /*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (p_estcpu) is increased here. The formula for computing
+ * priorities (in kern_synch.c) will compute a different value each
+ * time p_estcpu increases by 4. The cpu usage estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy.  The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+ if (p != NULL) {
+ p->p_cpticks++;
+ if (++p->p_estcpu == 0)
+ p->p_estcpu--;
+ if ((p->p_estcpu & 3) == 0) {
+ resetpriority(p);
+ if (p->p_priority >= PUSER)
+ p->p_priority = p->p_usrpri;
+ }
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+ }
+}
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.tickadj = tickadj;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo","");
+
+static __inline unsigned
+tco_delta(struct timecounter *tc)
+{
+
+ return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) &
+ tc->tc_counter_mask);
+}
+
+/*
+ * We have four functions for looking at the clock, two for microseconds
+ * and two for nanoseconds.  For each there is a fast but less precise
+ * version "get{nano|micro}time" which will return a time which is up
+ * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time"
+ * will return a timestamp which is as precise as possible.
+ */
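+/*
+ * E.g. code that only needs roughly tick-accurate timestamps should
+ * prefer the get*() versions, since the raw versions have to read and
+ * scale the hardware counter on every call.
+ */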
+
+void
+getmicrotime(struct timeval *tvp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ *tvp = tc->tc_microtime;
+ } else {
+ microtime(tvp);
+ }
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ *tsp = tc->tc_nanotime;
+ } else {
+ nanotime(tsp);
+ }
+}
+
+void
+microtime(struct timeval *tv)
+{
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ tv->tv_sec = tc->tc_offset_sec;
+ tv->tv_usec = tc->tc_offset_micro;
+ tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
+ tv->tv_usec += boottime.tv_usec;
+ tv->tv_sec += boottime.tv_sec;
+ while (tv->tv_usec >= 1000000) {
+ tv->tv_usec -= 1000000;
+ tv->tv_sec++;
+ }
+}
+
+void
+nanotime(struct timespec *ts)
+{
+ unsigned count;
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count = tco_delta(tc);
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ delta += boottime.tv_usec * 1000;
+ ts->tv_sec += boottime.tv_sec;
+ while (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+void
+timecounter_timespec(unsigned count, struct timespec *ts)
+{
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count -= tc->tc_offset_count;
+ count &= tc->tc_counter_mask;
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ delta += boottime.tv_usec * 1000;
+ ts->tv_sec += boottime.tv_sec;
+ while (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ tvp->tv_sec = tc->tc_offset_sec;
+ tvp->tv_usec = tc->tc_offset_micro;
+ } else {
+ microuptime(tvp);
+ }
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ tsp->tv_sec = tc->tc_offset_sec;
+ tsp->tv_nsec = tc->tc_offset_nano >> 32;
+ } else {
+ nanouptime(tsp);
+ }
+}
+
+void
+microuptime(struct timeval *tv)
+{
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ tv->tv_sec = tc->tc_offset_sec;
+ tv->tv_usec = tc->tc_offset_micro;
+ tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
+ if (tv->tv_usec >= 1000000) {
+ tv->tv_usec -= 1000000;
+ tv->tv_sec++;
+ }
+}
+
+void
+nanouptime(struct timespec *ts)
+{
+ unsigned count;
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count = tco_delta(tc);
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ if (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+static void
+tco_setscales(struct timecounter *tc)
+{
+ u_int64_t scale;
+
+ scale = 1000000000LL << 32;
+ if (tc->tc_adjustment > 0)
+ scale += (tc->tc_adjustment * 1000LL) << 10;
+ else
+ scale -= (-tc->tc_adjustment * 1000LL) << 10;
+ scale /= tc->tc_frequency;
+ tc->tc_scale_micro = scale / 1000;
+ tc->tc_scale_nano_f = scale & 0xffffffff;
+ tc->tc_scale_nano_i = scale >> 32;
+}
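+
+/*
+ * tco_setscales() thus leaves the number of nanoseconds per counter
+ * tick as a 32.32 bit fixed-point number: tc_scale_nano_i carries the
+ * integer part, tc_scale_nano_f the fraction, and tc_scale_micro the
+ * same ratio in microseconds scaled by 2^32.  nanotime() and
+ * microtime() above multiply these by tco_delta() to interpolate
+ * within the current tick.
+ */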
+
+void
+init_timecounter(struct timecounter *tc)
+{
+ struct timespec ts1;
+ struct timecounter *t1, *t2, *t3;
+ int i;
+
+ tc->tc_adjustment = 0;
+ tco_setscales(tc);
+ tc->tc_offset_count = tc->tc_get_timecount(tc);
+ tc->tc_tweak = tc;
+ MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK);
+ *t1 = *tc;
+ t2 = t1;
+ for (i = 1; i < NTIMECOUNTER; i++) {
+ MALLOC(t3, struct timecounter *, sizeof *t3,
+ M_TIMECOUNTER, M_WAITOK);
+ *t3 = *tc;
+ t3->tc_other = t2;
+ t2 = t3;
+ }
+ t1->tc_other = t3;
+ tc = t1;
+
+ printf("Timecounter \"%s\" frequency %lu Hz\n",
+ tc->tc_name, (u_long)tc->tc_frequency);
+
+ /* XXX: For now always start using the counter. */
+ tc->tc_offset_count = tc->tc_get_timecount(tc);
+ nanouptime(&ts1);
+ tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32;
+ tc->tc_offset_micro = ts1.tv_nsec / 1000;
+ tc->tc_offset_sec = ts1.tv_sec;
+ timecounter = tc;
+}
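+
+/*
+ * A minimal sketch of how machine-dependent code plugs in a real
+ * counter.  The "foo" hardware, its read function and its frequency
+ * are hypothetical; the sketch is kept under #if 0 since it is
+ * illustrative only.
+ */
+#if 0
+extern unsigned foo_read_counter(void);	/* free-running 32 bit counter */
+
+static unsigned
+foo_get_timecount(struct timecounter *tc)
+{
+	return (foo_read_counter());
+}
+
+static struct timecounter foo_timecounter = {
+	foo_get_timecount,	/* tc_get_timecount */
+	0,			/* tc_poll_pps */
+	~0u,			/* tc_counter_mask */
+	1193182,		/* tc_frequency (Hz) */
+	"foo"			/* tc_name */
+};
+
+static void
+foo_clock_attach(void)
+{
+	init_timecounter(&foo_timecounter);
+}
+#endif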
+
+void
+set_timecounter(struct timespec *ts)
+{
+ struct timespec ts2;
+
+ nanouptime(&ts2);
+ boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
+ boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
+ if (boottime.tv_usec < 0) {
+ boottime.tv_usec += 1000000;
+ boottime.tv_sec--;
+ }
+ /* fiddle all the little crinkly bits around the fiords... */
+ tco_forward(1);
+}
+
+
+#if 0 /* Currently unused */
+void
+switch_timecounter(struct timecounter *newtc)
+{
+ int s;
+ struct timecounter *tc;
+ struct timespec ts;
+
+ s = splclock();
+ tc = timecounter;
+ if (newtc == tc || newtc == tc->tc_other) {
+ splx(s);
+ return;
+ }
+ nanouptime(&ts);
+ newtc->tc_offset_sec = ts.tv_sec;
+ newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32;
+ newtc->tc_offset_micro = ts.tv_nsec / 1000;
+ newtc->tc_offset_count = newtc->tc_get_timecount(newtc);
+ timecounter = newtc;
+ splx(s);
+}
+#endif
+
+static struct timecounter *
+sync_other_counter(void)
+{
+ struct timecounter *tc, *tcn, *tco;
+ unsigned delta;
+
+ tco = timecounter;
+ tc = tco->tc_other;
+ tcn = tc->tc_other;
+ *tc = *tco;
+ tc->tc_other = tcn;
+ delta = tco_delta(tc);
+ tc->tc_offset_count += delta;
+ tc->tc_offset_count &= tc->tc_counter_mask;
+ tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f;
+ tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32;
+ return (tc);
+}
+
+static void
+tco_forward(int force)
+{
+ struct timecounter *tc, *tco;
+
+ tco = timecounter;
+ tc = sync_other_counter();
+	/*
+	 * We may be inducing a tiny error here: tc_poll_pps() may
+	 * process a latched count which happens after the tco_delta()
+	 * in sync_other_counter(), which would extend the previous
+	 * counter's parameters into the domain of this new one.
+	 * Since the time window for this is very small, the error is
+	 * going to be only a few weenieseconds (as Dave Mills would
+	 * say), so let's just not talk more about it, OK?
+	 */
+ if (tco->tc_poll_pps)
+ tco->tc_poll_pps(tco);
+ if (timedelta != 0) {
+ tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32;
+ timedelta -= tickdelta;
+ force++;
+ }
+
+ while (tc->tc_offset_nano >= 1000000000ULL << 32) {
+ tc->tc_offset_nano -= 1000000000ULL << 32;
+ tc->tc_offset_sec++;
+ tc->tc_frequency = tc->tc_tweak->tc_frequency;
+ tc->tc_adjustment = tc->tc_tweak->tc_adjustment;
+ ntp_update_second(tc); /* XXX only needed if xntpd runs */
+ tco_setscales(tc);
+ force++;
+ }
+
+ if (tco_method && !force)
+ return;
+
+ tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32;
+
+ /* Figure out the wall-clock time */
+ tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec;
+ tc->tc_nanotime.tv_nsec =
+ (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000;
+ tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec;
+ if (tc->tc_nanotime.tv_nsec >= 1000000000) {
+ tc->tc_nanotime.tv_nsec -= 1000000000;
+ tc->tc_microtime.tv_usec -= 1000000;
+ tc->tc_nanotime.tv_sec++;
+ }
+ time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec;
+
+ timecounter = tc;
+}
+
+static int
+sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
+{
+
+ return (sysctl_handle_opaque(oidp,
+ &timecounter->tc_tweak->tc_frequency,
+ sizeof(timecounter->tc_tweak->tc_frequency), req));
+}
+
+static int
+sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
+{
+
+ return (sysctl_handle_opaque(oidp,
+ &timecounter->tc_tweak->tc_adjustment,
+ sizeof(timecounter->tc_tweak->tc_adjustment), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+
+SYSCTL_INT(_kern_timecounter, OID_AUTO, method, CTLFLAG_RW, &tco_method, 0,
+	"This variable determines the method used for updating timecounters. "
+	"If the default algorithm (0) fails with \"calcru negative...\" messages "
+	"try the alternate algorithm (1), which handles bad hardware better.");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW,
+ 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW,
+ 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", "");
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
new file mode 100644
index 0000000..df832f6
--- /dev/null
+++ b/sys/kern/kern_conf.c
@@ -0,0 +1,220 @@
+/*-
+ * Parts Copyright (c) 1995 Terrence R. Lambert
+ * Copyright (c) 1995 Julian R. Elischer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Terrence R. Lambert.
+ * 4. The name Terrence R. Lambert may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_conf.c,v 1.28 1998/10/25 17:44:50 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/module.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+
+#define NUMBDEV 128
+#define NUMCDEV 256
+#define bdevsw_ALLOCSTART (NUMBDEV/2)
+#define cdevsw_ALLOCSTART (NUMCDEV/2)
+
+struct cdevsw *bdevsw[NUMBDEV];
+int nblkdev = NUMBDEV;
+struct cdevsw *cdevsw[NUMCDEV];
+int nchrdev = NUMCDEV;
+
+/*
+ * Routine to convert from character to block device number.
+ *
+ * A minimal stub routine can always return NODEV.
+ */
+dev_t
+chrtoblk(dev_t dev)
+{
+ struct cdevsw *cd;
+
+ if ((cd = cdevsw[major(dev)]) != NULL) {
+ if (cd->d_bmaj != -1)
+ return (makedev(cd->d_bmaj, minor(dev)));
+ }
+ return(NODEV);
+}
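
For callers the contract is simply "NODEV means no block alias"; a minimal
in-kernel fragment (the major number is made up for illustration):

    dev_t cdev, bdev;

    cdev = makedev(19, 0);          /* hypothetical character major */
    bdev = chrtoblk(cdev);
    if (bdev == NODEV)
        printf("major %d has no block alias\n", major(cdev));
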
+
+/*
+ * (re)place an entry in the bdevsw or cdevsw table
+ * return the slot used in major(*descrip)
+ */
+static int
+bdevsw_add(dev_t *descrip,
+ struct cdevsw *newentry,
+ struct cdevsw **oldentry)
+{
+ int i;
+
+ if ((int)*descrip == NODEV) { /* auto (0 is valid) */
+ /*
+ * Search the table looking for a slot...
+ */
+ for (i = bdevsw_ALLOCSTART; i < nblkdev; i++)
+ if (bdevsw[i] == NULL)
+ break; /* found one! */
+ /* out of allocable slots? */
+ if (i >= nblkdev) {
+ return ENFILE;
+ }
+ } else { /* assign */
+ i = major(*descrip);
+ if (i < 0 || i >= nblkdev) {
+ return EINVAL;
+ }
+ }
+
+ /* maybe save old */
+ if (oldentry) {
+ *oldentry = bdevsw[i];
+ }
+ if (newentry) {
+ newentry->d_bmaj = i;
+ }
+ /* replace with new */
+ bdevsw[i] = newentry;
+
+ /* done! let them know where we put it */
+ *descrip = makedev(i,0);
+ return 0;
+}
+
+int
+cdevsw_add(dev_t *descrip,
+ struct cdevsw *newentry,
+ struct cdevsw **oldentry)
+{
+ int i;
+
+ if ((int)*descrip == NODEV) { /* auto (0 is valid) */
+ /*
+ * Search the table looking for a slot...
+ */
+ for (i = cdevsw_ALLOCSTART; i < nchrdev; i++)
+ if (cdevsw[i] == NULL)
+ break; /* found one! */
+ /* out of allocable slots? */
+ if (i >= nchrdev) {
+ return ENFILE;
+ }
+ } else { /* assign */
+ i = major(*descrip);
+ if (i < 0 || i >= nchrdev) {
+ return EINVAL;
+ }
+ }
+
+ /* maybe save old */
+ if (oldentry) {
+ *oldentry = cdevsw[i];
+ }
+ if (newentry) {
+ newentry->d_bmaj = -1;
+ newentry->d_maj = i;
+ }
+ /* replace with new */
+ cdevsw[i] = newentry;
+
+ /* done! let them know where we put it */
+ *descrip = makedev(i,0);
+ return 0;
+}
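
At attach time a driver would typically register through this entry point; a
minimal sketch, where mydev_cdevsw is hypothetical and NODEV requests the
automatic slot search implemented above:

    static struct cdevsw mydev_cdevsw;  /* hypothetical, initialized elsewhere */
    dev_t dev = NODEV;                  /* NODEV => search for a free slot */
    int error;

    error = cdevsw_add(&dev, &mydev_cdevsw, NULL);
    if (error == 0)
        printf("mydev at character major %d\n", major(dev));
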
+
+/*
+ * Note: cdevsw_add() must be called before bdevsw_add() due to the d_bmaj hack.
+ */
+void
+cdevsw_add_generic(int bdev, int cdev, struct cdevsw *cdevsw)
+{
+ dev_t dev;
+
+ dev = makedev(cdev, 0);
+ cdevsw_add(&dev, cdevsw, NULL);
+ dev = makedev(bdev, 0);
+ bdevsw_add(&dev, cdevsw, NULL);
+}
+
+int
+cdevsw_module_handler(module_t mod, int what, void *arg)
+{
+ struct cdevsw_module_data* data = (struct cdevsw_module_data*) arg;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ if ((error = cdevsw_add(&data->dev, data->cdevsw, NULL)) != 0)
+ return error;
+ break;
+
+ case MOD_UNLOAD:
+ if ((error = cdevsw_add(&data->dev, NULL, NULL)) != 0)
+ return error;
+ break;
+ }
+
+ if (data->chainevh)
+ return data->chainevh(mod, what, data->chainarg);
+ else
+ return 0;
+}
+
+int
+bdevsw_module_handler(module_t mod, int what, void* arg)
+{
+ struct bdevsw_module_data* data = (struct bdevsw_module_data*) arg;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ if ((error = cdevsw_add(&data->cdev, data->cdevsw, NULL)) != 0)
+ return error;
+ if ((error = bdevsw_add(&data->bdev, data->cdevsw, NULL)) != 0) {
+ /* back out the cdevsw entry registered above */
+ cdevsw_add(&data->cdev, NULL, NULL);
+ return error;
+ }
+ break;
+
+ case MOD_UNLOAD:
+ if ((error = bdevsw_add(&data->bdev, NULL, NULL)) != 0)
+ return error;
+ if ((error = cdevsw_add(&data->cdev, NULL, NULL)) != 0)
+ return error;
+ break;
+ }
+
+ if (data->chainevh)
+ return data->chainevh(mod, what, data->chainarg);
+ else
+ return 0;
+}
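
The handlers above touch only the dev/cdevsw and chainevh/chainarg members of
the *_module_data blobs, so a hand-rolled hookup needs no more than this
sketch (all names hypothetical; the structs may contain more fields than the
ones referenced here):

    static struct cdevsw_module_data mydev_mod_data;

    mydev_mod_data.chainevh = NULL;         /* no chained event handler */
    mydev_mod_data.chainarg = NULL;
    mydev_mod_data.dev = NODEV;             /* ask for automatic major assignment */
    mydev_mod_data.cdevsw = &mydev_cdevsw;  /* hypothetical switch table */
    /* cdevsw_module_handler can then serve as the module event handler. */
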
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
new file mode 100644
index 0000000..1d18a86
--- /dev/null
+++ b/sys/kern/kern_descrip.c
@@ -0,0 +1,1313 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ * $Id: kern_descrip.c,v 1.57 1998/11/11 10:55:56 truckman Exp $
+ */
+
+#include "opt_compat.h"
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/conf.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/resourcevar.h>
+#include <sys/pipe.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
+MALLOC_DEFINE(M_FILE, "file", "Open file structure");
+static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+
+
+static d_open_t fdopen;
+#define NUMFDESC 64
+
+#define CDEV_MAJOR 22
+static struct cdevsw fildesc_cdevsw =
+ { fdopen, noclose, noread, nowrite,
+ noioc, nostop, nullreset, nodevtotty,
+ seltrue, nommap, nostrat };
+
+static int finishdup __P((struct filedesc *fdp, int old, int new, register_t *retval));
+/*
+ * Descriptor management.
+ */
+struct filelist filehead; /* head of list of open files */
+int nfiles; /* actual number of open files */
+extern int cmask;
+
+/*
+ * System calls on descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+getdtablesize(p, uap)
+ struct proc *p;
+ struct getdtablesize_args *uap;
+{
+
+ p->p_retval[0] =
+ min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
+/* ARGSUSED */
+int
+dup2(p, uap)
+ struct proc *p;
+ struct dup2_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register u_int old = uap->from, new = uap->to;
+ int i, error;
+
+ if (old >= fdp->fd_nfiles ||
+ fdp->fd_ofiles[old] == NULL ||
+ new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
+ new >= maxfilesperproc)
+ return (EBADF);
+ if (old == new) {
+ p->p_retval[0] = new;
+ return (0);
+ }
+ if (new >= fdp->fd_nfiles) {
+ if ((error = fdalloc(p, new, &i)))
+ return (error);
+ if (new != i)
+ panic("dup2: fdalloc");
+ } else if (fdp->fd_ofiles[new]) {
+ if (fdp->fd_ofileflags[new] & UF_MAPPED)
+ (void) munmapfd(p, new);
+ /*
+ * dup2() must succeed even if the close has an error.
+ */
+ (void) closef(fdp->fd_ofiles[new], p);
+ }
+ return (finishdup(fdp, (int)old, (int)new, p->p_retval));
+}
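
For reference, the userland contract implemented here is the familiar POSIX
one, e.g. redirecting stdout into a (hypothetical) log file:

    int fd = open("/tmp/log", O_WRONLY | O_CREAT | O_TRUNC, 0644);

    if (fd >= 0 && fd != STDOUT_FILENO) {
        (void)dup2(fd, STDOUT_FILENO);  /* implicitly closes the old stdout */
        (void)close(fd);
    }
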
+
+/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/* ARGSUSED */
+int
+dup(p, uap)
+ struct proc *p;
+ struct dup_args *uap;
+{
+ register struct filedesc *fdp;
+ u_int old;
+ int new, error;
+
+ old = uap->fd;
+
+#if 0
+ /*
+ * XXX Compatibility
+ */
+ if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, p->p_retval)); }
+#endif
+
+ fdp = p->p_fd;
+ if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
+ return (EBADF);
+ if ((error = fdalloc(p, 0, &new)))
+ return (error);
+ return (finishdup(fdp, (int)old, new, p->p_retval));
+}
+
+/*
+ * The file control system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ long arg;
+};
+#endif
+/* ARGSUSED */
+int
+fcntl(p, uap)
+ struct proc *p;
+ register struct fcntl_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ register char *pop;
+ struct vnode *vp;
+ int i, tmp, error, flg = F_POSIX;
+ struct flock fl;
+ u_int newmin;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ pop = &fdp->fd_ofileflags[uap->fd];
+ switch (uap->cmd) {
+
+ case F_DUPFD:
+ newmin = uap->arg;
+ if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
+ newmin >= maxfilesperproc)
+ return (EINVAL);
+ if ((error = fdalloc(p, newmin, &i)))
+ return (error);
+ return (finishdup(fdp, uap->fd, i, p->p_retval));
+
+ case F_GETFD:
+ p->p_retval[0] = *pop & 1;
+ return (0);
+
+ case F_SETFD:
+ *pop = (*pop &~ 1) | (uap->arg & 1);
+ return (0);
+
+ case F_GETFL:
+ p->p_retval[0] = OFLAGS(fp->f_flag);
+ return (0);
+
+ case F_SETFL:
+ fp->f_flag &= ~FCNTLFLAGS;
+ fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS;
+ tmp = fp->f_flag & FNONBLOCK;
+ error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
+ if (error)
+ return (error);
+ tmp = fp->f_flag & FASYNC;
+ error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
+ if (!error)
+ return (0);
+ fp->f_flag &= ~FNONBLOCK;
+ tmp = 0;
+ (void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
+ return (error);
+
+ case F_GETOWN:
+ error = (*fp->f_ops->fo_ioctl)
+ (fp, FIOGETOWN, (caddr_t)p->p_retval, p);
+ return (error);
+
+ case F_SETOWN:
+ return ((*fp->f_ops->fo_ioctl)
+ (fp, FIOSETOWN, (caddr_t)&uap->arg, p));
+
+ case F_SETLKW:
+ flg |= F_WAIT;
+ /* Fall into F_SETLK */
+
+ case F_SETLK:
+ if (fp->f_type != DTYPE_VNODE)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+ /* Copy in the lock structure */
+ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
+ sizeof(fl));
+ if (error)
+ return (error);
+ if (fl.l_whence == SEEK_CUR)
+ fl.l_start += fp->f_offset;
+ switch (fl.l_type) {
+
+ case F_RDLCK:
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ p->p_flag |= P_ADVLOCK;
+ return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg));
+
+ case F_WRLCK:
+ if ((fp->f_flag & FWRITE) == 0)
+ return (EBADF);
+ p->p_flag |= P_ADVLOCK;
+ return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg));
+
+ case F_UNLCK:
+ return (VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl,
+ F_POSIX));
+
+ default:
+ return (EINVAL);
+ }
+
+ case F_GETLK:
+ if (fp->f_type != DTYPE_VNODE)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+ /* Copy in the lock structure */
+ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
+ sizeof(fl));
+ if (error)
+ return (error);
+ if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK &&
+ fl.l_type != F_UNLCK)
+ return (EINVAL);
+ if (fl.l_whence == SEEK_CUR)
+ fl.l_start += fp->f_offset;
+ if ((error = VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX)))
+ return (error);
+ return (copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg,
+ sizeof(fl)));
+
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
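
A userland view of the F_GETFL/F_SETFL pair above, the standard idiom for
flipping a descriptor into non-blocking mode (error handling elided):

    int flags;

    flags = fcntl(fd, F_GETFL, 0);
    if (flags != -1)
        (void)fcntl(fd, F_SETFL, flags | O_NONBLOCK);
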
+
+/*
+ * Common code for dup, dup2, and fcntl(F_DUPFD).
+ */
+static int
+finishdup(fdp, old, new, retval)
+ register struct filedesc *fdp;
+ register int old, new;
+ register_t *retval;
+{
+ register struct file *fp;
+
+ fp = fdp->fd_ofiles[old];
+ fdp->fd_ofiles[new] = fp;
+ fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+ fp->f_count++;
+ if (new > fdp->fd_lastfile)
+ fdp->fd_lastfile = new;
+ *retval = new;
+ return (0);
+}
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ */
+void
+funsetown(sigio)
+ struct sigio *sigio;
+{
+ int s;
+
+ if (sigio == NULL)
+ return;
+ s = splhigh();
+ *(sigio->sio_myref) = NULL;
+ splx(s);
+ if (sigio->sio_pgid < 0) {
+ SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ } else /* if (sigio->sio_pgid > 0) */ {
+ SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ }
+ crfree(sigio->sio_ucred);
+ FREE(sigio, M_SIGIO);
+}
+
+/* Free a list of sigio structures. */
+void
+funsetownlst(sigiolst)
+ struct sigiolst *sigiolst;
+{
+ struct sigio *sigio;
+
+ while ((sigio = sigiolst->slh_first) != NULL)
+ funsetown(sigio);
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ */
+int
+fsetown(pgid, sigiop)
+ pid_t pgid;
+ struct sigio **sigiop;
+{
+ struct proc *proc;
+ struct pgrp *pgrp;
+ struct sigio *sigio;
+ int s;
+
+ if (pgid == 0) {
+ funsetown(*sigiop);
+ return (0);
+ }
+ if (pgid > 0) {
+ proc = pfind(pgid);
+ if (proc == NULL)
+ return (ESRCH);
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ else if (proc->p_session != curproc->p_session)
+ return (EPERM);
+ pgrp = NULL;
+ } else /* if (pgid < 0) */ {
+ pgrp = pgfind(-pgid);
+ if (pgrp == NULL)
+ return (ESRCH);
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ else if (pgrp->pg_session != curproc->p_session)
+ return (EPERM);
+ proc = NULL;
+ }
+ funsetown(*sigiop);
+ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO,
+ M_WAITOK);
+ if (pgid > 0) {
+ SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_proc = proc;
+ } else {
+ SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_pgrp = pgrp;
+ }
+ sigio->sio_pgid = pgid;
+ crhold(curproc->p_ucred);
+ sigio->sio_ucred = curproc->p_ucred;
+ /* It would be convenient if p_ruid was in ucred. */
+ sigio->sio_ruid = curproc->p_cred->p_ruid;
+ sigio->sio_myref = sigiop;
+ s = splhigh();
+ *sigiop = sigio;
+ splx(s);
+ return (0);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(sigio)
+ struct sigio *sigio;
+{
+ return (sigio != NULL ? sigio->sio_pgid : 0);
+}
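
These two back the F_SETOWN/F_GETOWN fcntl commands and the matching
FIOSETOWN/FIOGETOWN ioctls; a typical userland setup for SIGIO delivery looks
roughly like this (signal handler installation elided):

    int flags;

    (void)fcntl(fd, F_SETOWN, getpid());    /* deliver SIGIO to this process */
    flags = fcntl(fd, F_GETFL, 0);
    (void)fcntl(fd, F_SETFL, flags | O_ASYNC);
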
+
+/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+close(p, uap)
+ struct proc *p;
+ struct close_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ register int fd = uap->fd;
+ register u_char *pf;
+
+ if ((unsigned)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ return (EBADF);
+ pf = (u_char *)&fdp->fd_ofileflags[fd];
+ if (*pf & UF_MAPPED)
+ (void) munmapfd(p, fd);
+ fdp->fd_ofiles[fd] = NULL;
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+ if (fd < fdp->fd_freefile)
+ fdp->fd_freefile = fd;
+ *pf = 0;
+ return (closef(fp, p));
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+ofstat(p, uap)
+ struct proc *p;
+ register struct ofstat_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct stat ub;
+ struct ostat oub;
+ int error;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ switch (fp->f_type) {
+
+ case DTYPE_FIFO:
+ case DTYPE_VNODE:
+ error = vn_stat((struct vnode *)fp->f_data, &ub, p);
+ break;
+
+ case DTYPE_SOCKET:
+ error = soo_stat((struct socket *)fp->f_data, &ub);
+ break;
+
+ case DTYPE_PIPE:
+ error = pipe_stat((struct pipe *)fp->f_data, &ub);
+ break;
+
+ default:
+ panic("ofstat");
+ /*NOTREACHED*/
+ }
+ cvtstat(&ub, &oub);
+ if (error == 0)
+ error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub));
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+fstat(p, uap)
+ struct proc *p;
+ register struct fstat_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct stat ub;
+ int error;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ switch (fp->f_type) {
+
+ case DTYPE_FIFO:
+ case DTYPE_VNODE:
+ error = vn_stat((struct vnode *)fp->f_data, &ub, p);
+ break;
+
+ case DTYPE_SOCKET:
+ error = soo_stat((struct socket *)fp->f_data, &ub);
+ break;
+
+ case DTYPE_PIPE:
+ error = pipe_stat((struct pipe *)fp->f_data, &ub);
+ break;
+
+ default:
+ panic("fstat");
+ /*NOTREACHED*/
+ }
+ if (error == 0)
+ error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub));
+ return (error);
+}
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nfstat_args {
+ int fd;
+ struct nstat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+nfstat(p, uap)
+ struct proc *p;
+ register struct nfstat_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct stat ub;
+ struct nstat nub;
+ int error;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ switch (fp->f_type) {
+
+ case DTYPE_FIFO:
+ case DTYPE_VNODE:
+ error = vn_stat((struct vnode *)fp->f_data, &ub, p);
+ break;
+
+ case DTYPE_SOCKET:
+ error = soo_stat((struct socket *)fp->f_data, &ub);
+ break;
+
+ case DTYPE_PIPE:
+ error = pipe_stat((struct pipe *)fp->f_data, &ub);
+ break;
+
+ default:
+ panic("fstat");
+ /*NOTREACHED*/
+ }
+ if (error == 0) {
+ cvtnstat(&ub, &nub);
+ error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub));
+ }
+ return (error);
+}
+
+/*
+ * Return pathconf information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+fpathconf(p, uap)
+ struct proc *p;
+ register struct fpathconf_args *uap;
+{
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ switch (fp->f_type) {
+
+ case DTYPE_PIPE:
+ case DTYPE_SOCKET:
+ if (uap->name != _PC_PIPE_BUF)
+ return (EINVAL);
+ p->p_retval[0] = PIPE_BUF;
+ return (0);
+
+ case DTYPE_FIFO:
+ case DTYPE_VNODE:
+ vp = (struct vnode *)fp->f_data;
+ return (VOP_PATHCONF(vp, uap->name, p->p_retval));
+
+ default:
+ panic("fpathconf");
+ }
+ /*NOTREACHED*/
+}
+
+/*
+ * Allocate a file descriptor for the process.
+ */
+static int fdexpand;
+SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
+
+int
+fdalloc(p, want, result)
+ struct proc *p;
+ int want;
+ int *result;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register int i;
+ int lim, last, nfiles;
+ struct file **newofile;
+ char *newofileflags;
+
+ /*
+ * Search for a free descriptor starting at the higher
+ * of want or fd_freefile. If that fails, consider
+ * expanding the ofile array.
+ */
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ for (;;) {
+ last = min(fdp->fd_nfiles, lim);
+ if ((i = want) < fdp->fd_freefile)
+ i = fdp->fd_freefile;
+ for (; i < last; i++) {
+ if (fdp->fd_ofiles[i] == NULL) {
+ fdp->fd_ofileflags[i] = 0;
+ if (i > fdp->fd_lastfile)
+ fdp->fd_lastfile = i;
+ if (want <= fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ *result = i;
+ return (0);
+ }
+ }
+
+ /*
+ * No space in current array. Expand?
+ */
+ if (fdp->fd_nfiles >= lim)
+ return (EMFILE);
+ if (fdp->fd_nfiles < NDEXTENT)
+ nfiles = NDEXTENT;
+ else
+ nfiles = 2 * fdp->fd_nfiles;
+ MALLOC(newofile, struct file **, nfiles * OFILESIZE,
+ M_FILEDESC, M_WAITOK);
+ newofileflags = (char *) &newofile[nfiles];
+ /*
+ * Copy the existing ofile and ofileflags arrays
+ * and zero the new portion of each array.
+ */
+ bcopy(fdp->fd_ofiles, newofile,
+ (i = sizeof(struct file *) * fdp->fd_nfiles));
+ bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
+ bcopy(fdp->fd_ofileflags, newofileflags,
+ (i = sizeof(char) * fdp->fd_nfiles));
+ bzero(newofileflags + i, nfiles * sizeof(char) - i);
+ if (fdp->fd_nfiles > NDFILE)
+ FREE(fdp->fd_ofiles, M_FILEDESC);
+ fdp->fd_ofiles = newofile;
+ fdp->fd_ofileflags = newofileflags;
+ fdp->fd_nfiles = nfiles;
+ fdexpand++;
+ }
+ return (0);
+}
+
+/*
+ * Check to see whether n user file descriptors
+ * are available to the process p.
+ */
+int
+fdavail(p, n)
+ struct proc *p;
+ register int n;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file **fpp;
+ register int i, lim, last;
+
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
+ if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
+ return (1);
+
+ last = min(fdp->fd_nfiles, lim);
+ fpp = &fdp->fd_ofiles[fdp->fd_freefile];
+ for (i = last - fdp->fd_freefile; --i >= 0; fpp++)
+ if (*fpp == NULL && --n <= 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * Create a new open file structure and allocate
+ * a file descriptor for the process that refers to it.
+ */
+int
+falloc(p, resultfp, resultfd)
+ register struct proc *p;
+ struct file **resultfp;
+ int *resultfd;
+{
+ register struct file *fp, *fq;
+ int error, i;
+
+ if ((error = fdalloc(p, 0, &i)))
+ return (error);
+ if (nfiles >= maxfiles) {
+ tablefull("file");
+ return (ENFILE);
+ }
+ /*
+ * Allocate a new file descriptor.
+ * If the process has file descriptor zero open, add to the list
+ * of open files at that point, otherwise put it at the front of
+ * the list of open files.
+ */
+ nfiles++;
+ MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK);
+ bzero(fp, sizeof(struct file));
+ if ((fq = p->p_fd->fd_ofiles[0])) {
+ LIST_INSERT_AFTER(fq, fp, f_list);
+ } else {
+ LIST_INSERT_HEAD(&filehead, fp, f_list);
+ }
+ p->p_fd->fd_ofiles[i] = fp;
+ fp->f_count = 1;
+ fp->f_cred = p->p_ucred;
+ fp->f_seqcount = 1;
+ crhold(fp->f_cred);
+ if (resultfp)
+ *resultfp = fp;
+ if (resultfd)
+ *resultfd = i;
+ return (0);
+}
+
+/*
+ * Free a file descriptor.
+ */
+void
+ffree(fp)
+ register struct file *fp;
+{
+ LIST_REMOVE(fp, f_list);
+ crfree(fp->f_cred);
+#if defined(DIAGNOSTIC) || defined(INVARIANTS)
+ fp->f_count = 0;
+#endif
+ nfiles--;
+ FREE(fp, M_FILE);
+}
+
+/*
+ * Build a new filedesc structure.
+ */
+struct filedesc *
+fdinit(p)
+ struct proc *p;
+{
+ register struct filedesc0 *newfdp;
+ register struct filedesc *fdp = p->p_fd;
+
+ MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
+ M_FILEDESC, M_WAITOK);
+ bzero(newfdp, sizeof(struct filedesc0));
+ newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+ VREF(newfdp->fd_fd.fd_cdir);
+ newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+ VREF(newfdp->fd_fd.fd_rdir);
+
+ /* Create the file descriptor table. */
+ newfdp->fd_fd.fd_refcnt = 1;
+ newfdp->fd_fd.fd_cmask = cmask;
+ newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+ newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
+ newfdp->fd_fd.fd_nfiles = NDFILE;
+
+ newfdp->fd_fd.fd_freefile = 0;
+ newfdp->fd_fd.fd_lastfile = 0;
+
+ return (&newfdp->fd_fd);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(p)
+ struct proc *p;
+{
+ p->p_fd->fd_refcnt++;
+ return (p->p_fd);
+}
+
+/*
+ * Copy a filedesc structure.
+ */
+struct filedesc *
+fdcopy(p)
+ struct proc *p;
+{
+ register struct filedesc *newfdp, *fdp = p->p_fd;
+ register struct file **fpp;
+ register int i;
+
+/*
+ * Certain daemons might not have file descriptors
+ */
+ if (fdp == NULL)
+ return NULL;
+
+ MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
+ M_FILEDESC, M_WAITOK);
+ bcopy(fdp, newfdp, sizeof(struct filedesc));
+ VREF(newfdp->fd_cdir);
+ VREF(newfdp->fd_rdir);
+ newfdp->fd_refcnt = 1;
+
+ /*
+ * If the number of open files fits in the internal arrays
+ * of the open file structure, use them, otherwise allocate
+ * additional memory for the number of descriptors currently
+ * in use.
+ */
+ if (newfdp->fd_lastfile < NDFILE) {
+ newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
+ newfdp->fd_ofileflags =
+ ((struct filedesc0 *) newfdp)->fd_dfileflags;
+ i = NDFILE;
+ } else {
+ /*
+ * Compute the smallest multiple of NDEXTENT needed
+ * for the file descriptors currently in use,
+ * allowing the table to shrink.
+ */
+ i = newfdp->fd_nfiles;
+ while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
+ i /= 2;
+ MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
+ M_FILEDESC, M_WAITOK);
+ newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
+ }
+ newfdp->fd_nfiles = i;
+ bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
+ bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
+ fpp = newfdp->fd_ofiles;
+ for (i = newfdp->fd_lastfile; i-- >= 0; fpp++)
+ if (*fpp != NULL)
+ (*fpp)->f_count++;
+ return (newfdp);
+}
+
+/*
+ * Release a filedesc structure.
+ */
+void
+fdfree(p)
+ struct proc *p;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct file **fpp;
+ register int i;
+
+/*
+ * Certain daemons might not have file descriptors
+ */
+ if (fdp == NULL)
+ return;
+
+ if (--fdp->fd_refcnt > 0)
+ return;
+ fpp = fdp->fd_ofiles;
+ for (i = fdp->fd_lastfile; i-- >= 0; fpp++)
+ if (*fpp)
+ (void) closef(*fpp, p);
+ if (fdp->fd_nfiles > NDFILE)
+ FREE(fdp->fd_ofiles, M_FILEDESC);
+ vrele(fdp->fd_cdir);
+ vrele(fdp->fd_rdir);
+ FREE(fdp, M_FILEDESC);
+}
+
+/*
+ * Close any files on exec?
+ */
+void
+fdcloseexec(p)
+ struct proc *p;
+{
+ struct filedesc *fdp = p->p_fd;
+ struct file **fpp;
+ char *fdfp;
+ register int i;
+
+/*
+ * Certain daemons might not have file descriptors
+ */
+ if (fdp == NULL)
+ return;
+
+ fpp = fdp->fd_ofiles;
+ fdfp = fdp->fd_ofileflags;
+ for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++)
+ if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) {
+ if (*fdfp & UF_MAPPED)
+ (void) munmapfd(p, i);
+ (void) closef(*fpp, p);
+ *fpp = NULL;
+ *fdfp = 0;
+ if (i < fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ }
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+}
+
+/*
+ * Internal form of close.
+ * Decrement reference count on file structure.
+ * Note: p may be NULL when closing a file
+ * that was being passed in a message.
+ */
+int
+closef(fp, p)
+ register struct file *fp;
+ register struct proc *p;
+{
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+
+ if (fp == NULL)
+ return (0);
+ /*
+ * POSIX record locking dictates that any close releases ALL
+ * locks owned by this process. This is handled by setting
+ * a flag in the unlock to free ONLY locks obeying POSIX
+ * semantics, and not to free BSD-style file locks.
+ * If the descriptor was in a message, POSIX-style locks
+ * aren't passed with the descriptor.
+ */
+ if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = (struct vnode *)fp->f_data;
+ (void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX);
+ }
+ if (--fp->f_count > 0)
+ return (0);
+ if (fp->f_count < 0)
+ panic("closef: count < 0");
+ if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = (struct vnode *)fp->f_data;
+ (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ }
+ if (fp->f_ops)
+ error = (*fp->f_ops->fo_close)(fp, p);
+ else
+ error = 0;
+ ffree(fp);
+ return (error);
+}
+
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on
+ * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
+/* ARGSUSED */
+int
+flock(p, uap)
+ struct proc *p;
+ register struct flock_args *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct vnode *vp;
+ struct flock lf;
+
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE)
+ return (EOPNOTSUPP);
+ vp = (struct vnode *)fp->f_data;
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (uap->how & LOCK_UN) {
+ lf.l_type = F_UNLCK;
+ fp->f_flag &= ~FHASLOCK;
+ return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK));
+ }
+ if (uap->how & LOCK_EX)
+ lf.l_type = F_WRLCK;
+ else if (uap->how & LOCK_SH)
+ lf.l_type = F_RDLCK;
+ else
+ return (EBADF);
+ fp->f_flag |= FHASLOCK;
+ if (uap->how & LOCK_NB)
+ return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK));
+ return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT));
+}
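
The matching userland idiom, a non-blocking exclusive lock on the whole file
(errno and warnx(3) assumed from the usual headers):

    if (flock(fd, LOCK_EX | LOCK_NB) == -1 && errno == EWOULDBLOCK)
        warnx("lock is held by someone else");
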
+
+/*
+ * File Descriptor pseudo-device driver (/dev/fd/).
+ *
+ * Opening minor device N dup()s the file (if any) connected to file
+ * descriptor N belonging to the calling process. Note that this driver
+ * consists of only the ``open()'' routine, because all subsequent
+ * references to this file will be direct to the other driver.
+ */
+/* ARGSUSED */
+static int
+fdopen(dev, mode, type, p)
+ dev_t dev;
+ int mode, type;
+ struct proc *p;
+{
+
+ /*
+ * XXX Kludge: set curproc->p_dupfd to contain the value of
+ * the file descriptor being sought for duplication. The error
+ * return ensures that the vnode for this device will be released
+ * by vn_open. Open will detect this special error and take the
+ * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
+ * will simply report the error.
+ */
+ p->p_dupfd = minor(dev);
+ return (ENODEV);
+}
+
+/*
+ * Duplicate the specified descriptor to a free descriptor.
+ */
+int
+dupfdopen(fdp, indx, dfd, mode, error)
+ register struct filedesc *fdp;
+ register int indx, dfd;
+ int mode;
+ int error;
+{
+ register struct file *wfp;
+ struct file *fp;
+
+ /*
+ * If the to-be-dup'd fd number is greater than the allowed number
+ * of file descriptors, or the fd to be dup'd has already been
+ * closed, reject. Note, check for new == old is necessary as
+ * falloc could allocate an already closed to-be-dup'd descriptor
+ * as the new descriptor.
+ */
+ fp = fdp->fd_ofiles[indx];
+ if ((u_int)dfd >= fdp->fd_nfiles ||
+ (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp)
+ return (EBADF);
+
+ /*
+ * There are two cases of interest here.
+ *
+ * For ENODEV simply dup (dfd) to file descriptor
+ * (indx) and return.
+ *
+ * For ENXIO steal away the file structure from (dfd) and
+ * store it in (indx). (dfd) is effectively closed by
+ * this operation.
+ *
+ * Any other error code is just returned.
+ */
+ switch (error) {
+ case ENODEV:
+ /*
+ * Check that the mode the file is being opened for is a
+ * subset of the mode of the existing descriptor.
+ */
+ if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
+ return (EACCES);
+ fdp->fd_ofiles[indx] = wfp;
+ fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+ wfp->f_count++;
+ if (indx > fdp->fd_lastfile)
+ fdp->fd_lastfile = indx;
+ return (0);
+
+ case ENXIO:
+ /*
+ * Steal away the file pointer from dfd, and stuff it into indx.
+ */
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ fdp->fd_ofiles[dfd] = NULL;
+ fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+ fdp->fd_ofileflags[dfd] = 0;
+ /*
+ * Complete the clean up of the filedesc structure by
+ * recomputing the various hints.
+ */
+ if (indx > fdp->fd_lastfile)
+ fdp->fd_lastfile = indx;
+ else
+ while (fdp->fd_lastfile > 0 &&
+ fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+ if (dfd < fdp->fd_freefile)
+ fdp->fd_freefile = dfd;
+ return (0);
+
+ default:
+ return (error);
+ }
+ /* NOTREACHED */
+}
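
The net effect of fdopen() returning ENODEV plus the ENODEV case above is that
opening /dev/fd/N behaves like dup(N), subject to the access-mode subset
check; for example:

    int fd = open("/dev/fd/0", O_RDONLY);   /* effectively dup(0) */
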
+
+/*
+ * Get file structures.
+ */
+static int
+sysctl_kern_file SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct file *fp;
+
+ if (!req->oldptr) {
+ /*
+ * overestimate by 10 files
+ */
+ return (SYSCTL_OUT(req, 0, sizeof(filehead) +
+ (nfiles + 10) * sizeof(struct file)));
+ }
+
+ error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead));
+ if (error)
+ return (error);
+
+ /*
+ * followed by an array of file structures
+ */
+ for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) {
+ error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file));
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_file, "S,file", "");
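
A userland consumer reads this as the filehead list head followed by an array
of struct file records; a hedged reading sketch using sysctlbyname(3):

    size_t len;
    char *buf;

    if (sysctlbyname("kern.file", NULL, &len, NULL, 0) == 0 &&
        (buf = malloc(len)) != NULL &&
        sysctlbyname("kern.file", buf, &len, NULL, 0) == 0) {
        /* buf begins with filehead, then the struct file entries */
    }
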
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc,
+ CTLFLAG_RW, &maxfilesperproc, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "");
+
+static int fildesc_devsw_installed = 0;
+#ifdef DEVFS
+static void *devfs_token_stdin;
+static void *devfs_token_stdout;
+static void *devfs_token_stderr;
+static void *devfs_token_fildesc[NUMFDESC];
+#endif
+
+static void fildesc_drvinit(void *unused)
+{
+ dev_t dev;
+#ifdef DEVFS
+ int fd;
+#endif
+
+ if (!fildesc_devsw_installed) {
+ dev = makedev(CDEV_MAJOR, 0);
+ cdevsw_add(&dev, &fildesc_cdevsw, NULL);
+ fildesc_devsw_installed = 1;
+#ifdef DEVFS
+ for (fd = 0; fd < NUMFDESC; fd++)
+ devfs_token_fildesc[fd] =
+ devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR,
+ UID_BIN, GID_BIN, 0666,
+ "fd/%d", fd);
+ devfs_token_stdin =
+ devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stdin");
+ devfs_token_stdout =
+ devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stdout");
+ devfs_token_stderr =
+ devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stderr");
+#endif
+ }
+}
+
+SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR,
+ fildesc_drvinit, NULL)
+
+
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
new file mode 100644
index 0000000..2243e27
--- /dev/null
+++ b/sys/kern/kern_environment.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_environment.c,v 1.3 1998/10/09 21:21:34 msmith Exp $
+ */
+
+/*
+ * The unified bootloader passes us a pointer to a preserved copy of
+ * bootstrap/kernel environment variables.
+ * We make these available using sysctl for both in-kernel and
+ * out-of-kernel consumers.
+ *
+ * Note that the current sysctl infrastructure doesn't allow
+ * dynamic insertion or traversal through handled spaces. Grr.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <machine/bootinfo.h>
+
+char *kern_envp;
+
+static char *kernenv_next(char *cp);
+
+char *
+getenv(char *name)
+{
+ char *cp, *ep;
+ int len;
+
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
+ ;
+ len = ep - cp;
+ if (*ep == '=')
+ ep++;
+ if (!strncmp(name, cp, len))
+ return(ep);
+ }
+ return(NULL);
+}
+
+/*
+ * Return an integer value from an environment variable.
+ */
+int
+getenv_int(char *name, int *data)
+{
+ char *value, *vtp;
+ quad_t iv;
+
+ if ((value = getenv(name)) == NULL)
+ return(0);
+
+ iv = strtoq(value, &vtp, 0);
+ if ((vtp == value) || (*vtp != 0))
+ return(0);
+
+ *data = (int)iv;
+ return(1);
+}
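
In-kernel consumers use these helpers to pick up loader tunables; a minimal
sketch (the tunable name is hypothetical):

    int tun;

    if (getenv_int("hw.mydev.irq", &tun))
        printf("loader set hw.mydev.irq=%d\n", tun);
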
+
+static int
+sysctl_kernenv SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ char *cp;
+ int i, error;
+
+ if (kern_envp == NULL)
+ return(ENOENT);
+
+ name++;
+ namelen--;
+
+ if (namelen != 1)
+ return(EINVAL);
+
+ cp = kern_envp;
+ for (i = 0; i < name[0]; i++) {
+ cp = kernenv_next(cp);
+ if (cp == NULL)
+ break;
+ }
+
+ if (cp == NULL)
+ return(ENOENT);
+
+ error = SYSCTL_OUT(req, cp, strlen(cp) + 1);
+ return (error);
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, environment, CTLFLAG_RD, sysctl_kernenv, "kernel environment space");
+
+/*
+ * Find the next entry after the one which (cp) falls within, return a
+ * pointer to its start or NULL if there are no more.
+ */
+static char *
+kernenv_next(char *cp)
+{
+ if (cp != NULL) {
+ while (*cp != 0)
+ cp++;
+ cp++;
+ if (*cp == 0)
+ cp = NULL;
+ }
+ return(cp);
+}
+
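
For clarity, getenv() and kernenv_next() assume the loader hands over a packed
block of NUL-terminated name=value strings ending in an empty string, i.e.
laid out as (variable names here are illustrative only):

    "console=comconsole\0boot_verbose=YES\0\0"
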
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
new file mode 100644
index 0000000..dd63672
--- /dev/null
+++ b/sys/kern/kern_exec.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_exec.c,v 1.92 1998/12/30 10:38:59 dfr Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_zone.h>
+#include <vm/vm_pager.h>
+
+#include <machine/reg.h>
+
+static long *exec_copyout_strings __P((struct image_params *));
+
+static long ps_strings = PS_STRINGS;
+SYSCTL_LONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, "");
+
+static long usrstack = USRSTACK;
+SYSCTL_LONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, "");
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+/*
+ * execve() system call.
+ */
+int
+execve(p, uap)
+ struct proc *p;
+ register struct execve_args *uap;
+{
+ struct nameidata nd, *ndp;
+ long *stack_base;
+ int error, len, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+
+ imgp = &image_params;
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->uap = uap;
+ imgp->attr = &attr;
+ imgp->argc = imgp->envc = 0;
+ imgp->argv0 = NULL;
+ imgp->entry_addr = 0;
+ imgp->vmspace_destroyed = 0;
+ imgp->interpreted = 0;
+ imgp->interpreter_name[0] = '\0';
+ imgp->auxargs = NULL;
+ imgp->vp = NULL;
+ imgp->firstpage = NULL;
+
+ /*
+ * Allocate temporary demand zeroed space for argument and
+ * environment strings
+ */
+ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE);
+ if (imgp->stringbase == NULL) {
+ error = ENOMEM;
+ goto exec_fail;
+ }
+ imgp->stringp = imgp->stringbase;
+ imgp->stringspace = ARG_MAX;
+ imgp->image_header = imgp->stringbase + ARG_MAX;
+
+ /*
+ * Translate the file name. namei() returns a vnode pointer
+ * in ni_vp among other things.
+ */
+ ndp = &nd;
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_USERSPACE, uap->fname, p);
+
+interpret:
+
+ error = namei(ndp);
+ if (error) {
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
+ ARG_MAX + PAGE_SIZE);
+ goto exec_fail;
+ }
+
+ imgp->vp = ndp->ni_vp;
+ imgp->fname = uap->fname;
+
+ /*
+ * Check file permissions (also 'opens' file)
+ */
+ error = exec_check_permissions(imgp);
+ if (error) {
+ VOP_UNLOCK(imgp->vp, 0, p);
+ goto exec_fail_dealloc;
+ }
+
+ error = exec_map_first_page(imgp);
+ VOP_UNLOCK(imgp->vp, 0, p);
+ if (error)
+ goto exec_fail_dealloc;
+
+ /*
+ * Loop through list of image activators, calling each one.
+ * If there is no match, the activator returns -1. If there
+ * is a match, but there was an error during the activation,
+ * the error is returned. Otherwise 0 means success. If the
+ * image is interpreted, loop back up and try activating
+ * the interpreter.
+ */
+ for (i = 0; execsw[i]; ++i) {
+ if (execsw[i]->ex_imgact)
+ error = (*execsw[i]->ex_imgact)(imgp);
+ else
+ continue;
+ if (error == -1)
+ continue;
+ if (error)
+ goto exec_fail_dealloc;
+ if (imgp->interpreted) {
+ exec_unmap_first_page(imgp);
+ /* free old vnode and name buffer */
+ vrele(ndp->ni_vp);
+ zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ /* set new name to that of the interpreter */
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_SYSSPACE, imgp->interpreter_name, p);
+ goto interpret;
+ }
+ break;
+ }
+ /* If we made it through all the activators and none matched, exit. */
+ if (error == -1) {
+ error = ENOEXEC;
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Copy out strings (args and env) and initialize stack base
+ */
+ stack_base = exec_copyout_strings(imgp);
+ p->p_vmspace->vm_minsaddr = (char *)stack_base;
+
+ /*
+ * If custom stack fixup routine present for this process
+ * let it do the stack setup.
+ * Else stuff argument count as first item on stack
+ */
+ if (p->p_sysent->sv_fixup)
+ (*p->p_sysent->sv_fixup)(&stack_base, imgp);
+ else
+ suword(--stack_base, imgp->argc);
+
+ /*
+ * For security and other reasons, the file descriptor table cannot
+ * be shared after an exec.
+ */
+ if (p->p_fd->fd_refcnt > 1) {
+ struct filedesc *tmp;
+
+ tmp = fdcopy(p);
+ fdfree(p);
+ p->p_fd = tmp;
+ }
+
+ /* close files on exec */
+ fdcloseexec(p);
+
+ /* reset caught signals */
+ execsigs(p);
+
+ /* name this process - nameiexec(p, ndp) */
+ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
+ bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
+ p->p_comm[len] = 0;
+
+ /*
+ * mark as execed, wakeup the process that vforked (if any) and tell
+ * it that it now has its own resources back
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~P_PPWAIT;
+ wakeup((caddr_t)p->p_pptr);
+ }
+
+ /*
+ * Implement image setuid/setgid.
+ *
+ * Don't honor setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ */
+ if ((attr.va_mode & VSUID && p->p_ucred->cr_uid != attr.va_uid ||
+ attr.va_mode & VSGID && p->p_ucred->cr_gid != attr.va_gid) &&
+ (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+ (p->p_flag & P_TRACED) == 0) {
+ /*
+ * Turn off syscall tracing for set-id programs, except for
+ * root.
+ */
+ if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
+ p->p_traceflag = 0;
+ vrele(p->p_tracep);
+ p->p_tracep = NULL;
+ }
+ /*
+ * Set the new credentials.
+ */
+ p->p_ucred = crcopy(p->p_ucred);
+ if (attr.va_mode & VSUID)
+ p->p_ucred->cr_uid = attr.va_uid;
+ if (attr.va_mode & VSGID)
+ p->p_ucred->cr_gid = attr.va_gid;
+ setsugid(p);
+ } else {
+ if (p->p_ucred->cr_uid == p->p_cred->p_ruid &&
+ p->p_ucred->cr_gid == p->p_cred->p_rgid)
+ p->p_flag &= ~P_SUGID;
+ }
+
+ /*
+ * Implement correct POSIX saved-id behavior.
+ */
+ p->p_cred->p_svuid = p->p_ucred->cr_uid;
+ p->p_cred->p_svgid = p->p_ucred->cr_gid;
+
+ /*
+ * Store the vp for use in procfs
+ */
+ if (p->p_textvp) /* release old reference */
+ vrele(p->p_textvp);
+ VREF(ndp->ni_vp);
+ p->p_textvp = ndp->ni_vp;
+
+ /*
+ * If tracing the process, trap to debugger so breakpoints
+ * can be set before the program executes.
+ */
+ STOPEVENT(p, S_EXEC, 0);
+
+ if (p->p_flag & P_TRACED)
+ psignal(p, SIGTRAP);
+
+ /* clear "fork but no exec" flag, as we _are_ execing */
+ p->p_acflag &= ~AFORK;
+
+ /* Set entry address */
+ setregs(p, imgp->entry_addr, (u_long)(uintptr_t)stack_base);
+
+exec_fail_dealloc:
+
+ /*
+ * free various allocated resources
+ */
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+
+ if (imgp->stringbase != NULL)
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
+ ARG_MAX + PAGE_SIZE);
+
+ if (imgp->vp) {
+ vrele(imgp->vp);
+ zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ }
+
+ if (error == 0)
+ return (0);
+
+exec_fail:
+ if (imgp->vmspace_destroyed) {
+ /* sorry, there is no process left to return to; exit gracefully */
+ exit1(p, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ return(0);
+ } else {
+ return(error);
+ }
+}
+
+int
+exec_map_first_page(imgp)
+ struct image_params *imgp;
+{
+ int s, rv, i;
+ int initial_pagein;
+ vm_page_t ma[VM_INITIAL_PAGEIN];
+ vm_object_t object;
+
+
+ if (imgp->firstpage) {
+ exec_unmap_first_page(imgp);
+ }
+
+ object = imgp->vp->v_object;
+ s = splvm();
+
+ ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+
+ if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
+ initial_pagein = VM_INITIAL_PAGEIN;
+ if (initial_pagein > object->size)
+ initial_pagein = object->size;
+ for (i = 1; i < initial_pagein; i++) {
+ if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
+ if ((ma[i]->flags & PG_BUSY) || ma[i]->busy)
+ break;
+ if (ma[i]->valid)
+ break;
+ vm_page_busy(ma[i]);
+ } else {
+ ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL);
+ if (ma[i] == NULL)
+ break;
+ }
+ }
+ initial_pagein = i;
+
+ rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
+ ma[0] = vm_page_lookup(object, 0);
+
+ if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) {
+ if (ma[0]) {
+ vm_page_protect(ma[0], VM_PROT_NONE);
+ vm_page_free(ma[0]);
+ }
+ splx(s);
+ return EIO;
+ }
+ }
+
+ vm_page_wire(ma[0]);
+ vm_page_wakeup(ma[0]);
+ splx(s);
+
+ pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0]));
+ imgp->firstpage = ma[0];
+
+ return 0;
+}
+
+void
+exec_unmap_first_page(imgp)
+ struct image_params *imgp;
+{
+ if (imgp->firstpage) {
+ pmap_kremove((vm_offset_t) imgp->image_header);
+ vm_page_unwire(imgp->firstpage, 1);
+ imgp->firstpage = NULL;
+ }
+}
+
+/*
+ * Destroy old address space, and allocate a new stack
+ * The new stack is only SGROWSIZ large because it is grown
+ * automatically in trap.c.
+ */
+int
+exec_new_vmspace(imgp)
+ struct image_params *imgp;
+{
+ int error;
+ struct vmspace *vmspace = imgp->proc->p_vmspace;
+#ifdef VM_STACK
+ caddr_t stack_addr = (caddr_t) (USRSTACK - MAXSSIZ);
+#else
+ caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ);
+#endif
+ vm_map_t map = &vmspace->vm_map;
+
+ imgp->vmspace_destroyed = 1;
+
+ /*
+ * Blow away entire process VM, if address space not shared,
+ * otherwise, create a new VM space so that other threads are
+ * not disrupted
+ */
+ if (vmspace->vm_refcnt == 1) {
+ if (vmspace->vm_shm)
+ shmexit(imgp->proc);
+ pmap_remove_pages(&vmspace->vm_pmap, 0, VM_MAXUSER_ADDRESS);
+ vm_map_remove(map, 0, VM_MAXUSER_ADDRESS);
+ } else {
+ vmspace_exec(imgp->proc);
+ vmspace = imgp->proc->p_vmspace;
+ map = &vmspace->vm_map;
+ }
+
+ /* Allocate a new stack */
+#ifdef VM_STACK
+ error = vm_map_stack (&vmspace->vm_map, (vm_offset_t)stack_addr,
+ (vm_size_t)MAXSSIZ, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return (error);
+
+ /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
+ * VM_STACK case, but they are still used to monitor the size of the
+ * process stack so we can check the stack rlimit.
+ */
+ vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT;
+ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ;
+#else
+ error = vm_map_insert(&vmspace->vm_map, NULL, 0,
+ (vm_offset_t) stack_addr, (vm_offset_t) USRSTACK,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return (error);
+
+ vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT;
+
+ /* Initialize maximum stack address */
+ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ;
+#endif
+
+ return(0);
+}
+
+/*
+ * Copy out argument and environment strings from the old process
+ * address space into the temporary string buffer.
+ */
+int
+exec_extract_strings(imgp)
+ struct image_params *imgp;
+{
+ char **argv, **envv;
+ char *argp, *envp;
+ int error;
+ size_t length;
+
+ /*
+ * extract arguments first
+ */
+
+ argv = imgp->uap->argv;
+
+ if (argv) {
+ argp = (caddr_t) (intptr_t) fuword(argv);
+ if (argp == (caddr_t) -1)
+ return (EFAULT);
+ if (argp)
+ argv++;
+ if (imgp->argv0)
+ argp = imgp->argv0;
+ if (argp) {
+ do {
+ if (argp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(argp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->argc++;
+ } while ((argp = (caddr_t) (intptr_t) fuword(argv++)));
+ }
+ }
+
+ /*
+ * extract environment strings
+ */
+
+ envv = imgp->uap->envv;
+
+ if (envv) {
+ while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
+ if (envp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(envp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->envc++;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Copy strings out to the new process address space, constructing
+ * new arg and env vector tables. Return a pointer to the base
+ * so that it can be used as the initial stack pointer.
+ */
+long *
+exec_copyout_strings(imgp)
+ struct image_params *imgp;
+{
+ int argc, envc;
+ char **vectp;
+ char *stringp, *destp;
+ long *stack_base;
+ struct ps_strings *arginfo;
+ int szsigcode;
+
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ arginfo = (struct ps_strings *)PS_STRINGS;
+ szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
+ destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
+ roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
+
+ /*
+ * install sigcode
+ */
+ if (szsigcode)
+ copyout(imgp->proc->p_sysent->sv_sigcode,
+ ((caddr_t)arginfo - szsigcode), szsigcode);
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs)
+ /*
+ * The '+ 2' is for the null pointers at the end of each of the
+ * arg and env vector sets, and 'AT_COUNT*2' is room for the
+ * ELF Auxargs data.
+ */
+ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
+ AT_COUNT*2) * sizeof(char*));
+ else
+ /*
+ * The '+ 2' is for the null pointers at the end of each of the
+ * arg and env vector sets
+ */
+ vectp = (char **)
+ (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*));
+
+ /*
+ * vectp also becomes our initial stack base
+ */
+ stack_base = (long *)vectp;
+
+ stringp = imgp->stringbase;
+ argc = imgp->argc;
+ envc = imgp->envc;
+
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
+ suword(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword(vectp++, 0);
+
+ suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
+ suword(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword(vectp, 0);
+
+ return (stack_base);
+}
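
Putting exec_copyout_strings() together, the freshly built user stack looks
roughly like this, from high addresses down (the ELF auxargs room is present
only when imgp->auxargs is set):

    struct ps_strings                   (at PS_STRINGS / arginfo)
    signal trampoline                   (szsigcode bytes)
    SPARE_USRSPACE pad
    argument + environment strings      (destp upward)
    room for ELF auxargs                (AT_COUNT*2 slots)
    NULL                                (end of envp[])
    envp[] pointer vector
    NULL                                (end of argv[])
    argv[] pointer vector               <- stack_base, the initial stack pointer
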
+
+/*
+ * Check permissions of file to execute.
+ * Return 0 for success or error code on failure.
+ */
+int
+exec_check_permissions(imgp)
+ struct image_params *imgp;
+{
+ struct proc *p = imgp->proc;
+ struct vnode *vp = imgp->vp;
+ struct vattr *attr = imgp->attr;
+ int error;
+
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, attr, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that this
+ * file resides on.
+ * 2) Ensure that at least one execute bit is on - otherwise root
+ * would always succeed, and we don't want that to happen unless
+ * the file really is executable.
+ * 3) Ensure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ ((attr->va_mode & 0111) == 0) ||
+ (attr->va_type != VREG)) {
+ return (EACCES);
+ }
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr->va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ */
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ if (vp->v_writecount)
+ return (ETXTBSY);
+
+ /*
+ * Call filesystem specific open routine (which does nothing in the
+ * general case).
+ */
+ error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Exec handler registration
+ */
+int
+exec_register(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 2; /* New slot and trailing NULL */
+
+ if (execsw)
+ for (es = execsw; *es; es++)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return ENOMEM;
+ xs = newexecsw;
+ if (execsw)
+ for (es = execsw; *es; es++)
+ *xs++ = *es;
+ *xs++ = execsw_arg;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return 0;
+}
+
+int
+exec_unregister(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 1;
+
+ if (execsw == NULL)
+ panic("unregister with no handlers left?\n");
+
+ for (es = execsw; *es; es++) {
+ if (*es == execsw_arg)
+ break;
+ }
+ if (*es == NULL)
+ return ENOENT;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return ENOMEM;
+ xs = newexecsw;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ *xs++ = *es;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return 0;
+}
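Both routines rebuild the NULL-terminated execsw array from scratch on every
change rather than editing it in place. The same grow-copy-swap pattern as a
self-contained userspace sketch (handler type and names invented for the
example):

	#include <stdlib.h>

	typedef int (*handler_t)(void);

	static handler_t *table;	/* NULL-terminated, like execsw */

	int
	table_register(handler_t h)
	{
		handler_t *es, *xs, *newtab;
		int count = 2;		/* new slot and trailing NULL */

		if (table)
			for (es = table; *es; es++)
				count++;
		newtab = malloc(count * sizeof(*newtab));
		if (newtab == NULL)
			return (-1);
		xs = newtab;
		if (table)
			for (es = table; *es; es++)
				*xs++ = *es;
		*xs++ = h;
		*xs = NULL;
		free(table);		/* free(NULL) is a no-op here */
		table = newtab;
		return (0);
	}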
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
new file mode 100644
index 0000000..7be01af
--- /dev/null
+++ b/sys/kern/kern_exit.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
+ * $Id: kern_exit.c,v 1.70 1998/12/19 02:55:33 julian Exp $
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/tty.h>
+#include <sys/wait.h>
+#include <sys/vnode.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/ptrace.h>
+#include <sys/acct.h> /* for acct_process() function prototype */
+#include <sys/filedesc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <sys/aio.h>
+
+#ifdef COMPAT_43
+#include <machine/reg.h>
+#include <machine/psl.h>
+#endif
+#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_zone.h>
+#ifdef COMPAT_LINUX_THREADS
+#include <sys/user.h>
+#endif
+
+static MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status");
+
+static int wait1 __P((struct proc *, struct wait_args *, int));
+
+/*
+ * callout list for things to do at exit time
+ */
+typedef struct exit_list_element {
+ struct exit_list_element *next;
+ exitlist_fn function;
+} *ele_p;
+
+static ele_p exit_list;
+
+/*
+ * exit --
+ * Death of process.
+ */
+void
+exit(p, uap)
+ struct proc *p;
+ struct rexit_args /* {
+ int rval;
+ } */ *uap;
+{
+
+ exit1(p, W_EXITCODE(uap->rval, 0));
+ /* NOTREACHED */
+}
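W_EXITCODE() packs the exit value and the terminating signal into the classic
BSD wait status word consumed by wait1() and by the WTERMSIG()/WEXITSTATUS()
uses below: exit status in the high byte, signal number in the low byte, with
0177 in the low byte marking a stopped process. A userspace check of that
encoding, using only the portable <sys/wait.h> macros (a sketch, not kernel
code):

	#include <assert.h>
	#include <signal.h>
	#include <sys/wait.h>

	int
	main(void)
	{
		/* What W_EXITCODE(42, 0) builds: status 42, no signal. */
		int st = (42 << 8) | 0;
		assert(WIFEXITED(st) && WEXITSTATUS(st) == 42);

		/* What W_STOPCODE(SIGSTOP) builds: 0177 in the low byte. */
		st = (SIGSTOP << 8) | 0177;
		assert(WIFSTOPPED(st) && WSTOPSIG(st) == SIGSTOP);
		return (0);
	}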
+
+/*
+ * Exit: deallocate address space and other resources, change proc state
+ * to zombie, and unlink proc from allproc and parent's lists. Save exit
+ * status and rusage for wait(). Check for child processes and orphan them.
+ */
+void
+exit1(p, rv)
+ register struct proc *p;
+ int rv;
+{
+ register struct proc *q, *nq;
+ register struct vmspace *vm;
+ ele_p ep = exit_list;
+
+ if (p->p_pid == 1) {
+ printf("init died (signal %d, exit %d)\n",
+ WTERMSIG(rv), WEXITSTATUS(rv));
+ panic("Going nowhere without my init!");
+ }
+
+ aio_proc_rundown(p);
+
+ /* are we a task leader? */
+ if(p == p->p_leader) {
+ struct kill_args killArgs;
+ killArgs.signum = SIGKILL;
+ q = p->p_peers;
+ while(q) {
+ killArgs.pid = q->p_pid;
+ /*
+			 * Going through the kill() interface is
+			 * preferable to calling the internal signal
+			 * delivery routines directly.
+ */
+ kill(p, &killArgs);
+ nq = q;
+ q = q->p_peers;
+ /*
+ * orphan the threads so we don't mess up
+ * when they call exit
+ */
+ nq->p_peers = 0;
+ nq->p_leader = nq;
+ }
+
+ /* otherwise are we a peer? */
+ } else if(p->p_peers) {
+ q = p->p_leader;
+ while(q->p_peers != p)
+ q = q->p_peers;
+ q->p_peers = p->p_peers;
+ }
+
+#ifdef PGINPROF
+ vmsizmon();
+#endif
+ STOPEVENT(p, S_EXIT, rv);
+
+ /*
+ * Check if any LKMs need anything done at process exit.
+ * e.g. SYSV IPC stuff
+ * XXX what if one of these generates an error?
+ */
+ while (ep) {
+ (*ep->function)(p);
+ ep = ep->next;
+ }
+
+ if (p->p_flag & P_PROFIL)
+ stopprofclock(p);
+ MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
+ M_ZOMBIE, M_WAITOK);
+ /*
+ * If parent is waiting for us to exit or exec,
+ * P_PPWAIT is set; we will wakeup the parent below.
+ */
+ p->p_flag &= ~(P_TRACED | P_PPWAIT);
+ p->p_flag |= P_WEXIT;
+#ifndef COMPAT_LINUX_THREADS
+ p->p_sigignore = ~0;
+#endif /* COMPAT_LINUX_THREADS */
+ p->p_siglist = 0;
+ if (timevalisset(&p->p_realtimer.it_value))
+ untimeout(realitexpire, (caddr_t)p, p->p_ithandle);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pid.
+ */
+ funsetownlst(&p->p_sigiolst);
+
+ /*
+ * Close open files and release open-file table.
+ * This may block!
+ */
+ fdfree(p);
+
+ /*
+ * XXX Shutdown SYSV semaphores
+ */
+ semexit(p);
+
+ /* The next two chunks should probably be moved to vmspace_exit. */
+ vm = p->p_vmspace;
+ /*
+ * Release user portion of address space.
+ * This releases references to vnodes,
+ * which could cause I/O if the file has been unlinked.
+ * Need to do this early enough that we can still sleep.
+ * Can't free the entire vmspace as the kernel stack
+ * may be mapped within that space also.
+ */
+ if (vm->vm_refcnt == 1) {
+ if (vm->vm_shm)
+ shmexit(p);
+ pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS,
+ VM_MAXUSER_ADDRESS);
+ (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
+ VM_MAXUSER_ADDRESS);
+ }
+
+ if (SESS_LEADER(p)) {
+ register struct session *sp = p->p_session;
+
+ if (sp->s_ttyvp) {
+ /*
+ * Controlling process.
+ * Signal foreground pgrp,
+ * drain controlling terminal
+ * and revoke access to controlling terminal.
+ */
+ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
+ if (sp->s_ttyp->t_pgrp)
+ pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
+ (void) ttywait(sp->s_ttyp);
+ /*
+ * The tty could have been revoked
+ * if we blocked.
+ */
+ if (sp->s_ttyvp)
+ VOP_REVOKE(sp->s_ttyvp, REVOKEALL);
+ }
+ if (sp->s_ttyvp)
+ vrele(sp->s_ttyvp);
+ sp->s_ttyvp = NULL;
+ /*
+ * s_ttyp is not zero'd; we use this to indicate
+ * that the session once had a controlling terminal.
+ * (for logging and informational purposes)
+ */
+ }
+ sp->s_leader = NULL;
+ }
+ fixjobc(p, p->p_pgrp, 0);
+ (void)acct_process(p);
+#ifdef KTRACE
+ /*
+ * release trace file
+ */
+ p->p_traceflag = 0; /* don't trace the vrele() */
+ if (p->p_tracep)
+ vrele(p->p_tracep);
+#endif
+ /*
+ * Remove proc from allproc queue and pidhash chain.
+ * Place onto zombproc. Unlink from parent's child list.
+ */
+ LIST_REMOVE(p, p_list);
+ LIST_INSERT_HEAD(&zombproc, p, p_list);
+ p->p_stat = SZOMB;
+
+ LIST_REMOVE(p, p_hash);
+
+ q = p->p_children.lh_first;
+	if (q)		/* only need this if any child is SZOMB */
+ wakeup((caddr_t) initproc);
+ for (; q != 0; q = nq) {
+ nq = q->p_sibling.le_next;
+ LIST_REMOVE(q, p_sibling);
+ LIST_INSERT_HEAD(&initproc->p_children, q, p_sibling);
+ q->p_pptr = initproc;
+#ifdef COMPAT_LINUX_THREADS
+ q->p_sigparent = 0;
+#endif /* COMPAT_LINUX_THREADS */
+ /*
+ * Traced processes are killed
+ * since their existence means someone is screwing up.
+ */
+ if (q->p_flag & P_TRACED) {
+ q->p_flag &= ~P_TRACED;
+ psignal(q, SIGKILL);
+ }
+ }
+
+ /*
+ * Save exit status and final rusage info, adding in child rusage
+ * info and self times.
+ */
+ p->p_xstat = rv;
+ *p->p_ru = p->p_stats->p_ru;
+ calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
+ ruadd(p->p_ru, &p->p_stats->p_cru);
+
+ /*
+ * Notify parent that we're gone. If parent has the P_NOCLDWAIT
+ * flag set, notify process 1 instead (and hope it will handle
+ * this situation).
+ */
+#ifndef COMPAT_LINUX_THREADS
+ if (p->p_pptr->p_flag & P_NOCLDWAIT) {
+#else
+ if (p->p_pptr->p_procsig->ps_flag & P_NOCLDWAIT) {
+#endif /* COMPAT_LINUX_THREADS */
+ struct proc *pp = p->p_pptr;
+ proc_reparent(p, initproc);
+ /*
+		 * If this was the last child of our parent, notify the
+		 * parent, so that if it is blocked in wait(2) it will
+		 * continue.
+ */
+ if (LIST_EMPTY(&pp->p_children))
+ wakeup((caddr_t)pp);
+ }
+
+#ifndef COMPAT_LINUX_THREADS
+ psignal(p->p_pptr, SIGCHLD);
+#else
+ if (p->p_sigparent && p->p_pptr != initproc) {
+ psignal(p->p_pptr, p->p_sigparent);
+ } else {
+ psignal(p->p_pptr, SIGCHLD);
+ }
+#endif /* COMPAT_LINUX_THREADS */
+ wakeup((caddr_t)p->p_pptr);
+#if defined(tahoe)
+ /* move this to cpu_exit */
+ p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL;
+#endif
+ /*
+ * Clear curproc after we've done all operations
+ * that could block, and before tearing down the rest
+ * of the process state that might be used from clock, etc.
+ * Also, can't clear curproc while we're still runnable,
+ * as we're not on a run queue (we are current, just not
+ * a proper proc any longer!).
+ *
+ * Other substructures are freed from wait().
+ */
+ curproc = NULL;
+ if (--p->p_limit->p_refcnt == 0) {
+ FREE(p->p_limit, M_SUBPROC);
+ p->p_limit = NULL;
+ }
+
+ /*
+ * Finally, call machine-dependent code to release the remaining
+ * resources including address space, the kernel stack and pcb.
+ * The address space is released by "vmspace_free(p->p_vmspace)";
+ * This is machine-dependent, as we may have to change stacks
+ * or ensure that the current one isn't reallocated before we
+ * finish. cpu_exit will end with a call to cpu_switch(), finishing
+ * our execution (pun intended).
+ */
+ cpu_exit(p);
+}
+
+#ifdef COMPAT_43
+#if defined(hp300) || defined(luna68k)
+#include <machine/frame.h>
+#define GETPS(rp) ((struct frame *)(rp))->f_sr
+#else
+#define GETPS(rp) (rp)[PS]
+#endif
+
+int
+owait(p, uap)
+ struct proc *p;
+ register struct owait_args /* {
+ int dummy;
+ } */ *uap;
+{
+ struct wait_args w;
+
+#ifdef PSL_ALLCC
+ if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) {
+ w.options = 0;
+ w.rusage = NULL;
+ } else {
+ w.options = p->p_md.md_regs[R0];
+ w.rusage = (struct rusage *)p->p_md.md_regs[R1];
+ }
+#else
+ w.options = 0;
+ w.rusage = NULL;
+#endif
+ w.pid = WAIT_ANY;
+ w.status = NULL;
+ return (wait1(p, &w, 1));
+}
+#endif /* COMPAT_43 */
+
+int
+wait4(p, uap)
+ struct proc *p;
+ struct wait_args *uap;
+{
+
+ return (wait1(p, uap, 0));
+}
+
+static int
+wait1(q, uap, compat)
+ register struct proc *q;
+ register struct wait_args /* {
+ int pid;
+ int *status;
+ int options;
+ struct rusage *rusage;
+ } */ *uap;
+ int compat;
+{
+ register int nfound;
+ register struct proc *p, *t;
+ int status, error;
+
+ if (uap->pid == 0)
+ uap->pid = -q->p_pgid;
+ if (uap->options &~ (WUNTRACED|WNOHANG))
+ return (EINVAL);
+loop:
+ nfound = 0;
+ for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) {
+ if (uap->pid != WAIT_ANY &&
+ p->p_pid != uap->pid && p->p_pgid != -uap->pid)
+ continue;
+ nfound++;
+ if (p->p_stat == SZOMB) {
+			/* charge child's scheduling cpu usage to parent */
+ if (curproc->p_pid != 1) {
+ curproc->p_estcpu = min(curproc->p_estcpu +
+ p->p_estcpu, UCHAR_MAX);
+ }
+
+ q->p_retval[0] = p->p_pid;
+#ifdef COMPAT_43
+ if (compat)
+ q->p_retval[1] = p->p_xstat;
+ else
+#endif
+ if (uap->status) {
+ status = p->p_xstat; /* convert to int */
+ if ((error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status))))
+ return (error);
+ }
+ if (uap->rusage && (error = copyout((caddr_t)p->p_ru,
+ (caddr_t)uap->rusage, sizeof (struct rusage))))
+ return (error);
+ /*
+ * If we got the child via a ptrace 'attach',
+ * we need to give it back to the old parent.
+ */
+ if (p->p_oppid && (t = pfind(p->p_oppid))) {
+ p->p_oppid = 0;
+ proc_reparent(p, t);
+ psignal(t, SIGCHLD);
+ wakeup((caddr_t)t);
+ return (0);
+ }
+ p->p_xstat = 0;
+ ruadd(&q->p_stats->p_cru, p->p_ru);
+ FREE(p->p_ru, M_ZOMBIE);
+ p->p_ru = NULL;
+
+ /*
+ * Decrement the count of procs running with this uid.
+ */
+ (void)chgproccnt(p->p_cred->p_ruid, -1);
+
+ /*
+ * Release reference to text vnode
+ */
+ if (p->p_textvp)
+ vrele(p->p_textvp);
+
+ /*
+ * Free up credentials.
+ */
+ if (--p->p_cred->p_refcnt == 0) {
+ crfree(p->p_cred->pc_ucred);
+ FREE(p->p_cred, M_SUBPROC);
+ p->p_cred = NULL;
+ }
+
+ /*
+ * Finally finished with old proc entry.
+ * Unlink it from its process group and free it.
+ */
+ leavepgrp(p);
+ LIST_REMOVE(p, p_list); /* off zombproc */
+ LIST_REMOVE(p, p_sibling);
+
+#ifdef COMPAT_LINUX_THREADS
+ if (--p->p_procsig->ps_refcnt == 0) {
+ if (p->p_sigacts != &p->p_addr->u_sigacts)
+ FREE(p->p_sigacts, M_SUBPROC);
+ FREE(p->p_procsig, M_SUBPROC);
+ p->p_procsig = NULL;
+ }
+#endif /* COMPAT_LINUX_THREADS */
+ /*
+ * Give machine-dependent layer a chance
+ * to free anything that cpu_exit couldn't
+ * release while still running in process context.
+ */
+ cpu_wait(p);
+ zfree(proc_zone, p);
+ nprocs--;
+ return (0);
+ }
+ if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 &&
+ (p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
+ p->p_flag |= P_WAITED;
+ q->p_retval[0] = p->p_pid;
+#ifdef COMPAT_43
+ if (compat) {
+ q->p_retval[1] = W_STOPCODE(p->p_xstat);
+ error = 0;
+ } else
+#endif
+ if (uap->status) {
+ status = W_STOPCODE(p->p_xstat);
+ error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status));
+ } else
+ error = 0;
+ return (error);
+ }
+ }
+ if (nfound == 0)
+ return (ECHILD);
+ if (uap->options & WNOHANG) {
+ q->p_retval[0] = 0;
+ return (0);
+ }
+ if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0)))
+ return (error);
+ goto loop;
+}
+
+/*
+ * Make process 'parent' the new parent of process 'child'.
+ */
+void
+proc_reparent(child, parent)
+ register struct proc *child;
+ register struct proc *parent;
+{
+
+ if (child->p_pptr == parent)
+ return;
+
+ LIST_REMOVE(child, p_sibling);
+ LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
+ child->p_pptr = parent;
+}
+
+/*
+ * The next two functions are to handle adding/deleting items on the
+ * exit callout list
+ *
+ * at_exit():
+ *	Take the arguments given and put them onto the exit callout list.
+ *	However, first make sure that it's not already there.
+ *	Returns 0 on success.
+ */
+int
+at_exit(function)
+ exitlist_fn function;
+{
+ ele_p ep;
+
+ /* Be noisy if the programmer has lost track of things */
+ if (rm_at_exit(function))
+ printf("exit callout entry already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->next = exit_list;
+ ep->function = function;
+ exit_list = ep;
+ return (0);
+}
+
+/*
+ * Scan the exit callout list for the given items and remove them.
+ * Returns the number of items removed.
+ * Logically this can only be 0 or 1.
+ */
+int
+rm_at_exit(function)
+ exitlist_fn function;
+{
+ ele_p *epp, ep;
+ int count;
+
+ count = 0;
+ epp = &exit_list;
+ ep = *epp;
+ while (ep) {
+ if (ep->function == function) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ return (count);
+}
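rm_at_exit() walks the list with a pointer to the link field itself, so
unlinking needs no special case for the head element. The idiom in
isolation (a minimal sketch with invented names):

	#include <stdlib.h>

	struct node {
		struct node *next;
		int key;
	};

	/*
	 * Remove every node matching 'key'. 'npp' always points at the
	 * link that references the current node (initially the list head
	 * pointer), so "*npp = n->next" splices the node out whether it
	 * is the head or an interior element.
	 */
	int
	remove_key(struct node **npp, int key)
	{
		struct node *n;
		int count = 0;

		while ((n = *npp) != NULL) {
			if (n->key == key) {
				*npp = n->next;
				free(n);
				count++;
			} else
				npp = &n->next;
		}
		return (count);
	}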
+
+#ifdef COMPAT_LINUX_THREADS
+void check_sigacts (void)
+{
+ struct proc *p = curproc;
+ struct sigacts *pss;
+ int s;
+
+ if (p->p_procsig->ps_refcnt == 1 &&
+ p->p_sigacts != &p->p_addr->u_sigacts) {
+ pss = p->p_sigacts;
+ s = splhigh();
+ p->p_addr->u_sigacts = *pss;
+ p->p_sigacts = &p->p_addr->u_sigacts;
+ splx(s);
+ FREE(pss, M_SUBPROC);
+ }
+}
+#endif
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
new file mode 100644
index 0000000..732712b
--- /dev/null
+++ b/sys/kern/kern_fork.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
+ * $Id: kern_fork.c,v 1.53 1998/12/19 02:55:33 julian Exp $
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/ktrace.h>
+#include <sys/unistd.h>
+
+#include <vm/vm.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
+
+#ifdef COMPAT_LINUX_THREADS
+#include <machine/frame.h>
+#include <sys/user.h>
+#endif /* COMPAT_LINUX_THREADS */
+#ifdef SMP
+static int fast_vfork = 0; /* Doesn't work on SMP yet. */
+#else
+static int fast_vfork = 1;
+#endif
+SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
+
+/*
+ * These are the structures used to create a callout list for things to do
+ * when forking a process.
+ */
+typedef struct fork_list_element {
+ struct fork_list_element *next;
+ forklist_fn function;
+} *fle_p;
+
+static fle_p fork_list;
+
+#ifndef _SYS_SYSPROTO_H_
+struct fork_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+fork(p, uap)
+ struct proc *p;
+ struct fork_args *uap;
+{
+
+ return (fork1(p, RFFDG | RFPROC));
+}
+
+/* ARGSUSED */
+int
+vfork(p, uap)
+ struct proc *p;
+ struct vfork_args *uap;
+{
+
+ return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? RFMEM : 0)));
+}
+
+/* ARGSUSED */
+int
+rfork(p, uap)
+ struct proc *p;
+ struct rfork_args *uap;
+{
+
+ return (fork1(p, uap->flags));
+}
+
+
+int nprocs = 1; /* process 0 */
+static int nextpid = 0;
+
+int
+fork1(p1, flags)
+ register struct proc *p1;
+ int flags;
+{
+ register struct proc *p2, *pptr;
+ register uid_t uid;
+ struct proc *newproc;
+ int count;
+ static int pidchecked = 0;
+	fle_p ep;
+
+ ep = fork_list;
+
+ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ return (EINVAL);
+
+#ifdef SMP
+ /*
+ * FATAL now, we cannot have the same PTD on both cpus, the PTD
+ * needs to move out of PTmap and be per-process, even for shared
+ * page table processes. Unfortunately, this means either removing
+ * PTD[] as a fixed virtual address, or move it to the per-cpu map
+	 * area for SMP mode.  Both cases require separate management of
+ * the per-process-even-if-PTmap-is-shared PTD.
+ */
+ if (flags & RFMEM) {
+ printf("shared address space fork attempted: pid: %d\n",
+ p1->p_pid);
+ return (EOPNOTSUPP);
+ }
+#endif
+
+ /*
+ * Here we don't create a new process, but we divorce
+ * certain parts of a process from itself.
+ */
+ if ((flags & RFPROC) == 0) {
+
+ /*
+		 * Divorce the memory. If it is shared, this essentially
+		 * turns memory shared amongst threads into local
+		 * copy-on-write memory.
+ */
+ if ((flags & RFMEM) == 0) {
+ if (p1->p_vmspace->vm_refcnt > 1) {
+ vmspace_unshare(p1);
+ }
+ }
+
+ /*
+ * Close all file descriptors.
+ */
+ if (flags & RFCFDG) {
+ struct filedesc *fdtmp;
+ fdtmp = fdinit(p1);
+ fdfree(p1);
+ p1->p_fd = fdtmp;
+ }
+
+ /*
+ * Unshare file descriptors (from parent.)
+ */
+ if (flags & RFFDG) {
+ if (p1->p_fd->fd_refcnt > 1) {
+ struct filedesc *newfd;
+ newfd = fdcopy(p1);
+ fdfree(p1);
+ p1->p_fd = newfd;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Although process entries are dynamically created, we still keep
+ * a global limit on the maximum number we will create. Don't allow
+ * a nonprivileged user to use the last process; don't let root
+ * exceed the limit. The variable nprocs is the current number of
+ * processes, maxproc is the limit.
+ */
+ uid = p1->p_cred->p_ruid;
+ if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
+ tablefull("proc");
+ return (EAGAIN);
+ }
+ /*
+ * Increment the nprocs resource before blocking can occur. There
+ * are hard-limits as to the number of processes that can run.
+ */
+ nprocs++;
+
+ /*
+ * Increment the count of procs running with this uid. Don't allow
+ * a nonprivileged user to exceed their current limit.
+ */
+ count = chgproccnt(uid, 1);
+ if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
+ (void)chgproccnt(uid, -1);
+ /*
+ * Back out the process count
+ */
+ nprocs--;
+ return (EAGAIN);
+ }
+
+ /* Allocate new proc. */
+ newproc = zalloc(proc_zone);
+
+	/*
+	 * Set up linkage for kernel-based threading.
+	 */
+ if((flags & RFTHREAD) != 0) {
+ newproc->p_peers = p1->p_peers;
+ p1->p_peers = newproc;
+ newproc->p_leader = p1->p_leader;
+ } else {
+ newproc->p_peers = 0;
+ newproc->p_leader = newproc;
+ }
+
+ newproc->p_wakeup = 0;
+
+ /*
+ * Find an unused process ID. We remember a range of unused IDs
+ * ready to use (from nextpid+1 through pidchecked-1).
+ */
+ nextpid++;
+retry:
+ /*
+ * If the process ID prototype has wrapped around,
+ * restart somewhat above 0, as the low-numbered procs
+ * tend to include daemons that don't exit.
+ */
+ if (nextpid >= PID_MAX) {
+ nextpid = 100;
+ pidchecked = 0;
+ }
+ if (nextpid >= pidchecked) {
+ int doingzomb = 0;
+
+ pidchecked = PID_MAX;
+ /*
+ * Scan the active and zombie procs to check whether this pid
+ * is in use. Remember the lowest pid that's greater
+ * than nextpid, so we can avoid checking for a while.
+ */
+ p2 = allproc.lh_first;
+again:
+ for (; p2 != 0; p2 = p2->p_list.le_next) {
+ while (p2->p_pid == nextpid ||
+ p2->p_pgrp->pg_id == nextpid ||
+ p2->p_session->s_sid == nextpid) {
+ nextpid++;
+ if (nextpid >= pidchecked)
+ goto retry;
+ }
+ if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
+ pidchecked = p2->p_pid;
+ if (p2->p_pgrp->pg_id > nextpid &&
+ pidchecked > p2->p_pgrp->pg_id)
+ pidchecked = p2->p_pgrp->pg_id;
+ if (p2->p_session->s_sid > nextpid &&
+ pidchecked > p2->p_session->s_sid)
+ pidchecked = p2->p_session->s_sid;
+ }
+ if (!doingzomb) {
+ doingzomb = 1;
+ p2 = zombproc.lh_first;
+ goto again;
+ }
+ }
+
+ p2 = newproc;
+ p2->p_stat = SIDL; /* protect against others */
+ p2->p_pid = nextpid;
+ LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+
+ /*
+ * Make a proc table entry for the new process.
+ * Start by zeroing the section of proc that is zero-initialized,
+ * then copy the section that is copied directly from the parent.
+ */
+ bzero(&p2->p_startzero,
+ (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
+ bcopy(&p1->p_startcopy, &p2->p_startcopy,
+ (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
+
+ p2->p_aioinfo = NULL;
+
+ /*
+ * Duplicate sub-structures as needed.
+ * Increase reference counts on shared objects.
+ * The p_stats and p_sigacts substructs are set in vm_fork.
+ */
+ p2->p_flag = P_INMEM;
+ if (p1->p_flag & P_PROFIL)
+ startprofclock(p2);
+ MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
+ M_SUBPROC, M_WAITOK);
+ bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
+ p2->p_cred->p_refcnt = 1;
+ crhold(p1->p_ucred);
+
+#ifdef COMPAT_LINUX_THREADS
+ if (flags & RFSIGSHARE) {
+ p2->p_procsig = p1->p_procsig;
+ p2->p_procsig->ps_refcnt++;
+ if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
+ struct sigacts *newsigacts;
+ int s;
+
+ if (p2->p_procsig->ps_refcnt != 2)
+ printf ("PID:%d Creating shared sigacts with procsig->ps_refcnt %d\n",
+ p2->p_pid, p2->p_procsig->ps_refcnt);
+ /* Create the shared sigacts structure */
+ MALLOC (newsigacts, struct sigacts *, sizeof (struct sigacts),
+ M_SUBPROC, M_WAITOK);
+ s = splhigh();
+ /* Set p_sigacts to the new shared structure. Note that this
+ * is updating p1->p_sigacts at the same time, since p_sigacts
+ * is just a pointer to the shared p_procsig->ps_sigacts.
+ */
+ p2->p_sigacts = newsigacts;
+ /* Copy in the values from the u area */
+ *p2->p_sigacts = p1->p_addr->u_sigacts;
+ splx (s);
+ }
+ } else {
+ MALLOC (p2->p_procsig, struct procsig *, sizeof(struct procsig),
+ M_SUBPROC, M_WAITOK);
+ bcopy(&p1->p_procsig->ps_begincopy, &p2->p_procsig->ps_begincopy,
+ (unsigned)&p1->p_procsig->ps_endcopy -
+ (unsigned)&p1->p_procsig->ps_begincopy);
+ p2->p_procsig->ps_refcnt = 1;
+ /* Note that we fill in the values of sigacts in vm_fork */
+ p2->p_sigacts = NULL;
+ }
+ if (flags & RFLINUXTHPN) {
+ p2->p_sigparent = SIGUSR1;
+ }
+#endif /* COMPAT_LINUX_THREADS */
+ /* bump references to the text vnode (for procfs) */
+ p2->p_textvp = p1->p_textvp;
+ if (p2->p_textvp)
+ VREF(p2->p_textvp);
+
+ if (flags & RFCFDG)
+ p2->p_fd = fdinit(p1);
+ else if (flags & RFFDG)
+ p2->p_fd = fdcopy(p1);
+ else
+ p2->p_fd = fdshare(p1);
+
+ /*
+ * If p_limit is still copy-on-write, bump refcnt,
+ * otherwise get a copy that won't be modified.
+ * (If PL_SHAREMOD is clear, the structure is shared
+ * copy-on-write.)
+ */
+ if (p1->p_limit->p_lflags & PL_SHAREMOD)
+ p2->p_limit = limcopy(p1->p_limit);
+ else {
+ p2->p_limit = p1->p_limit;
+ p2->p_limit->p_refcnt++;
+ }
+
+ /*
+ * Preserve some more flags in subprocess. P_PROFIL has already
+ * been preserved.
+ */
+ p2->p_flag |= p1->p_flag & P_SUGID;
+ if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
+ p2->p_flag |= P_CONTROLT;
+ if (flags & RFPPWAIT)
+ p2->p_flag |= P_PPWAIT;
+
+ LIST_INSERT_AFTER(p1, p2, p_pglist);
+
+ /*
+ * Attach the new process to its parent.
+ *
+ * If RFNOWAIT is set, the newly created process becomes a child
+ * of init. This effectively disassociates the child from the
+ * parent.
+ */
+ if (flags & RFNOWAIT)
+ pptr = initproc;
+ else
+ pptr = p1;
+ p2->p_pptr = pptr;
+ LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+ LIST_INIT(&p2->p_children);
+
+#ifdef KTRACE
+ /*
+ * Copy traceflag and tracefile if enabled.
+ * If not inherited, these were zeroed above.
+ */
+ if (p1->p_traceflag&KTRFAC_INHERIT) {
+ p2->p_traceflag = p1->p_traceflag;
+ if ((p2->p_tracep = p1->p_tracep) != NULL)
+ VREF(p2->p_tracep);
+ }
+#endif
+
+ /*
+ * set priority of child to be that of parent
+ */
+ p2->p_estcpu = p1->p_estcpu;
+
+ /*
+ * This begins the section where we must prevent the parent
+ * from being swapped.
+ */
+ p1->p_flag |= P_NOSWAP;
+
+ /*
+ * Finish creating the child process. It will return via a different
+ * execution path later. (ie: directly into user mode)
+ */
+ vm_fork(p1, p2, flags);
+
+ /*
+ * Both processes are set up, now check if any LKMs want
+ * to adjust anything.
+ * What if they have an error? XXX
+ */
+ while (ep) {
+ (*ep->function)(p1, p2, flags);
+ ep = ep->next;
+ }
+
+ /*
+ * Make child runnable and add to run queue.
+ */
+ microtime(&(p2->p_stats->p_start));
+ p2->p_acflag = AFORK;
+ (void) splhigh();
+ p2->p_stat = SRUN;
+ setrunqueue(p2);
+ (void) spl0();
+
+ /*
+ * Now can be swapped.
+ */
+ p1->p_flag &= ~P_NOSWAP;
+
+ /*
+ * Preserve synchronization semantics of vfork. If waiting for
+ * child to exec or exit, set P_PPWAIT on child, and sleep on our
+ * proc (in case of exit).
+ */
+ while (p2->p_flag & P_PPWAIT)
+ tsleep(p1, PWAIT, "ppwait", 0);
+
+ /*
+ * Return child pid to parent process,
+ * marking us as parent via p1->p_retval[1].
+ */
+ p1->p_retval[0] = p2->p_pid;
+ p1->p_retval[1] = 0;
+ return (0);
+}
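The PID search above amortizes its cost: one O(n) scan both validates nextpid
and computes pidchecked, the lowest in-use ID greater than nextpid, so every
allocation inside (nextpid, pidchecked) is handed out without rescanning. A
simplified sketch of the idea (an array of used IDs stands in for the allproc
and zombproc lists; constants and names invented):

	#define ID_MAX	30000

	static int used[] = { 100, 101, 105 };	/* in-use IDs, any order */
	static int nused = 3;
	static int nextid = 99, idchecked = 0;

	int
	alloc_id(void)
	{
		int i;

		nextid++;
	retry:
		if (nextid >= ID_MAX) {
			nextid = 100;	/* skip low, long-lived IDs */
			idchecked = 0;
		}
		if (nextid >= idchecked) {
			idchecked = ID_MAX;
			for (i = 0; i < nused; i++) {
				if (used[i] == nextid) {
					nextid++;
					if (nextid >= idchecked)
						goto retry;
				}
				/* remember lowest in-use ID above nextid */
				if (used[i] > nextid && used[i] < idchecked)
					idchecked = used[i];
			}
		}
		return (nextid);
	}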
+
+/*
+ * The next two functions are general routines to handle adding/deleting
+ * items on the fork callout list.
+ *
+ * at_fork():
+ *	Take the arguments given and put them onto the fork callout list.
+ *	However, first make sure that it's not already there.
+ * Returns 0 on success or a standard error number.
+ */
+int
+at_fork(function)
+ forklist_fn function;
+{
+ fle_p ep;
+
+ /* let the programmer know if he's been stupid */
+ if (rm_at_fork(function))
+ printf("fork callout entry already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->next = fork_list;
+ ep->function = function;
+ fork_list = ep;
+ return (0);
+}
+
+/*
+ * Scan the fork callout list for the given items and remove them.
+ * Returns the number of items removed.
+ * Theoretically this value can only be 0 or 1.
+ */
+int
+rm_at_fork(function)
+ forklist_fn function;
+{
+ fle_p *epp, ep;
+ int count;
+
+	count = 0;
+ epp = &fork_list;
+ ep = *epp;
+ while (ep) {
+ if (ep->function == function) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ return (count);
+}
diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c
new file mode 100644
index 0000000..1d6756c
--- /dev/null
+++ b/sys/kern/kern_intr.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: kern_intr.c,v 1.20 1998/09/26 14:25:31 dfr Exp $
+ *
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/errno.h>
+#ifdef RESOURCE_CHECK
+#include <sys/drvresource.h>
+#endif /* RESOURCE_CHECK */
+
+#include <machine/ipl.h>
+
+#ifdef __i386__
+#include <i386/isa/icu.h>
+#include <i386/isa/intr_machdep.h>
+#endif
+
+#include <sys/interrupt.h>
+
+#include <stddef.h>
+
+#ifdef __i386__
+
+typedef struct intrec {
+ intrmask_t mask;
+ inthand2_t *handler;
+ void *argument;
+ struct intrec *next;
+ void *devdata;
+ int intr;
+ intrmask_t *maskptr;
+ int flags;
+} intrec;
+
+static intrec *intreclist_head[NHWI];
+
+#endif
+
+struct swilist {
+ swihand_t *sl_handler;
+ struct swilist *sl_next;
+};
+
+static struct swilist swilists[NSWI];
+
+#ifdef __i386__
+
+/*
+ * The interrupt multiplexer calls each of the handlers in turn,
+ * and applies the associated interrupt mask to "cpl", which is
+ * defined as a ".long" in /sys/i386/isa/ipl.s
+ */
+
+#ifndef SMP
+static __inline intrmask_t
+splq(intrmask_t mask)
+{
+ intrmask_t tmp = cpl;
+ cpl |= mask;
+ return (tmp);
+}
+#endif /* SMP */
+
+static void
+intr_mux(void *arg)
+{
+ intrec *p = arg;
+
+ while (p != NULL) {
+ int oldspl = splq(p->mask);
+ p->handler(p->argument);
+ splx(oldspl);
+ p = p->next;
+ }
+}
+
+static intrec*
+find_idesc(unsigned *maskptr, int irq)
+{
+ intrec *p = intreclist_head[irq];
+
+ while (p && p->maskptr != maskptr)
+ p = p->next;
+
+ return (p);
+}
+
+static intrec**
+find_pred(intrec *idesc, int irq)
+{
+ intrec **pp = &intreclist_head[irq];
+ intrec *p = *pp;
+
+ while (p != idesc) {
+ if (p == NULL)
+ return (NULL);
+ pp = &p->next;
+ p = *pp;
+ }
+ return (pp);
+}
+
+/*
+ * Both the low level handler and the shared interrupt multiplexer
+ * block out further interrupts as set in the handler's "mask", while
+ * the handler is running. In fact *maskptr should be used for this
+ * purpose, but since this would require one more pointer dereference on
+ * each interrupt, we instead update "mask" whenever *maskptr
+ * changes. The function "update_masks" should be called **after**
+ * all manipulation of the linked list of interrupt handlers hung
+ * off of intreclist_head[irq] is complete, since the chain of handlers
+ * determines both the *maskptr values and the instances of mask
+ * that are fixed. This function should be called with the irq for
+ * which a new handler has been added blocked, since the masks may not
+ * yet know about the use of this irq for a device of a certain class.
+ */
+
+static void
+update_mux_masks(void)
+{
+ int irq;
+ for (irq = 0; irq < ICU_LEN; irq++) {
+ intrec *idesc = intreclist_head[irq];
+ while (idesc != NULL) {
+ if (idesc->maskptr != NULL) {
+ /* our copy of *maskptr may be stale, refresh */
+ idesc->mask = *idesc->maskptr;
+ }
+ idesc = idesc->next;
+ }
+ }
+}
+
+static void
+update_masks(intrmask_t *maskptr, int irq)
+{
+ intrmask_t mask = 1 << irq;
+
+ if (maskptr == NULL)
+ return;
+
+ if (find_idesc(maskptr, irq) == NULL) {
+ /* no reference to this maskptr was found in this irq's chain */
+ if ((*maskptr & mask) == 0)
+ return;
+ /* the irq was included in the classes mask, remove it */
+ INTRUNMASK(*maskptr, mask);
+ } else {
+ /* a reference to this maskptr was found in this irq's chain */
+ if ((*maskptr & mask) != 0)
+ return;
+ /* put the irq into the classes mask */
+ INTRMASK(*maskptr, mask);
+ }
+ /* we need to update all values in the intr_mask[irq] array */
+ update_intr_masks();
+ /* update mask in chains of the interrupt multiplex handler as well */
+ update_mux_masks();
+}
+
+/*
+ * Add interrupt handler to linked list hung off of intreclist_head[irq]
+ * and install shared interrupt multiplex handler, if necessary
+ */
+
+static int
+add_intrdesc(intrec *idesc)
+{
+ int irq = idesc->intr;
+
+ intrec *head = intreclist_head[irq];
+
+ if (head == NULL) {
+ /* first handler for this irq, just install it */
+ if (icu_setup(irq, idesc->handler, idesc->argument,
+ idesc->maskptr, idesc->flags) != 0)
+ return (-1);
+
+ update_intrname(irq, (intptr_t)idesc->devdata);
+ /* keep reference */
+ intreclist_head[irq] = idesc;
+ } else {
+ if ((idesc->flags & INTR_EXCL) != 0
+ || (head->flags & INTR_EXCL) != 0) {
+ /*
+			 * Can't append the new handler if either the list
+			 * head or the new handler does not allow interrupts
+			 * to be shared.
+ */
+ if (bootverbose)
+ printf("\tdevice combination doesn't support "
+ "shared irq%d\n", irq);
+ return (-1);
+ }
+ if (head->next == NULL) {
+ /*
+ * second handler for this irq, replace device driver's
+ * handler by shared interrupt multiplexer function
+ */
+ icu_unset(irq, head->handler);
+ if (icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0) != 0)
+ return (-1);
+ if (bootverbose)
+ printf("\tusing shared irq%d.\n", irq);
+ update_intrname(irq, -1);
+ }
+ /* just append to the end of the chain */
+ while (head->next != NULL)
+ head = head->next;
+ head->next = idesc;
+ }
+ update_masks(idesc->maskptr, irq);
+ return (0);
+}
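The promotion step above - install the driver's handler directly for a single
user, then swap in intr_mux with the chain head as its argument once a second
handler arrives - can be shown in miniature (userspace sketch: the vector-slot
variables stand in for icu_setup()/icu_unset(), and all names are invented):

	#include <stddef.h>

	typedef void (*isr_t)(void *);

	struct rec {
		struct rec *next;
		isr_t fn;
		void *arg;
	};

	static isr_t slot_fn;		/* stand-in for the ICU vector slot */
	static void *slot_arg;
	static struct rec *head;

	/* Shared-interrupt dispatcher: walk the chain, as intr_mux does. */
	static void
	mux(void *arg)
	{
		struct rec *r;

		for (r = arg; r != NULL; r = r->next)
			r->fn(r->arg);
	}

	void
	attach(struct rec *r)
	{
		struct rec *t;

		r->next = NULL;
		if (head == NULL) {
			/* first handler: call it directly */
			slot_fn = r->fn;
			slot_arg = r->arg;
			head = r;
			return;
		}
		if (head->next == NULL) {
			/* second handler: promote to the multiplexer */
			slot_fn = mux;
			slot_arg = head;
		}
		for (t = head; t->next != NULL; t = t->next)
			;
		t->next = r;		/* append at the end of the chain */
	}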
+
+/*
+ * Add the interrupt handler descriptor data structure created by an
+ * earlier call of create_intr() to the linked list for its irq and
+ * adjust the interrupt masks if necessary.
+ *
+ * This function effectively activates the handler.
+ */
+
+int
+intr_connect(intrec *idesc)
+{
+ int errcode = -1;
+ int irq;
+
+#ifdef RESOURCE_CHECK
+ int resflag;
+#endif /* RESOURCE_CHECK */
+
+ if (idesc == NULL)
+ return (-1);
+
+ irq = idesc->intr;
+#ifdef RESOURCE_CHECK
+ resflag = (idesc->flags & INTR_EXCL) ? RESF_NONE : RESF_SHARED;
+ if (resource_claim(idesc->devdata, REST_INT, resflag, irq, irq) == 0)
+#endif /* RESOURCE_CHECK */
+ {
+ /* block this irq */
+ intrmask_t oldspl = splq(1 << irq);
+
+ /* add irq to class selected by maskptr */
+ errcode = add_intrdesc(idesc);
+ splx(oldspl);
+ }
+ if (errcode != 0 && bootverbose)
+ printf("\tintr_connect(irq%d) failed, result=%d\n",
+ irq, errcode);
+
+ return (errcode);
+}
+
+/*
+ * Remove the interrupt handler descriptor data structure connected by an
+ * earlier call of intr_connect() from the linked list and adjust the
+ * interrupt masks if necessary.
+ *
+ * This function deactivates the handler.
+ */
+
+int
+intr_disconnect(intrec *idesc)
+{
+ intrec **hook, *head;
+ int irq;
+ int errcode = 0;
+
+ if (idesc == NULL)
+ return (-1);
+
+ irq = idesc->intr;
+
+ /* find pointer that keeps the reference to this interrupt descriptor */
+ hook = find_pred(idesc, irq);
+ if (hook == NULL)
+ return (-1);
+
+ /* make copy of original list head, the line after may overwrite it */
+ head = intreclist_head[irq];
+
+ /* unlink: make predecessor point to idesc->next instead of to idesc */
+ *hook = idesc->next;
+
+ /* now check whether the element we removed was the list head */
+ if (idesc == head) {
+ intrmask_t oldspl = splq(1 << irq);
+
+ /* we want to remove the list head, which was known to intr_mux */
+ icu_unset(irq, (inthand2_t*)intr_mux);
+
+ /* check whether the new list head is the only element on list */
+ head = intreclist_head[irq];
+ if (head != NULL) {
+ if (head->next != NULL) {
+ /* install the multiplex handler with new list head as argument */
+ errcode = icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0);
+ if (errcode == 0)
+ update_intrname(irq, -1);
+ } else {
+ /* install the one remaining handler for this irq */
+ errcode = icu_setup(irq, head->handler,
+ head->argument,
+ head->maskptr, head->flags);
+ if (errcode == 0)
+ update_intrname(irq, (intptr_t)head->devdata);
+ }
+ }
+ splx(oldspl);
+ }
+ update_masks(idesc->maskptr, irq);
+#ifdef RESOURCE_CHECK
+ resource_free(idesc->devdata);
+#endif /* RESOURCE_CHECK */
+ return (0);
+}
+
+/*
+ * Create an interrupt handler descriptor data structure, which later can
+ * be activated or deactivated at will by calls of [dis]connect(intrec*).
+ *
+ * The dev_instance pointer is required for resource management, and will
+ * only be passed through to resource_claim().
+ *
+ * The interrupt handler takes an argument of type (void*), which is not
+ * what is currently used for ISA devices. But since the unit number passed
+ * to an ISA interrupt handler can be stored in a (void*) variable, this
+ * causes no problems. Eventually all the ISA interrupt handlers should be
+ * modified to accept the pointer to their private data, too, instead of
+ * an integer index.
+ *
+ * There will be functions that derive a driver and unit name from a
+ * dev_instance variable, and those functions will be used to maintain the
+ * interrupt counter label array referenced by systat and vmstat to report
+ * device interrupt rates (->update_intrlabels).
+ */
+
+intrec *
+intr_create(void *dev_instance, int irq, inthand2_t handler, void *arg,
+ intrmask_t *maskptr, int flags)
+{
+ intrec *idesc;
+
+	if (ICU_LEN > 8 * sizeof *maskptr) {
+		printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n",
+		       ICU_LEN, (int)(8 * sizeof *maskptr));
+ return (NULL);
+ }
+ if ((unsigned)irq >= ICU_LEN) {
+ printf("create_intr: requested irq%d too high, limit is %d\n",
+		       irq, ICU_LEN - 1);
+ return (NULL);
+ }
+
+ idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK);
+ if (idesc) {
+		bzero(idesc, sizeof *idesc);	/* also clears idesc->next */
+
+ idesc->devdata = dev_instance;
+ idesc->handler = handler;
+ idesc->argument = arg;
+ idesc->maskptr = maskptr;
+ idesc->intr = irq;
+ idesc->flags = flags;
+ }
+ return (idesc);
+}
+
+/*
+ * Return the memory held by the interrupt handler descriptor data structure
+ * to the system. Make sure the handler is not in active use anymore before
+ * calling this.
+ */
+
+int
+intr_destroy(intrec *rec)
+{
+ if (intr_disconnect(rec) != 0)
+ return (-1);
+ free(rec, M_DEVBUF);
+ return (0);
+}
+
+/*
+ * Emulate the register_intr() call previously defined as a low level
+ * function. That function (now icu_setup()) may no longer be called
+ * directly, since a conflict between an ISA and a PCI interrupt might
+ * otherwise go unnoticed.
+ */
+
+int
+register_intr(int intr, int device_id, u_int flags,
+ inthand2_t handler, u_int *maskptr, int unit)
+{
+ /* XXX modify to include isa_device instead of device_id */
+ intrec *idesc;
+
+ flags |= INTR_EXCL;
+ idesc = intr_create((void *)(intptr_t)device_id, intr, handler,
+ (void*)(intptr_t)unit, maskptr, flags);
+ return (intr_connect(idesc));
+}
+
+/*
+ * Emulate the old unregister_intr() low level function.
+ * Make sure there is just one handler registered for this interrupt, that
+ * it was registered as non-shared, and that the handlers match.
+ */
+
+int
+unregister_intr(int intr, inthand2_t handler)
+{
+ intrec *p = intreclist_head[intr];
+
+ if (p != NULL && (p->flags & INTR_EXCL) != 0 && p->handler == handler)
+ return (intr_destroy(p));
+ return (EINVAL);
+}
+
+#endif /* __i386__ */
+
+void
+register_swi(intr, handler)
+ int intr;
+ swihand_t *handler;
+{
+ struct swilist *slp, *slq;
+ int s;
+
+ if (intr < NHWI || intr >= NHWI + NSWI)
+ panic("register_swi: bad intr %d", intr);
+ if (handler == swi_generic || handler == swi_null)
+ panic("register_swi: bad handler %p", (void *)handler);
+ slp = &swilists[intr - NHWI];
+ s = splhigh();
+ if (ihandlers[intr] == swi_null)
+ ihandlers[intr] = handler;
+ else {
+ if (slp->sl_next == NULL) {
+ slp->sl_handler = ihandlers[intr];
+ ihandlers[intr] = swi_generic;
+ }
+ slq = malloc(sizeof(*slq), M_DEVBUF, M_NOWAIT);
+ if (slq == NULL)
+ panic("register_swi: malloc failed");
+ slq->sl_handler = handler;
+ slq->sl_next = NULL;
+ while (slp->sl_next != NULL)
+ slp = slp->sl_next;
+ slp->sl_next = slq;
+ }
+ splx(s);
+}
+
+void
+swi_dispatcher(intr)
+ int intr;
+{
+ struct swilist *slp;
+
+ slp = &swilists[intr - NHWI];
+ do {
+ (*slp->sl_handler)();
+ slp = slp->sl_next;
+ } while (slp != NULL);
+}
+
+void
+unregister_swi(intr, handler)
+ int intr;
+ swihand_t *handler;
+{
+ struct swilist *slfoundpred, *slp, *slq;
+ int s;
+
+ if (intr < NHWI || intr >= NHWI + NSWI)
+ panic("unregister_swi: bad intr %d", intr);
+ if (handler == swi_generic || handler == swi_null)
+ panic("unregister_swi: bad handler %p", (void *)handler);
+ slp = &swilists[intr - NHWI];
+ s = splhigh();
+ if (ihandlers[intr] == handler)
+ ihandlers[intr] = swi_null;
+ else if (slp->sl_next != NULL) {
+ slfoundpred = NULL;
+ for (slq = slp->sl_next; slq != NULL;
+ slp = slq, slq = slp->sl_next)
+ if (slq->sl_handler == handler)
+ slfoundpred = slp;
+ slp = &swilists[intr - NHWI];
+ if (slfoundpred != NULL) {
+ slq = slfoundpred->sl_next;
+ slfoundpred->sl_next = slq->sl_next;
+ free(slq, M_DEVBUF);
+ } else if (slp->sl_handler == handler) {
+ slq = slp->sl_next;
+ slp->sl_next = slq->sl_next;
+ slp->sl_handler = slq->sl_handler;
+ free(slq, M_DEVBUF);
+ }
+ if (slp->sl_next == NULL)
+ ihandlers[intr] = slp->sl_handler;
+ }
+ splx(s);
+}
+
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 0000000..7a6d237
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
+ * $Id: kern_ktrace.c,v 1.24 1998/11/10 09:16:29 peter Exp $
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/lock.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/ktrace.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+
+static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
+
+#ifdef KTRACE
+static struct ktr_header *ktrgetheader __P((int type));
+static void ktrwrite __P((struct vnode *, struct ktr_header *));
+static int ktrcanset __P((struct proc *,struct proc *));
+static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *));
+static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *));
+
+
+static struct ktr_header *
+ktrgetheader(type)
+ int type;
+{
+ register struct ktr_header *kth;
+ struct proc *p = curproc; /* XXX */
+
+ MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header),
+ M_KTRACE, M_WAITOK);
+ kth->ktr_type = type;
+ microtime(&kth->ktr_time);
+ kth->ktr_pid = p->p_pid;
+ bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN);
+ return (kth);
+}
+
+void
+ktrsyscall(vp, code, narg, args)
+ struct vnode *vp;
+ int code, narg, args[];
+{
+ struct ktr_header *kth;
+ struct ktr_syscall *ktp;
+ register int len = sizeof(struct ktr_syscall) + (narg * sizeof(int));
+ struct proc *p = curproc; /* XXX */
+ int *argp, i;
+
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_SYSCALL);
+ MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK);
+ ktp->ktr_code = code;
+ ktp->ktr_narg = narg;
+ argp = (int *)((char *)ktp + sizeof(struct ktr_syscall));
+ for (i = 0; i < narg; i++)
+ *argp++ = args[i];
+ kth->ktr_buf = (caddr_t)ktp;
+ kth->ktr_len = len;
+ ktrwrite(vp, kth);
+ FREE(ktp, M_KTRACE);
+ FREE(kth, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+
+void
+ktrsysret(vp, code, error, retval)
+ struct vnode *vp;
+ int code, error, retval;
+{
+ struct ktr_header *kth;
+ struct ktr_sysret ktp;
+ struct proc *p = curproc; /* XXX */
+
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_SYSRET);
+ ktp.ktr_code = code;
+ ktp.ktr_error = error;
+ ktp.ktr_retval = retval; /* what about val2 ? */
+
+ kth->ktr_buf = (caddr_t)&ktp;
+ kth->ktr_len = sizeof(struct ktr_sysret);
+
+ ktrwrite(vp, kth);
+ FREE(kth, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+
+void
+ktrnamei(vp, path)
+ struct vnode *vp;
+ char *path;
+{
+ struct ktr_header *kth;
+ struct proc *p = curproc; /* XXX */
+
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_NAMEI);
+ kth->ktr_len = strlen(path);
+ kth->ktr_buf = path;
+
+ ktrwrite(vp, kth);
+ FREE(kth, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+
+void
+ktrgenio(vp, fd, rw, iov, len, error)
+ struct vnode *vp;
+ int fd;
+ enum uio_rw rw;
+ register struct iovec *iov;
+ int len, error;
+{
+ struct ktr_header *kth;
+ register struct ktr_genio *ktp;
+ register caddr_t cp;
+ register int resid = len, cnt;
+ struct proc *p = curproc; /* XXX */
+
+ if (error)
+ return;
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_GENIO);
+ MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len,
+ M_KTRACE, M_WAITOK);
+ ktp->ktr_fd = fd;
+ ktp->ktr_rw = rw;
+ cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio));
+ while (resid > 0) {
+ if ((cnt = iov->iov_len) > resid)
+ cnt = resid;
+ if (copyin(iov->iov_base, cp, (unsigned)cnt))
+ goto done;
+ cp += cnt;
+ resid -= cnt;
+ iov++;
+ }
+ kth->ktr_buf = (caddr_t)ktp;
+ kth->ktr_len = sizeof (struct ktr_genio) + len;
+
+ ktrwrite(vp, kth);
+done:
+ FREE(kth, M_KTRACE);
+ FREE(ktp, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+
+void
+ktrpsig(vp, sig, action, mask, code)
+ struct vnode *vp;
+ int sig;
+ sig_t action;
+ int mask, code;
+{
+ struct ktr_header *kth;
+ struct ktr_psig kp;
+ struct proc *p = curproc; /* XXX */
+
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_PSIG);
+ kp.signo = (char)sig;
+ kp.action = action;
+ kp.mask = mask;
+ kp.code = code;
+ kth->ktr_buf = (caddr_t)&kp;
+ kth->ktr_len = sizeof (struct ktr_psig);
+
+ ktrwrite(vp, kth);
+ FREE(kth, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+
+void
+ktrcsw(vp, out, user)
+ struct vnode *vp;
+ int out, user;
+{
+ struct ktr_header *kth;
+ struct ktr_csw kc;
+ struct proc *p = curproc; /* XXX */
+
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_CSW);
+ kc.out = out;
+ kc.user = user;
+ kth->ktr_buf = (caddr_t)&kc;
+ kth->ktr_len = sizeof (struct ktr_csw);
+
+ ktrwrite(vp, kth);
+ FREE(kth, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+}
+#endif
+
+/* Interface and common routines */
+
+/*
+ * ktrace system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ktrace_args {
+ char *fname;
+ int ops;
+ int facs;
+ int pid;
+};
+#endif
+/* ARGSUSED */
+int
+ktrace(curp, uap)
+ struct proc *curp;
+ register struct ktrace_args *uap;
+{
+#ifdef KTRACE
+ register struct vnode *vp = NULL;
+ register struct proc *p;
+ struct pgrp *pg;
+ int facs = uap->facs & ~KTRFAC_ROOT;
+ int ops = KTROP(uap->ops);
+ int descend = uap->ops & KTRFLAG_DESCEND;
+ int ret = 0;
+ int error = 0;
+ struct nameidata nd;
+
+ curp->p_traceflag |= KTRFAC_ACTIVE;
+ if (ops != KTROP_CLEAR) {
+ /*
+ * an operation which requires a file argument.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp);
+ error = vn_open(&nd, FREAD|FWRITE, 0);
+ if (error) {
+ curp->p_traceflag &= ~KTRFAC_ACTIVE;
+ return (error);
+ }
+ vp = nd.ni_vp;
+ VOP_UNLOCK(vp, 0, curp);
+ if (vp->v_type != VREG) {
+ (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp);
+ curp->p_traceflag &= ~KTRFAC_ACTIVE;
+ return (EACCES);
+ }
+ }
+ /*
+ * Clear all uses of the tracefile
+ */
+ if (ops == KTROP_CLEARFILE) {
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ if (p->p_tracep == vp) {
+ if (ktrcanset(curp, p)) {
+ p->p_tracep = NULL;
+ p->p_traceflag = 0;
+ (void) vn_close(vp, FREAD|FWRITE,
+ p->p_ucred, p);
+ } else
+ error = EPERM;
+ }
+ }
+ goto done;
+ }
+ /*
+ * need something to (un)trace (XXX - why is this here?)
+ */
+ if (!facs) {
+ error = EINVAL;
+ goto done;
+ }
+ /*
+ * do it
+ */
+ if (uap->pid < 0) {
+ /*
+ * by process group
+ */
+ pg = pgfind(-uap->pid);
+ if (pg == NULL) {
+ error = ESRCH;
+ goto done;
+ }
+ for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next)
+ if (descend)
+ ret |= ktrsetchildren(curp, p, ops, facs, vp);
+ else
+ ret |= ktrops(curp, p, ops, facs, vp);
+
+ } else {
+ /*
+ * by pid
+ */
+ p = pfind(uap->pid);
+ if (p == NULL) {
+ error = ESRCH;
+ goto done;
+ }
+ if (descend)
+ ret |= ktrsetchildren(curp, p, ops, facs, vp);
+ else
+ ret |= ktrops(curp, p, ops, facs, vp);
+ }
+ if (!ret)
+ error = EPERM;
+done:
+ if (vp != NULL)
+ (void) vn_close(vp, FWRITE, curp->p_ucred, curp);
+ curp->p_traceflag &= ~KTRFAC_ACTIVE;
+ return (error);
+#else
+ return ENOSYS;
+#endif
+}
+
+/*
+ * utrace system call
+ */
+/* ARGSUSED */
+int
+utrace(curp, uap)
+ struct proc *curp;
+ register struct utrace_args *uap;
+{
+#ifdef KTRACE
+ struct ktr_header *kth;
+ struct proc *p = curproc; /* XXX */
+ register caddr_t cp;
+
+ if (!KTRPOINT(p, KTR_USER))
+ return (0);
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_USER);
+ MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK);
+ if (!copyin(uap->addr, cp, uap->len)) {
+ kth->ktr_buf = cp;
+ kth->ktr_len = uap->len;
+ ktrwrite(p->p_tracep, kth);
+ }
+ FREE(kth, M_KTRACE);
+ FREE(cp, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+
+ return (0);
+#else
+ return (ENOSYS);
+#endif
+}
+
+#ifdef KTRACE
+static int
+ktrops(curp, p, ops, facs, vp)
+ struct proc *p, *curp;
+ int ops, facs;
+ struct vnode *vp;
+{
+
+ if (!ktrcanset(curp, p))
+ return (0);
+ if (ops == KTROP_SET) {
+ if (p->p_tracep != vp) {
+ /*
+ * if trace file already in use, relinquish
+ */
+ if (p->p_tracep != NULL)
+ vrele(p->p_tracep);
+ VREF(vp);
+ p->p_tracep = vp;
+ }
+ p->p_traceflag |= facs;
+ if (curp->p_ucred->cr_uid == 0)
+ p->p_traceflag |= KTRFAC_ROOT;
+ } else {
+ /* KTROP_CLEAR */
+ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
+ /* no more tracing */
+ p->p_traceflag = 0;
+ if (p->p_tracep != NULL) {
+ vrele(p->p_tracep);
+ p->p_tracep = NULL;
+ }
+ }
+ }
+
+ return (1);
+}
+
+static int
+ktrsetchildren(curp, top, ops, facs, vp)
+ struct proc *curp, *top;
+ int ops, facs;
+ struct vnode *vp;
+{
+ register struct proc *p;
+ register int ret = 0;
+
+ p = top;
+ for (;;) {
+ ret |= ktrops(curp, p, ops, facs, vp);
+ /*
+ * If this process has children, descend to them next,
+ * otherwise do any siblings, and if done with this level,
+ * follow back up the tree (but not past top).
+ */
+ if (p->p_children.lh_first)
+ p = p->p_children.lh_first;
+ else for (;;) {
+ if (p == top)
+ return (ret);
+ if (p->p_sibling.le_next) {
+ p = p->p_sibling.le_next;
+ break;
+ }
+ p = p->p_pptr;
+ }
+ }
+ /*NOTREACHED*/
+}
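+
+/*
+ * Illustrative walk of the loop above: for "top" with children A and B,
+ * where A has child A1, ktrops() is applied once each to top, A, A1 and
+ * B; the first-child link descends, the sibling link moves across, and
+ * p_pptr climbs back up, stopping when the walk returns to top. The
+ * order among siblings depends on list insertion order.
+ */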
+
+static void
+ktrwrite(vp, kth)
+ struct vnode *vp;
+ register struct ktr_header *kth;
+{
+ struct uio auio;
+ struct iovec aiov[2];
+ register struct proc *p = curproc; /* XXX */
+ int error;
+
+ if (vp == NULL)
+ return;
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ aiov[0].iov_base = (caddr_t)kth;
+ aiov[0].iov_len = sizeof(struct ktr_header);
+ auio.uio_resid = sizeof(struct ktr_header);
+ auio.uio_iovcnt = 1;
+ auio.uio_procp = curproc;
+ if (kth->ktr_len > 0) {
+ auio.uio_iovcnt++;
+ aiov[1].iov_base = kth->ktr_buf;
+ aiov[1].iov_len = kth->ktr_len;
+ auio.uio_resid += kth->ktr_len;
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ error = VOP_WRITE(vp, &auio, IO_UNIT|IO_APPEND, p->p_ucred);
+ VOP_UNLOCK(vp, 0, p);
+ if (!error)
+ return;
+ /*
+ * If error encountered, give up tracing on this vnode.
+ */
+ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
+ error);
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ if (p->p_tracep == vp) {
+ p->p_tracep = NULL;
+ p->p_traceflag = 0;
+ vrele(vp);
+ }
+ }
+}
+
+/*
+ * Return true if caller has permission to set the ktracing state
+ * of target. Essentially, the target can't possess any
+ * more permissions than the caller. KTRFAC_ROOT signifies that
+ * root previously set the tracing status on the target process, and
+ * so, only root may further change it.
+ *
+ * TODO: check groups. use caller effective gid.
+ */
+static int
+ktrcanset(callp, targetp)
+ struct proc *callp, *targetp;
+{
+ register struct pcred *caller = callp->p_cred;
+ register struct pcred *target = targetp->p_cred;
+
+ if ((caller->pc_ucred->cr_uid == target->p_ruid &&
+ target->p_ruid == target->p_svuid &&
+ caller->p_rgid == target->p_rgid && /* XXX */
+ target->p_rgid == target->p_svgid &&
+ (targetp->p_traceflag & KTRFAC_ROOT) == 0) ||
+ caller->pc_ucred->cr_uid == 0)
+ return (1);
+
+ return (0);
+}
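+
+/*
+ * Worked example (illustrative): a non-root caller may change tracing on
+ * a target only when the caller's effective uid equals the target's real
+ * uid, the caller's real gid equals the target's real gid, the target's
+ * real ids still equal its saved ids (no credential change), and
+ * KTRFAC_ROOT is clear; root may always do so.
+ */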
+
+#endif /* KTRACE */
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
new file mode 100644
index 0000000..97def9f
--- /dev/null
+++ b/sys/kern/kern_linker.c
@@ -0,0 +1,1016 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_linker.c,v 1.20 1999/01/19 16:26:32 peter Exp $
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <machine/cpu.h>
+#include <machine/bootinfo.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+#include <sys/unistd.h>
+#include <sys/fcntl.h>
+#include <sys/libkern.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/sysctl.h>
+
+#ifdef KLD_DEBUG
+int kld_debug = 0;
+#endif
+
+MALLOC_DEFINE(M_LINKER, "kld", "kernel linker");
+linker_file_t linker_current_file;
+linker_file_t linker_kernel_file;
+
+static struct lock lock; /* lock for the file list */
+static linker_class_list_t classes;
+static linker_file_list_t files;
+static int next_file_id = 1;
+
+static void
+linker_init(void* arg)
+{
+ lockinit(&lock, PVM, "klink", 0, 0);
+ TAILQ_INIT(&classes);
+ TAILQ_INIT(&files);
+}
+
+SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0);
+
+int
+linker_add_class(const char* desc, void* priv,
+ struct linker_class_ops* ops)
+{
+ linker_class_t lc;
+
+ lc = malloc(sizeof(struct linker_class), M_LINKER, M_NOWAIT);
+ if (!lc)
+ return ENOMEM;
+ bzero(lc, sizeof(*lc));
+
+ lc->desc = desc;
+ lc->priv = priv;
+ lc->ops = ops;
+ TAILQ_INSERT_HEAD(&classes, lc, link);
+
+ return 0;
+}
+
+static void
+linker_file_sysinit(linker_file_t lf)
+{
+ struct linker_set* sysinits;
+ struct sysinit** sipp;
+ struct sysinit** xipp;
+ struct sysinit* save;
+ moduledata_t *moddata;
+
+ KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
+ lf->filename));
+
+ sysinits = (struct linker_set*)
+ linker_file_lookup_symbol(lf, "sysinit_set", 0);
+
+ KLD_DPF(FILE, ("linker_file_sysinit: SYSINITs %p\n", sysinits));
+ if (!sysinits)
+ return;
+
+ /* HACK ALERT! */
+ for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) {
+ if ((*sipp)->func == module_register_init) {
+ moddata = (*sipp)->udata;
+ moddata->_file = lf;
+ }
+ }
+
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the
+ * operation which ensures continued function.
+ */
+ for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) {
+ for (xipp = sipp + 1; *xipp; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
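+ /*
+ * Worked example (illustrative): entries tagged (subsystem 2,
+ * order 9), (1, 5) and (2, 3) leave this sort as (1, 5),
+ * (2, 3), (2, 9) -- ascending by subsystem, ties broken by order.
+ */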
+
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ switch ((*sipp)->type) {
+ case SI_TYPE_DEFAULT:
+ /* no special processing*/
+ (*((*sipp)->func))((*sipp)->udata);
+ break;
+
+ case SI_TYPE_KTHREAD:
+#if !defined(SMP)
+ /* kernel thread*/
+ if (fork1(&proc0, RFFDG|RFPROC|RFMEM))
+ panic("fork kernel thread");
+ cpu_set_fork_handler(pfind(proc0.p_retval[0]),
+ (*sipp)->func, (*sipp)->udata);
+ break;
+#endif
+
+ case SI_TYPE_KPROCESS:
+ /* kernel thread*/
+ if (fork1(&proc0, RFFDG|RFPROC))
+ panic("fork kernel process");
+ cpu_set_fork_handler(pfind(proc0.p_retval[0]),
+ (*sipp)->func, (*sipp)->udata);
+ break;
+
+ default:
+ panic ("linker_file_sysinit: unrecognized init type");
+ }
+ }
+}
+
+static void
+linker_file_sysuninit(linker_file_t lf)
+{
+ struct linker_set* sysuninits;
+ struct sysinit** sipp;
+ struct sysinit** xipp;
+ struct sysinit* save;
+
+ KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
+ lf->filename));
+
+ sysuninits = (struct linker_set*)
+ linker_file_lookup_symbol(lf, "sysuninit_set", 0);
+
+ KLD_DPF(FILE, ("linker_file_sysuninit: SYSUNINITs %p\n", sysuninits));
+ if (!sysuninits)
+ return;
+
+ /*
+ * Perform a reverse bubble sort of the system initialization objects
+ * by their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the
+ * operation which ensures continued function.
+ */
+ for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) {
+ for (xipp = sipp + 1; *xipp; xipp++) {
+ if ((*sipp)->subsystem > (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order >= (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ switch ((*sipp)->type) {
+ case SI_TYPE_DEFAULT:
+ /* no special processing*/
+ (*((*sipp)->func))((*sipp)->udata);
+ break;
+
+ default:
+ panic("linker_file_sysuninit: unrecognized uninit type");
+ }
+ }
+}
+
+int
+linker_load_file(const char* filename, linker_file_t* result)
+{
+ linker_class_t lc;
+ linker_file_t lf;
+ int foundfile, error = 0;
+ char *koname = NULL;
+
+ lf = linker_find_file_by_name(filename);
+ if (lf) {
+ KLD_DPF(FILE, ("linker_load_file: file %s is already loaded, incrementing refs\n", filename));
+ *result = lf;
+ lf->refs++;
+ goto out;
+ }
+
+ koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
+ if (koname == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ sprintf(koname, "%s.ko", filename);
+ lf = NULL;
+ foundfile = 0;
+ for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) {
+ KLD_DPF(FILE, ("linker_load_file: trying to load %s as %s\n",
+ filename, lc->desc));
+
+ error = lc->ops->load_file(koname, &lf); /* First with .ko */
+ if (lf == NULL && error == ENOENT)
+ error = lc->ops->load_file(filename, &lf); /* Then try without */
+ /*
+ * If we got something other than ENOENT, then it exists but we cannot
+ * load it for some other reason.
+ */
+ if (error != ENOENT)
+ foundfile = 1;
+ if (lf) {
+ linker_file_sysinit(lf);
+
+ *result = lf;
+ error = 0;
+ goto out;
+ }
+ }
+ /*
+ * Less than ideal, but tells the user whether it failed to load or
+ * the module was not found.
+ */
+ if (foundfile)
+ error = ENOEXEC; /* Format not recognised (or unloadable) */
+ else
+ error = ENOENT; /* Nothing found */
+
+out:
+ if (koname)
+ free(koname, M_LINKER);
+ return error;
+}
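+
+/*
+ * Example (illustrative): linker_load_file("foo", &lf) asks each
+ * registered class for "foo.ko" first and plain "foo" second, so one
+ * call covers both the conventional module name and an exact file name;
+ * ENOENT from a class means "keep probing", anything else marks the
+ * file as found but unloadable.
+ */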
+
+linker_file_t
+linker_find_file_by_name(const char* filename)
+{
+ linker_file_t lf = 0;
+ char *koname;
+
+ koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
+ if (koname == NULL)
+ goto out;
+ sprintf(koname, "%s.ko", filename);
+
+ lockmgr(&lock, LK_SHARED, 0, curproc);
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ if (!strcmp(lf->filename, koname))
+ break;
+ if (!strcmp(lf->filename, filename))
+ break;
+ }
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+
+out:
+ if (koname)
+ free(koname, M_LINKER);
+ return lf;
+}
+
+linker_file_t
+linker_find_file_by_id(int fileid)
+{
+ linker_file_t lf = 0;
+
+ lockmgr(&lock, LK_SHARED, 0, curproc);
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link))
+ if (lf->id == fileid)
+ break;
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+
+ return lf;
+}
+
+linker_file_t
+linker_make_file(const char* pathname, void* priv, struct linker_file_ops* ops)
+{
+ linker_file_t lf = 0;
+ int namelen;
+ const char *filename;
+
+ filename = rindex(pathname, '/');
+ if (filename && filename[1])
+ filename++;
+ else
+ filename = pathname;
+
+ KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename));
+ lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc);
+ namelen = strlen(filename) + 1;
+ lf = malloc(sizeof(struct linker_file) + namelen, M_LINKER, M_WAITOK);
+ if (!lf)
+ goto out;
+ bzero(lf, sizeof(*lf));
+
+ lf->refs = 1;
+ lf->userrefs = 0;
+ lf->filename = (char*) (lf + 1);
+ strcpy(lf->filename, filename);
+ lf->id = next_file_id++;
+ lf->ndeps = 0;
+ lf->deps = NULL;
+ STAILQ_INIT(&lf->common);
+ TAILQ_INIT(&lf->modules);
+
+ lf->priv = priv;
+ lf->ops = ops;
+ TAILQ_INSERT_TAIL(&files, lf, link);
+
+out:
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+ return lf;
+}
+
+int
+linker_file_unload(linker_file_t file)
+{
+ module_t mod, next;
+ struct common_symbol* cp;
+ int error = 0;
+ int i;
+
+ KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
+ lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc);
+ if (file->refs == 1) {
+ KLD_DPF(FILE, ("linker_file_unload: file is unloading, informing modules\n"));
+ /*
+ * Inform any modules associated with this file.
+ */
+ for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+ next = module_getfnext(mod);
+
+ /*
+ * Give the module a chance to veto the unload.
+ */
+ if (error = module_unload(mod)) {
+ KLD_DPF(FILE, ("linker_file_unload: module %x vetoes unload\n",
+ mod));
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+ goto out;
+ }
+
+ module_release(mod);
+ }
+ }
+
+ file->refs--;
+ if (file->refs > 0) {
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+ goto out;
+ }
+
+ linker_file_sysuninit(file);
+
+ TAILQ_REMOVE(&files, file, link);
+ lockmgr(&lock, LK_RELEASE, 0, curproc);
+
+ for (i = 0; i < file->ndeps; i++)
+ linker_file_unload(file->deps[i]);
+ free(file->deps, M_LINKER);
+
+ for (cp = STAILQ_FIRST(&file->common); cp;
+ cp = STAILQ_FIRST(&file->common)) {
+ STAILQ_REMOVE(&file->common, cp, common_symbol, link);
+ free(cp, M_LINKER);
+ }
+
+ file->ops->unload(file);
+ free(file, M_LINKER);
+
+out:
+ return error;
+}
+
+int
+linker_file_add_dependancy(linker_file_t file, linker_file_t dep)
+{
+ linker_file_t* newdeps;
+
+ newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t*),
+ M_LINKER, M_WAITOK);
+ if (newdeps == NULL)
+ return ENOMEM;
+ bzero(newdeps, (file->ndeps + 1) * sizeof(linker_file_t*));
+
+ if (file->deps) {
+ bcopy(file->deps, newdeps, file->ndeps * sizeof(linker_file_t*));
+ free(file->deps, M_LINKER);
+ }
+ file->deps = newdeps;
+ file->deps[file->ndeps] = dep;
+ file->ndeps++;
+
+ return 0;
+}
+
+caddr_t
+linker_file_lookup_symbol(linker_file_t file, const char* name, int deps)
+{
+ linker_sym_t sym;
+ linker_symval_t symval;
+ linker_file_t lf;
+ caddr_t address;
+ size_t common_size = 0;
+ int i;
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n",
+ file, name, deps));
+
+ if (file->ops->lookup_symbol(file, name, &sym) == 0) {
+ file->ops->symbol_values(file, sym, &symval);
+ if (symval.value == 0)
+ /*
+ * For commons, first look them up in the dependencies and
+ * only allocate space if not found there.
+ */
+ common_size = symval.size;
+ else {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol.value=%x\n", symval.value));
+ return symval.value;
+ }
+ }
+
+ if (deps) {
+ for (i = 0; i < file->ndeps; i++) {
+ address = linker_file_lookup_symbol(file->deps[i], name, 0);
+ if (address) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: deps value=%x\n", address));
+ return address;
+ }
+ }
+
+ /* If we have not found it in the dependencies, search globally */
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ /* But skip the current file if it's on the list */
+ if (lf == file)
+ continue;
+ /* And skip the files we searched above */
+ for (i = 0; i < file->ndeps; i++)
+ if (lf == file->deps[i])
+ break;
+ if (i < file->ndeps)
+ continue;
+ address = linker_file_lookup_symbol(lf, name, 0);
+ if (address) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: global value=%x\n", address));
+ return address;
+ }
+ }
+ }
+
+ if (common_size > 0) {
+ /*
+ * This is a common symbol which was not found in the
+ * dependencies. We maintain a simple common symbol table in
+ * the file object.
+ */
+ struct common_symbol* cp;
+
+ for (cp = STAILQ_FIRST(&file->common); cp;
+ cp = STAILQ_NEXT(cp, link))
+ if (!strcmp(cp->name, name)) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: old common value=%x\n", cp->address));
+ return cp->address;
+ }
+
+ /*
+ * Round the symbol size up to align.
+ */
+ common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
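+ /*
+ * Worked example (assumes sizeof(int) == 4): a 5-byte common
+ * rounds up to 8 here; a 4-byte common stays at 4.
+ */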
+ cp = malloc(sizeof(struct common_symbol)
+ + common_size
+ + strlen(name) + 1,
+ M_LINKER, M_WAITOK);
+ if (!cp) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n"));
+ return 0;
+ }
+ bzero(cp, sizeof(struct common_symbol) + common_size + strlen(name)+ 1);
+
+ cp->address = (caddr_t) (cp + 1);
+ cp->name = cp->address + common_size;
+ strcpy(cp->name, name);
+ bzero(cp->address, common_size);
+ STAILQ_INSERT_TAIL(&file->common, cp, link);
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: new common value=%x\n", cp->address));
+ return cp->address;
+ }
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
+ return 0;
+}
+
+#ifdef DDB
+/*
+ * DDB Helpers. DDB has to look across multiple files with their own
+ * symbol tables and string tables.
+ *
+ * Note that we do not obey list locking protocols here. We really don't
+ * need DDB to hang because somebody's got the lock held. We'll take the
+ * chance that the files list is inconsistent instead.
+ */
+
+int
+linker_ddb_lookup(char *symstr, linker_sym_t *sym)
+{
+ linker_file_t lf;
+
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ if (lf->ops->lookup_symbol(lf, symstr, sym) == 0)
+ return 0;
+ }
+ return ENOENT;
+}
+
+int
+linker_ddb_search_symbol(caddr_t value, linker_sym_t *sym, long *diffp)
+{
+ linker_file_t lf;
+ u_long off = (u_long)value;
+ u_long diff, bestdiff;
+ linker_sym_t best;
+ linker_sym_t es;
+
+ best = 0;
+ bestdiff = off;
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ if (lf->ops->search_symbol(lf, value, &es, &diff) != 0)
+ continue;
+ if (es != 0 && diff < bestdiff) {
+ best = es;
+ bestdiff = diff;
+ }
+ if (bestdiff == 0)
+ break;
+ }
+ if (best) {
+ *sym = best;
+ *diffp = bestdiff;
+ return 0;
+ } else {
+ *sym = 0;
+ *diffp = off;
+ return ENOENT;
+ }
+}
+
+int
+linker_ddb_symbol_values(linker_sym_t sym, linker_symval_t *symval)
+{
+ linker_file_t lf;
+
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ if (lf->ops->symbol_values(lf, sym, symval) == 0)
+ return 0;
+ }
+ return ENOENT;
+}
+
+#endif
+
+/*
+ * Syscalls.
+ */
+
+int
+kldload(struct proc* p, struct kldload_args* uap)
+{
+ char* filename = NULL, *modulename;
+ linker_file_t lf;
+ int error = 0;
+
+ p->p_retval[0] = -1;
+
+ if (securelevel > 0)
+ return EPERM;
+
+ if (error = suser(p->p_ucred, &p->p_acflag))
+ return error;
+
+ filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL))
+ goto out;
+
+ /* Can't load more than one module with the same name */
+ modulename = rindex(filename, '/');
+ if (modulename == NULL)
+ modulename = filename;
+ if (linker_find_file_by_name(modulename)) {
+ error = EEXIST;
+ goto out;
+ }
+
+ if (error = linker_load_file(filename, &lf))
+ goto out;
+
+ lf->userrefs++;
+ p->p_retval[0] = lf->id;
+
+out:
+ if (filename)
+ free(filename, M_TEMP);
+ return error;
+}
+
+int
+kldunload(struct proc* p, struct kldunload_args* uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+ if (securelevel > 0)
+ return EPERM;
+
+ if (error = suser(p->p_ucred, &p->p_acflag))
+ return error;
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
+ if (lf->userrefs == 0) {
+ printf("linkerunload: attempt to unload file which was not loaded by user\n");
+ error = EBUSY;
+ goto out;
+ }
+ error = linker_file_unload(lf);
+ if (error)
+ goto out;
+ lf->userrefs--;
+ } else
+ error = ENOENT;
+
+out:
+ return error;
+}
+
+int
+kldfind(struct proc* p, struct kldfind_args* uap)
+{
+ char* filename = NULL, *modulename;
+ linker_file_t lf;
+ int error = 0;
+
+ p->p_retval[0] = -1;
+
+ filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL))
+ goto out;
+
+ modulename = rindex(filename, '/');
+ if (modulename == NULL)
+ modulename = filename;
+
+ lf = linker_find_file_by_name(modulename);
+ if (lf)
+ p->p_retval[0] = lf->id;
+ else
+ error = ENOENT;
+
+out:
+ if (filename)
+ free(filename, M_TEMP);
+ return error;
+}
+
+int
+kldnext(struct proc* p, struct kldnext_args* uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+ if (SCARG(uap, fileid) == 0) {
+ if (TAILQ_FIRST(&files))
+ p->p_retval[0] = TAILQ_FIRST(&files)->id;
+ else
+ p->p_retval[0] = 0;
+ return 0;
+ }
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ if (TAILQ_NEXT(lf, link))
+ p->p_retval[0] = TAILQ_NEXT(lf, link)->id;
+ else
+ p->p_retval[0] = 0;
+ } else
+ error = ENOENT;
+
+ return error;
+}
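+
+/*
+ * Usage sketch (illustrative): userland walks the file list by starting
+ * at fileid 0 and feeding each result back in until 0 comes back:
+ *
+ *	for (fid = kldnext(0); fid != 0; fid = kldnext(fid))
+ *		(void) kldstat(fid, &st);
+ */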
+
+int
+kldstat(struct proc* p, struct kldstat_args* uap)
+{
+ linker_file_t lf;
+ int error = 0;
+ int version;
+ struct kld_file_stat* stat;
+ int namelen;
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (!lf) {
+ error = ENOENT;
+ goto out;
+ }
+
+ stat = SCARG(uap, stat);
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if (error = copyin(&stat->version, &version, sizeof(version)))
+ goto out;
+ if (version != sizeof(struct kld_file_stat)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ namelen = strlen(lf->filename) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ if (error = copyout(lf->filename, &stat->name[0], namelen))
+ goto out;
+ if (error = copyout(&lf->refs, &stat->refs, sizeof(int)))
+ goto out;
+ if (error = copyout(&lf->id, &stat->id, sizeof(int)))
+ goto out;
+ if (error = copyout(&lf->address, &stat->address, sizeof(caddr_t)))
+ goto out;
+ if (error = copyout(&lf->size, &stat->size, sizeof(size_t)))
+ goto out;
+
+ p->p_retval[0] = 0;
+
+out:
+ return error;
+}
+
+int
+kldfirstmod(struct proc* p, struct kldfirstmod_args* uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf) {
+ if (TAILQ_FIRST(&lf->modules))
+ p->p_retval[0] = module_getid(TAILQ_FIRST(&lf->modules));
+ else
+ p->p_retval[0] = 0;
+ } else
+ error = ENOENT;
+
+ return error;
+}
+
+int
+kldsym(struct proc *p, struct kldsym_args *uap)
+{
+ char *symstr = NULL;
+ linker_sym_t sym;
+ linker_symval_t symval;
+ linker_file_t lf;
+ struct kld_sym_lookup lookup;
+ int error = 0;
+
+ if (error = copyin(SCARG(uap, data), &lookup, sizeof(lookup)))
+ goto out;
+ if (lookup.version != sizeof(lookup) || SCARG(uap, cmd) != KLDSYM_LOOKUP) {
+ error = EINVAL;
+ goto out;
+ }
+
+ symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if (error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL))
+ goto out;
+
+ if (SCARG(uap, fileid) != 0) {
+ lf = linker_find_file_by_id(SCARG(uap, fileid));
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 &&
+ lf->ops->symbol_values(lf, sym, &symval) == 0) {
+ lookup.symvalue = (u_long)symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, SCARG(uap, data), sizeof(lookup));
+ } else
+ error = ENOENT;
+ } else {
+ for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) {
+ if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 &&
+ lf->ops->symbol_values(lf, sym, &symval) == 0) {
+ lookup.symvalue = (u_long)symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, SCARG(uap, data), sizeof(lookup));
+ break;
+ }
+ }
+ if (!lf)
+ error = ENOENT;
+ }
+out:
+ if (symstr)
+ free(symstr, M_TEMP);
+ return error;
+}
+
+/*
+ * Preloaded module support
+ */
+
+static void
+linker_preload(void* arg)
+{
+ caddr_t modptr;
+ char *modname;
+ char *modtype;
+ linker_file_t lf;
+ linker_class_t lc;
+ int error;
+ struct linker_set *sysinits;
+ struct sysinit **sipp;
+ moduledata_t *moddata;
+
+ modptr = NULL;
+ while ((modptr = preload_search_next_name(modptr)) != NULL) {
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ if (modname == NULL) {
+ printf("Preloaded module at %p does not have a name!\n", modptr);
+ continue;
+ }
+ if (modtype == NULL) {
+ printf("Preloaded module at %p does not have a type!\n", modptr);
+ continue;
+ }
+ printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr);
+ lf = linker_find_file_by_name(modname);
+ if (lf) {
+ lf->userrefs++;
+ continue;
+ }
+ lf = NULL;
+ for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) {
+ error = lc->ops->load_file(modname, &lf);
+ if (error) {
+ lf = NULL;
+ break;
+ }
+ }
+ if (lf) {
+ lf->userrefs++;
+
+ sysinits = (struct linker_set*)
+ linker_file_lookup_symbol(lf, "sysinit_set", 0);
+ if (sysinits) {
+ /* HACK ALERT!
+ * This is to set the sysinit moduledata so that the module
+ * can attach itself to the correct containing file.
+ * The sysinit could be run at *any* time.
+ */
+ for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) {
+ if ((*sipp)->func == module_register_init) {
+ moddata = (*sipp)->udata;
+ moddata->_file = lf;
+ }
+ }
+ sysinit_add((struct sysinit **)sysinits->ls_items);
+ }
+ }
+ }
+}
+
+SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0);
+
+/*
+ * Search for a not-loaded module by name.
+ *
+ * Modules may be found in the following locations:
+ *
+ * - preloaded (result is just the module name)
+ * - on disk (result is full path to module)
+ *
+ * If the module name is qualified in any way (contains path, etc.)
+ * then we simply return a copy of it.
+ *
+ * The search path can be manipulated via sysctl. Note that we use the ';'
+ * character as a separator to be consistent with the bootloader.
+ */
+
+static char linker_path[MAXPATHLEN + 1] = "/;/boot/;/modules/";
+
+SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
+ sizeof(linker_path), "module load search path");
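+
+/*
+ * Example (illustrative): with the default path above, an unqualified
+ * name "foo" is probed as "/foo", then "/boot/foo", then "/modules/foo";
+ * the first ';'-separated component yielding an openable regular file
+ * wins, and a name containing '/' short-circuits the search entirely.
+ */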
+
+static char *
+linker_strdup(const char *str)
+{
+ char *result;
+
+ if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL)
+ strcpy(result, str);
+ return(result);
+}
+
+char *
+linker_search_path(const char *name)
+{
+ struct nameidata nd;
+ struct proc *p = curproc; /* XXX */
+ char *cp, *ep, *result;
+ int error;
+ enum vtype type;
+
+ /* qualified at all? */
+ if (index(name, '/'))
+ return(linker_strdup(name));
+
+ /* traverse the linker path */
+ cp = linker_path;
+ for (;;) {
+
+ /* find the end of this component */
+ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++)
+ ;
+ result = malloc((strlen(name) + (ep - cp) + 1), M_LINKER, M_WAITOK);
+ if (result == NULL) /* actually ENOMEM */
+ return(NULL);
+
+ strncpy(result, cp, ep - cp);
+ strcpy(result + (ep - cp), name);
+
+ /*
+ * Attempt to open the file, and return the path if we succeed and it's
+ * a regular file.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, p);
+ error = vn_open(&nd, FREAD, 0);
+ if (error == 0) {
+ type = nd.ni_vp->v_type;
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ vn_close(nd.ni_vp, FREAD, p->p_ucred, p);
+ if (type == VREG)
+ return(result);
+ }
+ free(result, M_LINKER);
+
+ if (*ep == 0)
+ break;
+ cp = ep + 1;
+ }
+ return(NULL);
+}
diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c
new file mode 100644
index 0000000..e5ea629
--- /dev/null
+++ b/sys/kern/kern_lkm.c
@@ -0,0 +1,838 @@
+/*-
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1994 Christopher G. Demetriou
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Terrence R. Lambert.
+ * 4. The name Terrence R. Lambert may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_lkm.c,v 1.59 1998/11/10 09:12:40 peter Exp $
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/exec.h>
+#include <sys/lkm.h>
+#include <sys/vnode.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+
+#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */
+
+#define LKM_ALLOC 0x01
+#define LKM_WANT 0x02
+
+#define LKMS_IDLE 0x00
+#define LKMS_RESERVED 0x01
+#define LKMS_LOADING 0x02
+#define LKMS_LOADED 0x04
+#define LKMS_UNLOADING 0x08
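+
+/*
+ * Illustrative lifecycle (derived from lkmcioctl() below): a load runs
+ * IDLE -> RESERVED (LMRESERV) -> LOADING (partial LMLOADBUFs) ->
+ * LOADED (final LMLOADBUF) -> IDLE (LMREADY); LMUNLOAD passes through
+ * UNLOADING before lkmunreserve() returns the state to IDLE.
+ */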
+
+static int lkm_v = 0;
+static int lkm_state = LKMS_IDLE;
+
+#ifndef MAXLKMS
+#define MAXLKMS 20
+#endif
+
+static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */
+static struct lkm_table *curp; /* global for in-progress ops */
+
+static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd));
+static void lkmunreserve __P((void));
+
+static d_open_t lkmcopen;
+static d_close_t lkmcclose;
+static d_ioctl_t lkmcioctl;
+
+#define CDEV_MAJOR 32
+static struct cdevsw lkmc_cdevsw =
+ { lkmcopen, lkmcclose, noread, nowrite, /*32*/
+ lkmcioctl, nostop, nullreset, nodevtotty,
+ seltrue, nommap, NULL, "lkm", NULL, -1 };
+
+
+/*ARGSUSED*/
+static int
+lkmcopen(dev, flag, devtype, p)
+ dev_t dev;
+ int flag;
+ int devtype;
+ struct proc *p;
+{
+ int error;
+
+ if (minor(dev) != 0)
+ return(ENXIO); /* bad minor # */
+
+ /*
+ * Use of the loadable kernel module device must be exclusive; we
+ * may try to remove this restriction later, but it's really no
+ * hardship.
+ */
+ while (lkm_v & LKM_ALLOC) {
+ if (flag & FNONBLOCK) /* don't hang */
+ return(EBUSY);
+ lkm_v |= LKM_WANT;
+ /*
+ * Sleep pending unlock; we use tsleep() to allow
+ * an alarm out of the open.
+ */
+ error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0);
+ if (error)
+ return(error); /* leave LKM_WANT set -- no problem */
+ }
+ lkm_v |= LKM_ALLOC;
+
+ return(0); /* pseudo-device open */
+}
+
+/*
+ * Unreserve the memory associated with the current loaded module; done on
+ * a coerced close of the lkm device (close on premature exit of modload)
+ * or explicitly by modload as a result of a link failure.
+ */
+static void
+lkmunreserve()
+{
+
+ if (lkm_state == LKMS_IDLE)
+ return;
+
+ /*
+ * Actually unreserve the memory
+ */
+ if (curp && curp->area) {
+ kmem_free(kernel_map, curp->area, curp->size);/**/
+ curp->area = 0;
+ if (curp->private.lkm_any != NULL)
+ curp->private.lkm_any = NULL;
+ }
+
+ lkm_state = LKMS_IDLE;
+}
+
+static int
+lkmcclose(dev, flag, mode, p)
+ dev_t dev;
+ int flag;
+ int mode;
+ struct proc *p;
+{
+
+ if (!(lkm_v & LKM_ALLOC)) {
+#ifdef DEBUG
+ printf("LKM: close before open!\n");
+#endif /* DEBUG */
+ return(EBADF);
+ }
+
+ /* do this before waking the herd... */
+ if (curp && !curp->used) {
+ /*
+ * If we close before setting used, we have aborted
+ * by way of error or by way of close-on-exit from
+ * a premature exit of "modload".
+ */
+ lkmunreserve(); /* coerce state to LKM_IDLE */
+ }
+
+ lkm_v &= ~LKM_ALLOC;
+ wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */
+
+ return(0); /* pseudo-device closed */
+}
+
+/*ARGSUSED*/
+static int
+lkmcioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ int err = 0;
+ int i;
+ struct lmc_resrv *resrvp;
+ struct lmc_loadbuf *loadbufp;
+ struct lmc_unload *unloadp;
+ struct lmc_stat *statp;
+ char istr[MAXLKMNAME];
+
+ switch(cmd) {
+ case LMRESERV: /* reserve pages for a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ resrvp = (struct lmc_resrv *)data;
+
+ /*
+ * Find a free slot.
+ */
+ for (i = 0; i < MAXLKMS; i++)
+ if (!lkmods[i].used)
+ break;
+ if (i == MAXLKMS) {
+ err = ENOMEM; /* no slots available */
+ break;
+ }
+ curp = &lkmods[i];
+ curp->id = i; /* self reference slot offset */
+
+ resrvp->slot = i; /* return slot */
+
+ /*
+ * Get memory for module
+ */
+ curp->size = resrvp->size;
+
+ curp->area = kmem_alloc(kernel_map, curp->size);/**/
+
+ curp->offset = 0; /* load offset */
+
+ resrvp->addr = curp->area; /* ret kernel addr */
+
+#ifdef DEBUG
+ printf("LKM: LMRESERV (actual = 0x%08lx)\n", curp->area);
+ printf("LKM: LMRESERV (adjusted = 0x%08lx)\n",
+ trunc_page(curp->area));
+#endif /* DEBUG */
+ lkm_state = LKMS_RESERVED;
+ break;
+
+ case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ loadbufp = (struct lmc_loadbuf *)data;
+ i = loadbufp->cnt;
+ if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING)
+ || i < 0
+ || i > MODIOBUF
+ || i > curp->size - curp->offset) {
+ err = ENOMEM;
+ break;
+ }
+
+ /* copy in buffer full of data */
+ err = copyin((caddr_t)loadbufp->data,
+ (caddr_t)(uintptr_t)(curp->area + curp->offset), i);
+ if (err)
+ break;
+
+ if ((curp->offset + i) < curp->size) {
+ lkm_state = LKMS_LOADING;
+#ifdef DEBUG
+ printf(
+ "LKM: LMLOADBUF (loading @ %lu of %lu, i = %d)\n",
+ curp->offset, curp->size, i);
+#endif /* DEBUG */
+ } else {
+ lkm_state = LKMS_LOADED;
+#ifdef DEBUG
+ printf("LKM: LMLOADBUF (loaded)\n");
+#endif /* DEBUG */
+ }
+ curp->offset += i;
+ break;
+
+ case LMUNRESRV: /* discard reserved pages for a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ lkmunreserve(); /* coerce state to LKM_IDLE */
+#ifdef DEBUG
+ printf("LKM: LMUNRESERV\n");
+#endif /* DEBUG */
+ break;
+
+ case LMREADY: /* module loaded: call entry */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ switch (lkm_state) {
+ case LKMS_LOADED:
+ break;
+ case LKMS_LOADING:
+ /* The remainder must be bss, so we clear it */
+ bzero((caddr_t)(uintptr_t)(curp->area + curp->offset),
+ curp->size - curp->offset);
+ break;
+ default:
+
+#ifdef DEBUG
+ printf("lkm_state is %02x\n", lkm_state);
+#endif /* DEBUG */
+ return ENXIO;
+ }
+
+ /* XXX gack */
+ curp->entry = (int (*) __P((struct lkm_table *, int, int)))
+ (*(uintfptr_t *)data);
+
+ /* call entry(load)... (assigns "private" portion) */
+ err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION);
+ if (err) {
+ /*
+ * Module may refuse loading or may have a
+ * version mismatch...
+ */
+ lkm_state = LKMS_UNLOADING; /* for lkmunreserve */
+ lkmunreserve(); /* free memory */
+ curp->used = 0; /* free slot */
+ break;
+ }
+ /*
+ * It's possible for a user to load a module that doesn't
+ * initialize itself correctly. (You can even get away with
+ * using it for a while.) Unfortunately, we are faced with
+ * the following problems:
+ * - we can't tell a good module from a bad one until
+ * after we've run its entry function (if the private
+ * section is uninitialized after we return from the
+ * entry, then something's fishy)
+ * - now that we've called the entry function, we can't
+ * forcibly unload the module without risking a crash
+ * - since we don't know what the module's entry function
+ * did, we can't easily clean up the mess it may have
+ * made, so we can't know just how unstable the system
+ * may be
+ * So, being stuck between a rock and a hard place, we
+ * have no choice but to do this...
+ */
+ if (curp->private.lkm_any == NULL)
+ panic("loadable module initialization failed");
+
+ curp->used = 1;
+#ifdef DEBUG
+ printf("LKM: LMREADY\n");
+#endif /* DEBUG */
+ lkm_state = LKMS_IDLE;
+ break;
+
+ case LMUNLOAD: /* unload a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ unloadp = (struct lmc_unload *)data;
+
+ if ((i = unloadp->id) == -1) { /* unload by name */
+ /*
+ * Copy name and lookup id from all loaded
+ * modules. May fail.
+ */
+ err = copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL);
+ if (err)
+ break;
+
+ /*
+ * look up id...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ if (!lkmods[i].used)
+ continue;
+ if (!strcmp(istr,
+ lkmods[i].private.lkm_any->lkm_name))
+ break;
+ }
+ }
+
+ /*
+ * Range check the value; on failure, return EINVAL
+ */
+ if (i < 0 || i >= MAXLKMS) {
+ err = EINVAL;
+ break;
+ }
+
+ curp = &lkmods[i];
+
+ if (!curp->used) {
+ err = ENOENT;
+ break;
+ }
+
+ /* call entry(unload) */
+ if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) {
+ err = EBUSY;
+ break;
+ }
+
+ lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */
+ lkmunreserve(); /* free memory */
+ curp->used = 0; /* free slot */
+ break;
+
+ case LMSTAT: /* stat a module by id/name */
+ /* allow readers and writers to stat */
+
+ statp = (struct lmc_stat *)data;
+
+ if ((i = statp->id) == -1) { /* stat by name */
+ /*
+ * Copy name and lookup id from all loaded
+ * modules.
+ */
+ copystr(statp->name, istr, MAXLKMNAME-1, NULL);
+ /*
+ * look up id...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ if (!lkmods[i].used)
+ continue;
+ if (!strcmp(istr,
+ lkmods[i].private.lkm_any->lkm_name))
+ break;
+ }
+
+ if (i == MAXLKMS) { /* Not found */
+ err = ENOENT;
+ break;
+ }
+ }
+
+ /*
+ * Range check the value; on failure, return EINVAL
+ */
+ if (i < 0 || i >= MAXLKMS) {
+ err = EINVAL;
+ break;
+ }
+
+ curp = &lkmods[i];
+
+ if (!curp->used) { /* Not found */
+ err = ENOENT;
+ break;
+ }
+
+ /*
+ * Copy out stat information for this module...
+ */
+ statp->id = curp->id;
+ statp->offset = curp->private.lkm_any->lkm_offset;
+ statp->type = curp->private.lkm_any->lkm_type;
+ statp->area = curp->area;
+ statp->size = curp->size / PAGESIZE;
+ statp->private = (uintptr_t)curp->private.lkm_any;
+ statp->ver = curp->private.lkm_any->lkm_ver;
+ copystr(curp->private.lkm_any->lkm_name,
+ statp->name,
+ MAXLKMNAME - 2,
+ NULL);
+
+ break;
+
+ default: /* bad ioctl()... */
+ err = ENOTTY;
+ break;
+ }
+
+ return (err);
+}
+
+int
+lkmexists(lkmtp)
+ struct lkm_table *lkmtp;
+{
+ int i;
+
+ /*
+ * see if name exists...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ /*
+ * An unused module and the one we are testing are not
+ * considered.
+ */
+ if (!lkmods[i].used || &lkmods[i] == lkmtp)
+ continue;
+ if (!strcmp(lkmtp->private.lkm_any->lkm_name,
+ lkmods[i].private.lkm_any->lkm_name))
+ return(1); /* already loaded... */
+ }
+
+ return(0); /* module not loaded... */
+}
+
+/*
+ * For the loadable system call described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_syscall(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_syscall *args = lkmtp->private.lkm_syscall;
+ int i;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+
+ if (args->lkm_offset == LKM_ANON)
+ i = NO_SYSCALL;
+ else
+ i = args->lkm_offset;
+
+ err = syscall_register(&i, args->lkm_sysent,
+ &(args->lkm_oldent));
+ if (err)
+ return(err);
+
+ /* done! */
+ args->lkm_offset = i; /* slot in sysent[] */
+
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+
+ err = syscall_deregister(&i, &(args->lkm_oldent));
+ if (err)
+ return(err);
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+
+/*
+ * For the loadable virtual file system described by the structure pointed
+ * to by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_vfs(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_vfs *args = lkmtp->private.lkm_vfs;
+ struct vfsconf *vfc = args->lkm_vfsconf;
+ int error, i;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+
+ for(i = 0; args->lkm_vnodeops->ls_items[i]; i++)
+ vfs_add_vnodeops((void*)args->lkm_vnodeops->ls_items[i]);
+ error = vfs_register(vfc);
+ if (error)
+ return(error);
+
+ args->lkm_offset = vfc->vfc_typenum;
+
+ /* done! */
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+
+ error = vfs_unregister(vfc);
+ if (error)
+ return(error);
+
+ for(i = 0; args->lkm_vnodeops->ls_items[i]; i++)
+ vfs_rm_vnodeops((void*)args->lkm_vnodeops->ls_items[i]);
+
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+ return (0);
+}
+
+/*
+ * For the loadable device driver described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_dev(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_dev *args = lkmtp->private.lkm_dev;
+ int i;
+ dev_t descrip;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ switch(args->lkm_devtype) {
+ case LM_DT_CHAR:
+ if ((i = args->lkm_offset) == LKM_ANON)
+ descrip = (dev_t) -1;
+ else
+ descrip = makedev(args->lkm_offset,0);
+ if ((err = cdevsw_add(&descrip, args->lkm_dev.cdev,
+ &(args->lkm_olddev.cdev))) != 0) {
+ break;
+ }
+ args->lkm_offset = major(descrip);
+ break;
+
+ default:
+ err = ENODEV;
+ break;
+ }
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+ descrip = makedev(i,0);
+
+ switch(args->lkm_devtype) {
+ case LM_DT_CHAR:
+ /* replace current slot contents with old contents */
+ cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL);
+ break;
+
+ default:
+ err = ENODEV;
+ break;
+ }
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+
+#ifdef STREAMS
+/*
+ * For the loadable streams module described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_strmod(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_strmod *args = lkmtp->private.lkm_strmod;
+ int i;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ break;
+
+ case LKM_E_UNLOAD:
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+#endif /* STREAMS */
+
+/*
+ * For the loadable execution class described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_exec(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_exec *args = lkmtp->private.lkm_exec;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ if (args->lkm_offset != LKM_ANON) { /* auto */
+ err = EINVAL;
+ break;
+ }
+
+ err = exec_register(args->lkm_exec);
+
+ /* done! */
+ args->lkm_offset = 0;
+
+ break;
+
+ case LKM_E_UNLOAD:
+
+ err = exec_unregister(args->lkm_exec);
+
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+ return(err);
+}
+
+/*
+ * This code handles the per-module type "wiring-in" of loadable modules
+ * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring
+ * is assumed to be done in their entry routines internal to the module
+ * itself.
+ */
+int
+lkmdispatch(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ int err = 0; /* default = success */
+
+ switch(lkmtp->private.lkm_any->lkm_type) {
+ case LM_SYSCALL:
+ err = _lkm_syscall(lkmtp, cmd);
+ break;
+
+ case LM_VFS:
+ err = _lkm_vfs(lkmtp, cmd);
+ break;
+
+ case LM_DEV:
+ err = _lkm_dev(lkmtp, cmd);
+ break;
+
+#ifdef STREAMS
+ case LM_STRMOD:
+ {
+ struct lkm_strmod *args = lkmtp->private.lkm_strmod;
+ }
+ break;
+
+#endif /* STREAMS */
+
+ case LM_EXEC:
+ err = _lkm_exec(lkmtp, cmd);
+ break;
+
+ case LM_MISC: /* ignore content -- no "misc-specific" procedure */
+ if (lkmexists(lkmtp))
+ err = EEXIST;
+ break;
+
+ default:
+ err = ENXIO; /* unknown type */
+ break;
+ }
+
+ return(err);
+}
+
+int
+lkm_nullcmd(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+
+ return (0);
+}
+
+#ifdef DEVFS
+static void *lkmc_devfs_token;
+#endif
+
+static int
+lkm_modevent(module_t mod, int type, void *data)
+{
+ dev_t dev;
+ static struct cdevsw *oldcdevsw;
+
+ switch (type) {
+ case MOD_LOAD:
+ dev = makedev(CDEV_MAJOR, 0);
+ cdevsw_add(&dev, &lkmc_cdevsw, &oldcdevsw);
+#ifdef DEVFS
+ lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0644,
+ "lkm");
+#endif
+ break;
+ case MOD_UNLOAD:
+#ifdef DEVFS
+ devfs_remove_dev(lkmc_devfs_token);
+#endif
+ dev = makedev(CDEV_MAJOR, 0); /* dev is not set on the unload path */
+ cdevsw_add(&dev, oldcdevsw, NULL);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+static moduledata_t lkm_mod = {
+ "lkm",
+ lkm_modevent,
+ NULL
+};
+DECLARE_MODULE(lkm, lkm_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
new file mode 100644
index 0000000..e832acf
--- /dev/null
+++ b/sys/kern/kern_lock.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Copyright (C) 1997
+ * John S. Dyson. All rights reserved.
+ *
+ * This code contains ideas from software contributed to Berkeley by
+ * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating
+ * System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95
+ * $Id: kern_lock.c,v 1.22 1999/01/10 01:58:24 eivind Exp $
+ */
+
+#include "opt_lint.h"
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/systm.h>
+
+/*
+ * Locking primitives implementation.
+ * Locks provide shared/exclusive synchronization.
+ */
+
+#ifdef SIMPLELOCK_DEBUG
+#define COUNT(p, x) if (p) (p)->p_locks += (x)
+#else
+#define COUNT(p, x)
+#endif
+
+#define LOCK_WAIT_TIME 100
+#define LOCK_SAMPLE_WAIT 7
+
+#if defined(DIAGNOSTIC)
+#define LOCK_INLINE
+#else
+#define LOCK_INLINE __inline
+#endif
+
+#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \
+ LK_SHARE_NONZERO | LK_WAIT_NONZERO)
+
+static int acquire(struct lock *lkp, int extflags, int wanted);
+static int apause(struct lock *lkp, int flags);
+static int acquiredrain(struct lock *lkp, int extflags) ;
+
+static LOCK_INLINE void
+sharelock(struct lock *lkp, int incr) {
+ lkp->lk_flags |= LK_SHARE_NONZERO;
+ lkp->lk_sharecount += incr;
+}
+
+static LOCK_INLINE void
+shareunlock(struct lock *lkp, int decr) {
+
+ KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr"));
+
+ if (lkp->lk_sharecount == decr) {
+ lkp->lk_flags &= ~LK_SHARE_NONZERO;
+ if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) {
+ wakeup(lkp);
+ }
+ lkp->lk_sharecount = 0;
+ } else {
+ lkp->lk_sharecount -= decr;
+ }
+}
+
+/*
+ * This is the waitloop optimization, and note for this to work
+ * simple_lock and simple_unlock should be subroutines to avoid
+ * optimization troubles.
+ */
+static int
+apause(struct lock *lkp, int flags) {
+ int lock_wait;
+ lock_wait = LOCK_WAIT_TIME;
+ for (; lock_wait > 0; lock_wait--) {
+ int i;
+ if ((lkp->lk_flags & flags) == 0)
+ return 0;
+ simple_unlock(&lkp->lk_interlock);
+ for (i = LOCK_SAMPLE_WAIT; i > 0; i--) {
+ if ((lkp->lk_flags & flags) == 0) {
+ simple_lock(&lkp->lk_interlock);
+ if ((lkp->lk_flags & flags) == 0)
+ return 0;
+ break;
+ }
+ }
+ }
+ return 1;
+}
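+
+/*
+ * Illustrative note: with the constants above, apause() spins through at
+ * most LOCK_WAIT_TIME (100) rounds of LOCK_SAMPLE_WAIT (7) samples,
+ * returning 0 as soon as the wanted flags are observed clear and 1 to
+ * tell acquire() to fall back to tsleep().
+ */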
+
+static int
+acquire(struct lock *lkp, int extflags, int wanted) {
+ int s, error;
+
+ if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) {
+ return EBUSY;
+ }
+
+ if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) {
+ error = apause(lkp, wanted);
+ if (error == 0)
+ return 0;
+ }
+
+ s = splhigh();
+ while ((lkp->lk_flags & wanted) != 0) {
+ lkp->lk_flags |= LK_WAIT_NONZERO;
+ lkp->lk_waitcount++;
+ simple_unlock(&lkp->lk_interlock);
+ error = tsleep(lkp, lkp->lk_prio, lkp->lk_wmesg, lkp->lk_timo);
+ simple_lock(&lkp->lk_interlock);
+ if (lkp->lk_waitcount == 1) {
+ lkp->lk_flags &= ~LK_WAIT_NONZERO;
+ lkp->lk_waitcount = 0;
+ } else {
+ lkp->lk_waitcount--;
+ }
+ if (error) {
+ splx(s);
+ return error;
+ }
+ if (extflags & LK_SLEEPFAIL) {
+ splx(s);
+ return ENOLCK;
+ }
+ }
+ splx(s);
+ return 0;
+}
+
+/*
+ * Set, change, or release a lock.
+ *
+ * Shared requests increment the shared count. Exclusive requests set the
+ * LK_WANT_EXCL flag (preventing further shared locks), and wait for already
+ * accepted shared locks and shared-to-exclusive upgrades to go away.
+ */
+int
+#ifndef DEBUG_LOCKS
+lockmgr(lkp, flags, interlkp, p)
+#else
+debuglockmgr(lkp, flags, interlkp, p, name, file, line)
+#endif
+ struct lock *lkp;
+ u_int flags;
+ struct simplelock *interlkp;
+ struct proc *p;
+#ifdef DEBUG_LOCKS
+ const char *name; /* Name of lock function */
+ const char *file; /* Name of file call is from */
+ int line; /* Line number in file */
+#endif
+{
+ int error;
+ pid_t pid;
+ int extflags;
+
+ error = 0;
+ if (p == NULL)
+ pid = LK_KERNPROC;
+ else
+ pid = p->p_pid;
+
+ simple_lock(&lkp->lk_interlock);
+ if (flags & LK_INTERLOCK)
+ simple_unlock(interlkp);
+
+ extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK;
+
+ switch (flags & LK_TYPE_MASK) {
+
+ case LK_SHARED:
+ if (lkp->lk_lockholder != pid) {
+ error = acquire(lkp, extflags,
+ LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE);
+ if (error)
+ break;
+ sharelock(lkp, 1);
+ COUNT(p, 1);
+ break;
+ }
+ /*
+ * We hold an exclusive lock, so downgrade it to shared.
+ * An alternative would be to fail with EDEADLK.
+ */
+ sharelock(lkp, 1);
+ COUNT(p, 1);
+ /* fall into downgrade */
+
+ case LK_DOWNGRADE:
+#if !defined(MAX_PERF)
+ if (lkp->lk_lockholder != pid || lkp->lk_exclusivecount == 0)
+ panic("lockmgr: not holding exclusive lock");
+#endif
+ sharelock(lkp, lkp->lk_exclusivecount);
+ lkp->lk_exclusivecount = 0;
+ lkp->lk_flags &= ~LK_HAVE_EXCL;
+ lkp->lk_lockholder = LK_NOPROC;
+ if (lkp->lk_waitcount)
+ wakeup((void *)lkp);
+ break;
+
+ case LK_EXCLUPGRADE:
+ /*
+ * If another process is ahead of us to get an upgrade,
+ * then we want to fail rather than have an intervening
+ * exclusive access.
+ */
+ if (lkp->lk_flags & LK_WANT_UPGRADE) {
+ shareunlock(lkp, 1);
+ COUNT(p, -1);
+ error = EBUSY;
+ break;
+ }
+ /* fall into normal upgrade */
+
+ case LK_UPGRADE:
+ /*
+ * Upgrade a shared lock to an exclusive one. If another
+ * shared lock has already requested an upgrade to an
+ * exclusive lock, our shared lock is released and an
+ * exclusive lock is requested (which will be granted
+ * after the upgrade). If we return an error, the file
+ * will always be unlocked.
+ */
+#if !defined(MAX_PERF)
+ if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0))
+ panic("lockmgr: upgrade exclusive lock");
+#endif
+ shareunlock(lkp, 1);
+ COUNT(p, -1);
+ /*
+ * If we are just polling, check to see if we will block.
+ */
+ if ((extflags & LK_NOWAIT) &&
+ ((lkp->lk_flags & LK_WANT_UPGRADE) ||
+ lkp->lk_sharecount > 1)) {
+ error = EBUSY;
+ break;
+ }
+ if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) {
+ /*
+ * We are first shared lock to request an upgrade, so
+ * request upgrade and wait for the shared count to
+ * drop to zero, then take exclusive lock.
+ */
+ lkp->lk_flags |= LK_WANT_UPGRADE;
+ error = acquire(lkp, extflags, LK_SHARE_NONZERO);
+ lkp->lk_flags &= ~LK_WANT_UPGRADE;
+
+ if (error)
+ break;
+ lkp->lk_flags |= LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+#if !defined(MAX_PERF)
+ if (lkp->lk_exclusivecount != 0)
+ panic("lockmgr: non-zero exclusive count");
+#endif
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ COUNT(p, 1);
+ break;
+ }
+ /*
+ * Someone else has requested upgrade. Release our shared
+ * lock, awaken upgrade requestor if we are the last shared
+ * lock, then request an exclusive lock.
+ */
+ if ((lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) ==
+ LK_WAIT_NONZERO)
+ wakeup((void *)lkp);
+ /* fall into exclusive request */
+
+ case LK_EXCLUSIVE:
+ if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) {
+ /*
+ * Recursive lock.
+ */
+#if !defined(MAX_PERF)
+ if ((extflags & LK_CANRECURSE) == 0)
+ panic("lockmgr: locking against myself");
+#endif
+ lkp->lk_exclusivecount++;
+ COUNT(p, 1);
+ break;
+ }
+ /*
+ * If we are just polling, check to see if we will sleep.
+ */
+ if ((extflags & LK_NOWAIT) &&
+ (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) {
+ error = EBUSY;
+ break;
+ }
+ /*
+ * Try to acquire the want_exclusive flag.
+ */
+ error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL));
+ if (error)
+ break;
+ lkp->lk_flags |= LK_WANT_EXCL;
+ /*
+ * Wait for shared locks and upgrades to finish.
+ */
+ error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO);
+ lkp->lk_flags &= ~LK_WANT_EXCL;
+ if (error)
+ break;
+ lkp->lk_flags |= LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+#if !defined(MAX_PERF)
+ if (lkp->lk_exclusivecount != 0)
+ panic("lockmgr: non-zero exclusive count");
+#endif
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ COUNT(p, 1);
+ break;
+
+ case LK_RELEASE:
+ if (lkp->lk_exclusivecount != 0) {
+#if !defined(MAX_PERF)
+ if (pid != lkp->lk_lockholder)
+ panic("lockmgr: pid %d, not %s %d unlocking",
+ pid, "exclusive lock holder",
+ lkp->lk_lockholder);
+#endif
+ COUNT(p, -1);
+ if (lkp->lk_exclusivecount == 1) {
+ lkp->lk_flags &= ~LK_HAVE_EXCL;
+ lkp->lk_lockholder = LK_NOPROC;
+ lkp->lk_exclusivecount = 0;
+ } else {
+ lkp->lk_exclusivecount--;
+ }
+ } else if (lkp->lk_flags & LK_SHARE_NONZERO) {
+ shareunlock(lkp, 1);
+ COUNT(p, -1);
+ }
+ if (lkp->lk_flags & LK_WAIT_NONZERO)
+ wakeup((void *)lkp);
+ break;
+
+ case LK_DRAIN:
+ /*
+ * Check that we do not already hold the lock, as it can
+ * never drain if we do. Unfortunately, we have no way to
+ * check for holding a shared lock, but at least we can
+ * check for an exclusive one.
+ */
+#if !defined(MAX_PERF)
+ if (lkp->lk_lockholder == pid)
+ panic("lockmgr: draining against myself");
+#endif
+
+ error = acquiredrain(lkp, extflags);
+ if (error)
+ break;
+ lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL;
+ lkp->lk_lockholder = pid;
+ lkp->lk_exclusivecount = 1;
+#if defined(DEBUG_LOCKS)
+ lkp->lk_filename = file;
+ lkp->lk_lineno = line;
+ lkp->lk_lockername = name;
+#endif
+ COUNT(p, 1);
+ break;
+
+ default:
+#if !defined(MAX_PERF)
+ simple_unlock(&lkp->lk_interlock);
+ panic("lockmgr: unknown locktype request %d",
+ flags & LK_TYPE_MASK);
+#endif
+ /* NOTREACHED */
+ }
+ if ((lkp->lk_flags & LK_WAITDRAIN) &&
+ (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE |
+ LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) {
+ lkp->lk_flags &= ~LK_WAITDRAIN;
+ wakeup((void *)&lkp->lk_flags);
+ }
+ simple_unlock(&lkp->lk_interlock);
+ return (error);
+}
+
+static int
+acquiredrain(struct lock *lkp, int extflags)
+{
+ int error;
+
+ if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) {
+ return EBUSY;
+ }
+
+ error = apause(lkp, LK_ALL);
+ if (error == 0)
+ return 0;
+
+ while (lkp->lk_flags & LK_ALL) {
+ lkp->lk_flags |= LK_WAITDRAIN;
+ simple_unlock(&lkp->lk_interlock);
+ error = tsleep(&lkp->lk_flags, lkp->lk_prio,
+ lkp->lk_wmesg, lkp->lk_timo);
+ simple_lock(&lkp->lk_interlock);
+ if (error)
+ return error;
+ if (extflags & LK_SLEEPFAIL) {
+ return ENOLCK;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Initialize a lock; required before use.
+ */
+void
+lockinit(lkp, prio, wmesg, timo, flags)
+ struct lock *lkp;
+ int prio;
+ char *wmesg;
+ int timo;
+ int flags;
+{
+
+ simple_lock_init(&lkp->lk_interlock);
+ lkp->lk_flags = (flags & LK_EXTFLG_MASK);
+ lkp->lk_sharecount = 0;
+ lkp->lk_waitcount = 0;
+ lkp->lk_exclusivecount = 0;
+ lkp->lk_prio = prio;
+ lkp->lk_wmesg = wmesg;
+ lkp->lk_timo = timo;
+ lkp->lk_lockholder = LK_NOPROC;
+}
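+
+/*
+ * Usage sketch (editor's note; all names below are hypothetical): a lock
+ * is initialized once with lockinit() and critical sections are then
+ * bracketed through lockmgr():
+ *
+ *	struct lock mylock;
+ *
+ *	lockinit(&mylock, PVFS, "mylck", 0, 0);
+ *	...
+ *	lockmgr(&mylock, LK_EXCLUSIVE, NULL, curproc);
+ *	... code that needs the exclusive lock ...
+ *	lockmgr(&mylock, LK_RELEASE, NULL, curproc);
+ */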
+
+/*
+ * Determine the status of a lock.
+ */
+int
+lockstatus(lkp)
+ struct lock *lkp;
+{
+ int lock_type = 0;
+
+ simple_lock(&lkp->lk_interlock);
+ if (lkp->lk_exclusivecount != 0)
+ lock_type = LK_EXCLUSIVE;
+ else if (lkp->lk_sharecount != 0)
+ lock_type = LK_SHARED;
+ simple_unlock(&lkp->lk_interlock);
+ return (lock_type);
+}
+
+/*
+ * Print out information about the state of a lock.  Used by VOP_PRINT
+ * routines to display status about contained locks.
+ */
+void
+lockmgr_printinfo(lkp)
+ struct lock *lkp;
+{
+
+ if (lkp->lk_sharecount)
+ printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg,
+ lkp->lk_sharecount);
+ else if (lkp->lk_flags & LK_HAVE_EXCL)
+ printf(" lock type %s: EXCL (count %d) by pid %d",
+ lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder);
+ if (lkp->lk_waitcount > 0)
+ printf(" with %d pending", lkp->lk_waitcount);
+}
+
+#if defined(SIMPLELOCK_DEBUG) && (NCPUS == 1 || defined(COMPILING_LINT))
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+static int lockpausetime = 0;
+SYSCTL_INT(_debug, OID_AUTO, lockpausetime, CTLFLAG_RW, &lockpausetime, 0, "");
+
+static int simplelockrecurse;
+
+/*
+ * Simple lock functions so that the debugger can see from whence
+ * they are being called.
+ */
+void
+simple_lock_init(alp)
+ struct simplelock *alp;
+{
+
+ alp->lock_data = 0;
+}
+
+void
+_simple_lock(alp, id, l)
+ struct simplelock *alp;
+ const char *id;
+ int l;
+{
+
+ if (simplelockrecurse)
+ return;
+ if (alp->lock_data == 1) {
+ if (lockpausetime == -1)
+ panic("%s:%d: simple_lock: lock held", id, l);
+ printf("%s:%d: simple_lock: lock held\n", id, l);
+ if (lockpausetime == 1) {
+ Debugger("simple_lock");
+ /*BACKTRACE(curproc); */
+ } else if (lockpausetime > 1) {
+ printf("%s:%d: simple_lock: lock held...", id, l);
+ tsleep(&lockpausetime, PCATCH | PPAUSE, "slock",
+ lockpausetime * hz);
+ printf(" continuing\n");
+ }
+ }
+ alp->lock_data = 1;
+ if (curproc)
+ curproc->p_simple_locks++;
+}
+
+int
+_simple_lock_try(alp, id, l)
+ struct simplelock *alp;
+ const char *id;
+ int l;
+{
+
+ if (alp->lock_data)
+ return (0);
+ if (simplelockrecurse)
+ return (1);
+ alp->lock_data = 1;
+ if (curproc)
+ curproc->p_simple_locks++;
+ return (1);
+}
+
+void
+_simple_unlock(alp, id, l)
+ struct simplelock *alp;
+ const char *id;
+ int l;
+{
+
+ if (simplelockrecurse)
+ return;
+ if (alp->lock_data == 0) {
+ if (lockpausetime == -1)
+ panic("%s:%d: simple_unlock: lock not held", id, l);
+ printf("%s:%d: simple_unlock: lock not held\n", id, l);
+ if (lockpausetime == 1) {
+ Debugger("simple_unlock");
+ /* BACKTRACE(curproc); */
+ } else if (lockpausetime > 1) {
+ printf("%s:%d: simple_unlock: lock not held...", id, l);
+ tsleep(&lockpausetime, PCATCH | PPAUSE, "sunlock",
+ lockpausetime * hz);
+ printf(" continuing\n");
+ }
+ }
+ alp->lock_data = 0;
+ if (curproc)
+ curproc->p_simple_locks--;
+}
+#elif defined(SIMPLELOCK_DEBUG)
+#error "SIMPLELOCK_DEBUG is not compatible with SMP!"
+#endif /* SIMPLELOCK_DEBUG && NCPUS == 1 */
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
new file mode 100644
index 0000000..cc1b8a5
--- /dev/null
+++ b/sys/kern/kern_lockf.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Scooter Morris at Genentech Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
+ * $Id: kern_lockf.c,v 1.19 1998/07/29 17:38:14 bde Exp $
+ */
+
+#include "opt_debug_lockf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+
+#include <sys/lockf.h>
+
+/*
+ * This variable controls the maximum number of processes that will
+ * be checked in doing deadlock detection.
+ */
+static int maxlockdepth = MAXDEPTH;
+
+#ifdef LOCKF_DEBUG
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+
+
+static int lockf_debug = 0;
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+#endif
+
+static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
+
+#define NOLOCKF (struct lockf *)0
+#define SELF 0x1
+#define OTHERS 0x2
+static int lf_clearlock __P((struct lockf *));
+static int lf_findoverlap __P((struct lockf *,
+ struct lockf *, int, struct lockf ***, struct lockf **));
+static struct lockf *
+ lf_getblock __P((struct lockf *));
+static int lf_getlock __P((struct lockf *, struct flock *));
+static int lf_setlock __P((struct lockf *));
+static void lf_split __P((struct lockf *, struct lockf *));
+static void lf_wakelock __P((struct lockf *));
+
+/*
+ * Advisory record locking support
+ */
+int
+lf_advlock(ap, head, size)
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ caddr_t a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap;
+ struct lockf **head;
+ u_quad_t size;
+{
+ register struct flock *fl = ap->a_fl;
+ register struct lockf *lock;
+ off_t start, end;
+ int error;
+
+ /*
+ * Convert the flock structure into a start and end.
+ */
+ switch (fl->l_whence) {
+
+ case SEEK_SET:
+ case SEEK_CUR:
+ /*
+ * Caller is responsible for adding any necessary offset
+ * when SEEK_CUR is used.
+ */
+ start = fl->l_start;
+ break;
+
+ case SEEK_END:
+ start = size + fl->l_start;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (start < 0)
+ return (EINVAL);
+ if (fl->l_len == 0)
+ end = -1;
+ else {
+ end = start + fl->l_len - 1;
+ if (end < start)
+ return (EINVAL);
+ }
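+	/*
+	 * Worked example (editor's note): with size = 1000, l_whence =
+	 * SEEK_END, l_start = -10 and l_len = 5, the lock covers
+	 * start = 990 through end = 994; an l_len of 0 always means
+	 * "to end of file" and is encoded as end = -1.
+	 */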
+ /*
+	 * Avoid the common case of unlocking when the inode has no locks.
+ */
+ if (*head == (struct lockf *)0) {
+ if (ap->a_op != F_SETLK) {
+ fl->l_type = F_UNLCK;
+ return (0);
+ }
+ }
+ /*
+ * Create the lockf structure
+ */
+ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+ lock->lf_start = start;
+ lock->lf_end = end;
+ lock->lf_id = ap->a_id;
+/* lock->lf_inode = ip; */ /* XXX JH */
+ lock->lf_type = fl->l_type;
+ lock->lf_head = head;
+ lock->lf_next = (struct lockf *)0;
+ TAILQ_INIT(&lock->lf_blkhd);
+ lock->lf_flags = ap->a_flags;
+ /*
+ * Do the requested operation.
+ */
+	switch (ap->a_op) {
+ case F_SETLK:
+ return (lf_setlock(lock));
+
+ case F_UNLCK:
+ error = lf_clearlock(lock);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ case F_GETLK:
+ error = lf_getlock(lock, fl);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ default:
+ free(lock, M_LOCKF);
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
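+
+/*
+ * Caller sketch (editor's note): filesystems typically forward their
+ * VOP_ADVLOCK through this routine; a ufs-style implementation would
+ * look roughly like:
+ *
+ *	int
+ *	ufs_advlock(ap)
+ *		struct vop_advlock_args *ap;
+ *	{
+ *		struct inode *ip = VTOI(ap->a_vp);
+ *
+ *		return (lf_advlock(ap, &ip->i_lockf, ip->i_size));
+ *	}
+ */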
+
+/*
+ * Set a byte-range lock.
+ */
+static int
+lf_setlock(lock)
+ register struct lockf *lock;
+{
+ register struct lockf *block;
+ struct lockf **head = lock->lf_head;
+ struct lockf **prev, *overlap, *ltmp;
+ static char lockstr[] = "lockf";
+ int ovcase, priority, needtolink, error;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ /*
+ * Set the priority
+ */
+ priority = PLOCK;
+ if (lock->lf_type == F_WRLCK)
+ priority += 4;
+ priority |= PCATCH;
+ /*
+ * Scan lock list for this file looking for locks that would block us.
+ */
+ while ((block = lf_getblock(lock))) {
+ /*
+ * Free the structure and return if nonblocking.
+ */
+ if ((lock->lf_flags & F_WAIT) == 0) {
+ FREE(lock, M_LOCKF);
+ return (EAGAIN);
+ }
+ /*
+ * We are blocked. Since flock style locks cover
+ * the whole file, there is no chance for deadlock.
+ * For byte-range locks we must check for deadlock.
+ *
+ * Deadlock detection is done by looking through the
+ * wait channels to see if there are any cycles that
+ * involve us. MAXDEPTH is set just to make sure we
+ * do not go off into neverland.
+ */
+ if ((lock->lf_flags & F_POSIX) &&
+ (block->lf_flags & F_POSIX)) {
+ register struct proc *wproc;
+ register struct lockf *waitblock;
+ int i = 0;
+
+ /* The block is waiting on something */
+ wproc = (struct proc *)block->lf_id;
+ while (wproc->p_wchan &&
+ (wproc->p_wmesg == lockstr) &&
+ (i++ < maxlockdepth)) {
+ waitblock = (struct lockf *)wproc->p_wchan;
+ /* Get the owner of the blocking lock */
+ waitblock = waitblock->lf_next;
+ if ((waitblock->lf_flags & F_POSIX) == 0)
+ break;
+ wproc = (struct proc *)waitblock->lf_id;
+ if (wproc == (struct proc *)lock->lf_id) {
+ free(lock, M_LOCKF);
+ return (EDEADLK);
+ }
+ }
+ }
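+		/*
+		 * Illustration (editor's note): if process A would sleep
+		 * here waiting for B's lock while B is already asleep
+		 * waiting for a lock A holds, the walk through the wait
+		 * channels above leads from B back to A and the request
+		 * fails with EDEADLK instead of completing the cycle.
+		 */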
+ /*
+ * For flock type locks, we must first remove
+ * any shared locks that we hold before we sleep
+ * waiting for an exclusive lock.
+ */
+ if ((lock->lf_flags & F_FLOCK) &&
+ lock->lf_type == F_WRLCK) {
+ lock->lf_type = F_UNLCK;
+ (void) lf_clearlock(lock);
+ lock->lf_type = F_WRLCK;
+ }
+ /*
+ * Add our lock to the blocked list and sleep until we're free.
+ * Remember who blocked us (for deadlock detection).
+ */
+ lock->lf_next = block;
+ TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: blocking on", block);
+ lf_printlist("lf_setlock", block);
+ }
+#endif /* LOCKF_DEBUG */
+ if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) {
+ /*
+ * We may have been awakened by a signal (in
+ * which case we must remove ourselves from the
+ * blocked list) and/or by another process
+ * releasing a lock (in which case we have already
+ * been removed from the blocked list and our
+ * lf_next field set to NOLOCKF).
+ */
+ if (lock->lf_next)
+ TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock,
+ lf_block);
+ free(lock, M_LOCKF);
+ return (error);
+ }
+ }
+ /*
+ * No blocks!! Add the lock. Note that we will
+ * downgrade or upgrade any overlapping locks this
+ * process already owns.
+ *
+ * Skip over locks owned by other processes.
+ * Handle any locks that overlap and are owned by ourselves.
+ */
+ prev = head;
+ block = *head;
+ needtolink = 1;
+ for (;;) {
+ ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
+ if (ovcase)
+ block = overlap->lf_next;
+ /*
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ switch (ovcase) {
+ case 0: /* no overlap */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ break;
+
+ case 1: /* overlap == lock */
+ /*
+ * If downgrading lock, others may be
+ * able to acquire it.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK)
+ lf_wakelock(overlap);
+ overlap->lf_type = lock->lf_type;
+ FREE(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+
+ case 2: /* overlap contains lock */
+ /*
+ * Check for common starting point and different types.
+ */
+ if (overlap->lf_type == lock->lf_type) {
+ free(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+ }
+ if (overlap->lf_start == lock->lf_start) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ overlap->lf_start = lock->lf_end + 1;
+ } else
+ lf_split(overlap, lock);
+ lf_wakelock(overlap);
+ break;
+
+ case 3: /* lock contains overlap */
+ /*
+ * If downgrading lock, others may be able to
+ * acquire it, otherwise take the list.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK) {
+ lf_wakelock(overlap);
+ } else {
+				while ((ltmp = overlap->lf_blkhd.tqh_first) != NULL) {
+ TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
+ lf_block);
+ TAILQ_INSERT_TAIL(&lock->lf_blkhd,
+ ltmp, lf_block);
+ }
+ }
+ /*
+ * Add the new lock if necessary and delete the overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap->lf_next;
+ prev = &lock->lf_next;
+ needtolink = 0;
+ } else
+ *prev = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ /*
+ * Add lock after overlap on the list.
+ */
+ lock->lf_next = overlap->lf_next;
+ overlap->lf_next = lock;
+ overlap->lf_end = lock->lf_start - 1;
+ prev = &lock->lf_next;
+ lf_wakelock(overlap);
+ needtolink = 0;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ /*
+ * Add the new lock before overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ overlap->lf_start = lock->lf_end + 1;
+ lf_wakelock(overlap);
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: got the lock", lock);
+ lf_printlist("lf_setlock", lock);
+ }
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
+
+/*
+ * Remove a byte-range lock on an inode.
+ *
+ * Generally, find the lock (or an overlap to that lock)
+ * and remove it (or shrink it), then wakeup anyone we can.
+ */
+static int
+lf_clearlock(unlock)
+ register struct lockf *unlock;
+{
+ struct lockf **head = unlock->lf_head;
+ register struct lockf *lf = *head;
+ struct lockf *overlap, **prev;
+ int ovcase;
+
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (unlock->lf_type != F_UNLCK)
+ panic("lf_clearlock: bad type");
+ if (lockf_debug & 1)
+ lf_print("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ prev = head;
+ while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) {
+ /*
+ * Wakeup the list of locks to be retried.
+ */
+ lf_wakelock(overlap);
+
+ switch (ovcase) {
+
+ case 1: /* overlap == lock */
+ *prev = overlap->lf_next;
+ FREE(overlap, M_LOCKF);
+ break;
+
+ case 2: /* overlap contains lock: split it */
+ if (overlap->lf_start == unlock->lf_start) {
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ lf_split(overlap, unlock);
+ overlap->lf_next = unlock->lf_next;
+ break;
+
+ case 3: /* lock contains overlap */
+ *prev = overlap->lf_next;
+ lf = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ overlap->lf_end = unlock->lf_start - 1;
+ prev = &overlap->lf_next;
+ lf = overlap->lf_next;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_printlist("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
+
+/*
+ * Check whether there is a blocking lock,
+ * and if so return its process identifier.
+ */
+static int
+lf_getlock(lock, fl)
+ register struct lockf *lock;
+ register struct flock *fl;
+{
+ register struct lockf *block;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_getlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ if ((block = lf_getblock(lock))) {
+ fl->l_type = block->lf_type;
+ fl->l_whence = SEEK_SET;
+ fl->l_start = block->lf_start;
+ if (block->lf_end == -1)
+ fl->l_len = 0;
+ else
+ fl->l_len = block->lf_end - block->lf_start + 1;
+ if (block->lf_flags & F_POSIX)
+ fl->l_pid = ((struct proc *)(block->lf_id))->p_pid;
+ else
+ fl->l_pid = -1;
+ } else {
+ fl->l_type = F_UNLCK;
+ }
+ return (0);
+}
+
+/*
+ * Walk the list of locks for an inode and
+ * return the first blocking lock.
+ */
+static struct lockf *
+lf_getblock(lock)
+ register struct lockf *lock;
+{
+ struct lockf **prev, *overlap, *lf = *(lock->lf_head);
+ int ovcase;
+
+ prev = lock->lf_head;
+ while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) {
+ /*
+ * We've found an overlap, see if it blocks us
+ */
+		if (lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)
+ return (overlap);
+ /*
+ * Nope, point to the next one on the list and
+ * see if it blocks us
+ */
+ lf = overlap->lf_next;
+ }
+ return (NOLOCKF);
+}
+
+/*
+ * Walk the list of locks for an inode to
+ * find an overlapping lock (if any).
+ *
+ * NOTE: this returns only the FIRST overlapping lock. There
+ * may be more than one.
+ */
+static int
+lf_findoverlap(lf, lock, type, prev, overlap)
+ register struct lockf *lf;
+ struct lockf *lock;
+ int type;
+ struct lockf ***prev;
+ struct lockf **overlap;
+{
+ off_t start, end;
+
+ *overlap = lf;
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_findoverlap: looking for overlap in", lock);
+#endif /* LOCKF_DEBUG */
+ start = lock->lf_start;
+ end = lock->lf_end;
+ while (lf != NOLOCKF) {
+ if (((type & SELF) && lf->lf_id != lock->lf_id) ||
+ ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("\tchecking", lf);
+#endif /* LOCKF_DEBUG */
+ /*
+ * OK, check for overlap
+ *
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
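+		/*
+		 * Editor's illustration of the cases (ranges are purely
+		 * illustrative; lock shown on the first line):
+		 *
+		 *	lock:           |=====|
+		 *	case 0:  |--|            |--|
+		 *	case 1:         |=====|
+		 *	case 2:       |---------|
+		 *	case 3:          |---|
+		 *	case 4:      |-----|
+		 *	case 5:           |-----|
+		 */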
+ if ((lf->lf_end != -1 && start > lf->lf_end) ||
+ (end != -1 && lf->lf_start > end)) {
+ /* Case 0 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("no overlap\n");
+#endif /* LOCKF_DEBUG */
+ if ((type & SELF) && end != -1 && lf->lf_start > end)
+ return (0);
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+ if ((lf->lf_start == start) && (lf->lf_end == end)) {
+ /* Case 1 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap == lock\n");
+#endif /* LOCKF_DEBUG */
+ return (1);
+ }
+ if ((lf->lf_start <= start) &&
+ (end != -1) &&
+ ((lf->lf_end >= end) || (lf->lf_end == -1))) {
+ /* Case 2 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap contains lock\n");
+#endif /* LOCKF_DEBUG */
+ return (2);
+ }
+ if (start <= lf->lf_start &&
+ (end == -1 ||
+ (lf->lf_end != -1 && end >= lf->lf_end))) {
+ /* Case 3 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("lock contains overlap\n");
+#endif /* LOCKF_DEBUG */
+ return (3);
+ }
+ if ((lf->lf_start < start) &&
+ ((lf->lf_end >= start) || (lf->lf_end == -1))) {
+ /* Case 4 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap starts before lock\n");
+#endif /* LOCKF_DEBUG */
+ return (4);
+ }
+ if ((lf->lf_start > start) &&
+ (end != -1) &&
+ ((lf->lf_end > end) || (lf->lf_end == -1))) {
+ /* Case 5 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap ends after lock\n");
+#endif /* LOCKF_DEBUG */
+ return (5);
+ }
+ panic("lf_findoverlap: default");
+ }
+ return (0);
+}
+
+/*
+ * Split a lock and a contained region into
+ * two or three locks as necessary.
+ */
+static void
+lf_split(lock1, lock2)
+ register struct lockf *lock1;
+ register struct lockf *lock2;
+{
+ register struct lockf *splitlock;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2) {
+ lf_print("lf_split", lock1);
+ lf_print("splitting from", lock2);
+ }
+#endif /* LOCKF_DEBUG */
+ /*
+	 * Check to see if splitting into only two pieces.
+ */
+ if (lock1->lf_start == lock2->lf_start) {
+ lock1->lf_start = lock2->lf_end + 1;
+ lock2->lf_next = lock1;
+ return;
+ }
+ if (lock1->lf_end == lock2->lf_end) {
+ lock1->lf_end = lock2->lf_start - 1;
+ lock2->lf_next = lock1->lf_next;
+ lock1->lf_next = lock2;
+ return;
+ }
+ /*
+ * Make a new lock consisting of the last part of
+ * the encompassing lock
+ */
+ MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+ bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock);
+ splitlock->lf_start = lock2->lf_end + 1;
+ TAILQ_INIT(&splitlock->lf_blkhd);
+ lock1->lf_end = lock2->lf_start - 1;
+ /*
+ * OK, now link it in
+ */
+ splitlock->lf_next = lock1->lf_next;
+ lock2->lf_next = splitlock;
+ lock1->lf_next = lock2;
+}
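+
+/*
+ * Worked example (editor's note): removing [40,59] from a held range
+ * [0,99] takes the third path above: lock1 is trimmed to [0,39], the
+ * new splitlock covers [60,99], and the list is relinked as
+ * lock1 -> lock2 -> splitlock.
+ */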
+
+/*
+ * Wakeup a blocklist
+ */
+static void
+lf_wakelock(listhead)
+ struct lockf *listhead;
+{
+ register struct lockf *wakelock;
+
+	while ((wakelock = listhead->lf_blkhd.tqh_first) != NULL) {
+ TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
+ wakelock->lf_next = NOLOCKF;
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_wakelock: awakening", wakelock);
+#endif /* LOCKF_DEBUG */
+ wakeup((caddr_t)wakelock);
+ }
+}
+
+#ifdef LOCKF_DEBUG
+/*
+ * Print out a lock.
+ */
+void
+lf_print(tag, lock)
+ char *tag;
+ register struct lockf *lock;
+{
+
+ printf("%s: lock %p for ", tag, (void *)lock);
+ if (lock->lf_flags & F_POSIX)
+ printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)lock->lf_id);
+ /* XXX no %qd in kernel. Truncate. */
+ printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld",
+ (u_long)lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev),
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)lock->lf_start, (long)lock->lf_end);
+ if (lock->lf_blkhd.tqh_first)
+ printf(" block %p\n", (void *)lock->lf_blkhd.tqh_first);
+ else
+ printf("\n");
+}
+
+void
+lf_printlist(tag, lock)
+ char *tag;
+ struct lockf *lock;
+{
+ register struct lockf *lf, *blk;
+
+ printf("%s: Lock list for ino %lu on dev <%d, %d>:\n",
+ tag, (u_long)lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev));
+ for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) {
+ printf("\tlock %p for ",(void *)lf);
+ if (lf->lf_flags & F_POSIX)
+ printf("proc %ld",
+ (long)((struct proc *)lf->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)lf->lf_id);
+ /* XXX no %qd in kernel. Truncate. */
+ printf(", %s, start %ld, end %ld",
+ lf->lf_type == F_RDLCK ? "shared" :
+ lf->lf_type == F_WRLCK ? "exclusive" :
+ lf->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)lf->lf_start, (long)lf->lf_end);
+ for (blk = lf->lf_blkhd.tqh_first; blk;
+ blk = blk->lf_block.tqe_next) {
+ printf("\n\t\tlock request %p for ", (void *)blk);
+ if (blk->lf_flags & F_POSIX)
+ printf("proc %ld",
+ (long)((struct proc *)blk->lf_id)->p_pid);
+ else
+ printf("id %p", (void *)blk->lf_id);
+ /* XXX no %qd in kernel. Truncate. */
+ printf(", %s, start %ld, end %ld",
+ blk->lf_type == F_RDLCK ? "shared" :
+ blk->lf_type == F_WRLCK ? "exclusive" :
+ blk->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (long)blk->lf_start,
+ (long)blk->lf_end);
+ if (blk->lf_blkhd.tqh_first)
+ panic("lf_printlist: bad list");
+ }
+ printf("\n");
+ }
+}
+#endif /* LOCKF_DEBUG */
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
new file mode 100644
index 0000000..be9f9d3
--- /dev/null
+++ b/sys/kern/kern_malloc.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
+ * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $
+ */
+
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#define MALLOC_INSTANTIATE
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/vmmeter.h>
+#include <sys/lock.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static void kmeminit __P((void *));
+SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)
+
+static MALLOC_DEFINE(M_FREE, "free", "should be on free list");
+
+static struct malloc_type *kmemstatistics;
+static struct kmembuckets bucket[MINBUCKET + 16];
+static struct kmemusage *kmemusage;
+static char *kmembase;
+static char *kmemlimit;
+static int vm_kmem_size;
+
+#ifdef INVARIANTS
+/*
+ * This structure provides a set of masks to catch unaligned frees.
+ */
+static long addrmask[] = { 0,
+ 0x00000001, 0x00000003, 0x00000007, 0x0000000f,
+ 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
+ 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
+ 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
+};
+
+/*
+ * The WEIRD_ADDR is used as known text to copy into free objects so
+ * that modifications after frees can be detected.
+ */
+#define WEIRD_ADDR 0xdeadc0de
+#define MAX_COPY 64
+
+/*
+ * Normally the first word of the structure is used to hold the list
+ * pointer for free objects. However, when running with diagnostics,
+ * we use the third and fourth fields, so as to catch modifications
+ * in the most commonly trashed first two words.
+ */
+struct freelist {
+ long spare0;
+ struct malloc_type *type;
+ long spare1;
+ caddr_t next;
+};
+#else /* !INVARIANTS */
+struct freelist {
+ caddr_t next;
+};
+#endif /* INVARIANTS */
+
+/*
+ * malloc:
+ *
+ * Allocate a block of memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ *
+ * If M_ASLEEP is set (M_NOWAIT must also be set), this routine
+ * will have the side effect of calling asleep() if it returns NULL,
+ * allowing the parent to await() at some future time.
+ */
+void *
+malloc(size, type, flags)
+ unsigned long size;
+ struct malloc_type *type;
+ int flags;
+{
+ register struct kmembuckets *kbp;
+ register struct kmemusage *kup;
+ register struct freelist *freep;
+ long indx, npg, allocsize;
+ int s;
+ caddr_t va, cp, savedlist;
+#ifdef INVARIANTS
+ long *end, *lp;
+ int copysize;
+ char *savedtype;
+#endif
+ register struct malloc_type *ksp = type;
+
+ /*
+ * Must be at splmem() prior to initializing segment to handle
+ * potential initialization race.
+ */
+
+ s = splmem();
+
+ if (!type->ks_next) {
+ malloc_init(type);
+ }
+
+ indx = BUCKETINDX(size);
+ kbp = &bucket[indx];
+
+ while (ksp->ks_memuse >= ksp->ks_limit) {
+ if (flags & M_ASLEEP) {
+ if (ksp->ks_limblocks < 65535)
+ ksp->ks_limblocks++;
+ asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0);
+ }
+ if (flags & M_NOWAIT) {
+ splx(s);
+ return ((void *) NULL);
+ }
+ if (ksp->ks_limblocks < 65535)
+ ksp->ks_limblocks++;
+ tsleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0);
+ }
+ ksp->ks_size |= 1 << indx;
+#ifdef INVARIANTS
+ copysize = 1 << indx < MAX_COPY ? 1 << indx : MAX_COPY;
+#endif
+ if (kbp->kb_next == NULL) {
+ kbp->kb_last = NULL;
+ if (size > MAXALLOCSAVE)
+ allocsize = roundup(size, PAGE_SIZE);
+ else
+ allocsize = 1 << indx;
+ npg = btoc(allocsize);
+ va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags);
+ if (va == NULL) {
+ splx(s);
+ return ((void *) NULL);
+ }
+ kbp->kb_total += kbp->kb_elmpercl;
+ kup = btokup(va);
+ kup->ku_indx = indx;
+ if (allocsize > MAXALLOCSAVE) {
+ if (npg > 65535)
+ panic("malloc: allocation too large");
+ kup->ku_pagecnt = npg;
+ ksp->ks_memuse += allocsize;
+ goto out;
+ }
+ kup->ku_freecnt = kbp->kb_elmpercl;
+ kbp->kb_totalfree += kbp->kb_elmpercl;
+ /*
+ * Just in case we blocked while allocating memory,
+ * and someone else also allocated memory for this
+ * bucket, don't assume the list is still empty.
+ */
+ savedlist = kbp->kb_next;
+ kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize;
+ for (;;) {
+ freep = (struct freelist *)cp;
+#ifdef INVARIANTS
+ /*
+ * Copy in known text to detect modification
+ * after freeing.
+ */
+ end = (long *)&cp[copysize];
+ for (lp = (long *)cp; lp < end; lp++)
+ *lp = WEIRD_ADDR;
+ freep->type = M_FREE;
+#endif /* INVARIANTS */
+ if (cp <= va)
+ break;
+ cp -= allocsize;
+ freep->next = cp;
+ }
+ freep->next = savedlist;
+ if (kbp->kb_last == NULL)
+ kbp->kb_last = (caddr_t)freep;
+ }
+ va = kbp->kb_next;
+ kbp->kb_next = ((struct freelist *)va)->next;
+#ifdef INVARIANTS
+ freep = (struct freelist *)va;
+ savedtype = (char *) type->ks_shortdesc;
+#if BYTE_ORDER == BIG_ENDIAN
+	freep->type = (struct malloc_type *)(WEIRD_ADDR >> 16);
+#endif
+#if BYTE_ORDER == LITTLE_ENDIAN
+ freep->type = (struct malloc_type *)WEIRD_ADDR;
+#endif
+ if ((intptr_t)(void *)&freep->next & 0x2)
+ freep->next = (caddr_t)((WEIRD_ADDR >> 16)|(WEIRD_ADDR << 16));
+ else
+ freep->next = (caddr_t)WEIRD_ADDR;
+ end = (long *)&va[copysize];
+ for (lp = (long *)va; lp < end; lp++) {
+ if (*lp == WEIRD_ADDR)
+ continue;
+ printf("%s %ld of object %p size %lu %s %s (0x%lx != 0x%lx)\n",
+ "Data modified on freelist: word",
+ (long)(lp - (long *)va), (void *)va, size,
+ "previous type", savedtype, *lp, (u_long)WEIRD_ADDR);
+ break;
+ }
+ freep->spare0 = 0;
+#endif /* INVARIANTS */
+ kup = btokup(va);
+ if (kup->ku_indx != indx)
+ panic("malloc: wrong bucket");
+ if (kup->ku_freecnt == 0)
+ panic("malloc: lost data");
+ kup->ku_freecnt--;
+ kbp->kb_totalfree--;
+ ksp->ks_memuse += 1 << indx;
+out:
+ kbp->kb_calls++;
+ ksp->ks_inuse++;
+ ksp->ks_calls++;
+ if (ksp->ks_memuse > ksp->ks_maxused)
+ ksp->ks_maxused = ksp->ks_memuse;
+ splx(s);
+ return ((void *) va);
+}
+
+/*
+ * free:
+ *
+ * Free a block of memory allocated by malloc.
+ *
+ * This routine may not block.
+ */
+void
+free(addr, type)
+ void *addr;
+ struct malloc_type *type;
+{
+ register struct kmembuckets *kbp;
+ register struct kmemusage *kup;
+ register struct freelist *freep;
+ long size;
+ int s;
+#ifdef INVARIANTS
+ struct freelist *fp;
+ long *end, *lp, alloc, copysize;
+#endif
+ register struct malloc_type *ksp = type;
+
+ if (!type->ks_next)
+ panic("freeing with unknown type (%s)", type->ks_shortdesc);
+
+ KASSERT(kmembase <= (char *)addr && (char *)addr < kmemlimit,
+ ("free: address %p out of range", (void *)addr));
+ kup = btokup(addr);
+ size = 1 << kup->ku_indx;
+ kbp = &bucket[kup->ku_indx];
+ s = splmem();
+#ifdef INVARIANTS
+ /*
+ * Check for returns of data that do not point to the
+ * beginning of the allocation.
+ */
+ if (size > PAGE_SIZE)
+ alloc = addrmask[BUCKETINDX(PAGE_SIZE)];
+ else
+ alloc = addrmask[kup->ku_indx];
+ if (((uintptr_t)(void *)addr & alloc) != 0)
+ panic("free: unaligned addr %p, size %ld, type %s, mask %ld",
+ (void *)addr, size, type->ks_shortdesc, alloc);
+#endif /* INVARIANTS */
+ if (size > MAXALLOCSAVE) {
+ kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt));
+ size = kup->ku_pagecnt << PAGE_SHIFT;
+ ksp->ks_memuse -= size;
+ kup->ku_indx = 0;
+ kup->ku_pagecnt = 0;
+ if (ksp->ks_memuse + size >= ksp->ks_limit &&
+ ksp->ks_memuse < ksp->ks_limit)
+ wakeup((caddr_t)ksp);
+ ksp->ks_inuse--;
+ kbp->kb_total -= 1;
+ splx(s);
+ return;
+ }
+ freep = (struct freelist *)addr;
+#ifdef INVARIANTS
+ /*
+ * Check for multiple frees. Use a quick check to see if
+ * it looks free before laboriously searching the freelist.
+ */
+ if (freep->spare0 == WEIRD_ADDR) {
+ fp = (struct freelist *)kbp->kb_next;
+ while (fp) {
+ if (fp->spare0 != WEIRD_ADDR)
+ panic("free: free item %p modified", fp);
+ else if (addr == (caddr_t)fp)
+ panic("free: multiple freed item %p", addr);
+ fp = (struct freelist *)fp->next;
+ }
+ }
+ /*
+ * Copy in known text to detect modification after freeing
+ * and to make it look free. Also, save the type being freed
+ * so we can list likely culprit if modification is detected
+ * when the object is reallocated.
+ */
+ copysize = size < MAX_COPY ? size : MAX_COPY;
+ end = (long *)&((caddr_t)addr)[copysize];
+ for (lp = (long *)addr; lp < end; lp++)
+ *lp = WEIRD_ADDR;
+ freep->type = type;
+#endif /* INVARIANTS */
+ kup->ku_freecnt++;
+	if (kup->ku_freecnt >= kbp->kb_elmpercl) {
+		if (kup->ku_freecnt > kbp->kb_elmpercl)
+			panic("free: multiple frees");
+		else if (kbp->kb_totalfree > kbp->kb_highwat)
+			kbp->kb_couldfree++;
+	}
+ kbp->kb_totalfree++;
+ ksp->ks_memuse -= size;
+ if (ksp->ks_memuse + size >= ksp->ks_limit &&
+ ksp->ks_memuse < ksp->ks_limit)
+ wakeup((caddr_t)ksp);
+ ksp->ks_inuse--;
+#ifdef OLD_MALLOC_MEMORY_POLICY
+ if (kbp->kb_next == NULL)
+ kbp->kb_next = addr;
+ else
+ ((struct freelist *)kbp->kb_last)->next = addr;
+ freep->next = NULL;
+ kbp->kb_last = addr;
+#else
+ /*
+ * Return memory to the head of the queue for quick reuse. This
+ * can improve performance by improving the probability of the
+ * item being in the cache when it is reused.
+ */
+ if (kbp->kb_next == NULL) {
+ kbp->kb_next = addr;
+ kbp->kb_last = addr;
+ freep->next = NULL;
+ } else {
+ freep->next = kbp->kb_next;
+ kbp->kb_next = addr;
+ }
+#endif
+ splx(s);
+}
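+
+/*
+ * Usage sketch (editor's note; M_MYDRIVER and struct foo are
+ * hypothetical):
+ *
+ *	static MALLOC_DEFINE(M_MYDRIVER, "mydriver", "my driver state");
+ *
+ *	struct foo *fp;
+ *
+ *	fp = malloc(sizeof(*fp), M_MYDRIVER, M_WAITOK);
+ *	...
+ *	free(fp, M_MYDRIVER);
+ *
+ * M_NOWAIT callers must instead check for a NULL return, since no
+ * sleep is performed on their behalf.
+ */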
+
+/*
+ * Initialize the kernel memory allocator
+ */
+/* ARGSUSED*/
+static void
+kmeminit(dummy)
+ void *dummy;
+{
+ register long indx;
+ int npg;
+ int mem_size;
+
+#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0)
+#error "kmeminit: MAXALLOCSAVE not power of 2"
+#endif
+#if (MAXALLOCSAVE > MINALLOCSIZE * 32768)
+#error "kmeminit: MAXALLOCSAVE too big"
+#endif
+#if (MAXALLOCSAVE < PAGE_SIZE)
+#error "kmeminit: MAXALLOCSAVE too small"
+#endif
+
+ /*
+ * Try to auto-tune the kernel memory size, so that it is
+ * more applicable for a wider range of machine sizes.
+ * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while
+ * a VM_KMEM_SIZE of 12MB is a fair compromise. The
+ * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
+ * available, and on an X86 with a total KVA space of 256MB,
+ * try to keep VM_KMEM_SIZE_MAX at 80MB or below.
+ *
+ * Note that the kmem_map is also used by the zone allocator,
+ * so make sure that there is enough space.
+ */
+ vm_kmem_size = VM_KMEM_SIZE;
+ mem_size = cnt.v_page_count * PAGE_SIZE;
+
+#if defined(VM_KMEM_SIZE_SCALE)
+ if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size)
+ vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE;
+#endif
+
+#if defined(VM_KMEM_SIZE_MAX)
+ if (vm_kmem_size >= VM_KMEM_SIZE_MAX)
+ vm_kmem_size = VM_KMEM_SIZE_MAX;
+#endif
+
+ if (vm_kmem_size > 2 * (cnt.v_page_count * PAGE_SIZE))
+ vm_kmem_size = 2 * (cnt.v_page_count * PAGE_SIZE);
+
+ npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + vm_kmem_size)
+ / PAGE_SIZE;
+
+ kmemusage = (struct kmemusage *) kmem_alloc(kernel_map,
+ (vm_size_t)(npg * sizeof(struct kmemusage)));
+ kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
+ (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
+ kmem_map->system_map = 1;
+ for (indx = 0; indx < MINBUCKET + 16; indx++) {
+ if (1 << indx >= PAGE_SIZE)
+ bucket[indx].kb_elmpercl = 1;
+ else
+ bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx);
+ bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl;
+ }
+}
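+
+/*
+ * Worked example (editor's note): with PAGE_SIZE = 4096 the 256-byte
+ * bucket gets kb_elmpercl = 4096 / 256 = 16 elements per page and a
+ * high-water mark of 5 * 16 = 80 free elements, beyond which frees
+ * are counted in kb_couldfree as reclaim opportunities.
+ */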
+
+void
+malloc_init(data)
+ void *data;
+{
+ struct malloc_type *type = (struct malloc_type *)data;
+
+ if (type->ks_magic != M_MAGIC)
+ panic("malloc type lacks magic");
+
+ if (type->ks_next)
+ return;
+
+ if (cnt.v_page_count == 0)
+ panic("malloc_init not allowed before vm init");
+
+ /*
+ * The default limits for each malloc region is 1/2 of the
+ * malloc portion of the kmem map size.
+ */
+ type->ks_limit = vm_kmem_size / 2;
+ type->ks_next = kmemstatistics;
+ kmemstatistics = type;
+}
+
+void
+malloc_uninit(data)
+ void *data;
+{
+ struct malloc_type *type = (struct malloc_type *)data;
+ struct malloc_type *t;
+
+ if (type->ks_magic != M_MAGIC)
+ panic("malloc type lacks magic");
+
+ if (cnt.v_page_count == 0)
+ panic("malloc_uninit not allowed before vm init");
+
+ if (type == kmemstatistics)
+ kmemstatistics = type->ks_next;
+ else {
+ for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) {
+ if (t->ks_next == type) {
+ t->ks_next = type->ks_next;
+ break;
+ }
+ }
+ }
+}
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
new file mode 100644
index 0000000..22fcd33
--- /dev/null
+++ b/sys/kern/kern_mib.c
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $Id: kern_mib.c,v 1.15 1998/03/28 11:49:52 dufault Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+
+#if defined(SMP)
+#include <machine/smp.h>
+#endif
+
+SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
+ "Sysctl internal magic");
+SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0,
+ "High kernel, proc, limits &c");
+SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
+ "Virtual memory");
+SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
+ "File system");
+SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
+ "Network, (see socket.h)");
+SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
+ "Debugging");
+SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
+ "hardware");
+SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
+ "machine dependent");
+SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
+ "user-level");
+
+SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0,
+ "p1003_1b, (see p1003_1b.h)");
+
+SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "");
+
+SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "");
+
+SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "");
+
+SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "");
+
+extern int osreldate;
+SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid,
+ CTLFLAG_RW, &maxprocperuid, 0, "");
+
+SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "");
+
+SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, "");
+
+SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "");
+
+SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "");
+
+#ifdef _POSIX_SAVED_IDS
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "");
+#else
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "");
+#endif
+
+char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
+
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile,
+ CTLFLAG_RW, kernelname, sizeof kernelname, "");
+
+#ifdef SMP
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, "");
+#else
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, "");
+#endif
+
+SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "");
+
+SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "");
+
+static char machine_arch[] = MACHINE_ARCH;
+SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD,
+ machine_arch, 0, "");
+
+char hostname[MAXHOSTNAMELEN];
+
+SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW,
+ hostname, sizeof(hostname), "");
+
+int securelevel = -1;
+
+static int
+sysctl_kern_securelvl SYSCTL_HANDLER_ARGS
+{
+ int error, level;
+
+ level = securelevel;
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (level < securelevel)
+ return (EPERM);
+ securelevel = level;
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_kern_securelvl, "I", "");
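+
+/*
+ * Editor's note: the handler above allows the superuser to raise the
+ * level, e.g. "sysctl -w kern.securelevel=2", but any attempt to set
+ * a lower value returns EPERM, so the level can only drop again at
+ * reboot.
+ */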
+
+char domainname[MAXHOSTNAMELEN];
+SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW,
+    domainname, sizeof(domainname), "");
+
+long hostid;
+/* Some trouble here, if sizeof (int) != sizeof (long) */
+SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "");
+
+/*
+ * This is really cheating. These actually live in the libc, something
+ * which I'm not quite sure is a good idea anyway, but in order for
+ * getnext and friends to actually work, we define dummies here.
+ */
+SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "");
+SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "");
diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c
new file mode 100644
index 0000000..afe9f2e
--- /dev/null
+++ b/sys/kern/kern_module.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_module.c,v 1.13 1999/01/09 14:59:50 dfr Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+#include <sys/proc.h>
+
+#define M_MODULE M_TEMP /* XXX */
+
+typedef TAILQ_HEAD(, module) modulelist_t;
+struct module {
+ TAILQ_ENTRY(module) link; /* chain together all modules */
+ TAILQ_ENTRY(module) flink; /* all modules in a file */
+ struct linker_file* file; /* file which contains this module */
+ int refs; /* reference count */
+ int id; /* unique id number */
+ char *name; /* module name */
+ modeventhand_t handler; /* event handler */
+ void *arg; /* argument for handler */
+ modspecific_t data; /* module specific data */
+};
+
+#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
+
+static modulelist_t modules;
+static int nextid = 1;
+
+static void module_shutdown(int, void*);
+
+static void
+module_init(void* arg)
+{
+ TAILQ_INIT(&modules);
+ at_shutdown(module_shutdown, 0, SHUTDOWN_POST_SYNC);
+}
+
+SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
+
+static void
+module_shutdown(int arg1, void* arg2)
+{
+ module_t mod;
+
+ for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link))
+ MOD_EVENT(mod, MOD_SHUTDOWN);
+}
+
+void
+module_register_init(void *arg)
+{
+ moduledata_t* data = (moduledata_t*) arg;
+ int error;
+
+ error = module_register(data->name, data->evhand, data->priv, data->_file);
+ if (error)
+ printf("module_register_init: module_register(%s, %lx, %p) error %d\n",
+ data->name, (u_long)(uintfptr_t)data->evhand, data->priv, error);
+}
+
+int
+module_register(const char* name, modeventhand_t handler, void* arg, void *file)
+{
+ size_t namelen;
+ module_t newmod;
+ int error;
+ linker_file_t container = file;
+
+ namelen = strlen(name) + 1;
+ newmod = (module_t) malloc(sizeof(struct module) + namelen,
+ M_MODULE, M_WAITOK);
+ if (newmod == 0)
+ return ENOMEM;
+
+ newmod->refs = 1;
+ newmod->id = nextid++;
+ newmod->name = (char *) (newmod + 1);
+ strcpy(newmod->name, name);
+ newmod->handler = handler;
+ newmod->arg = arg;
+ bzero(&newmod->data, sizeof(newmod->data));
+ TAILQ_INSERT_TAIL(&modules, newmod, link);
+
+ if (container == NULL)
+ container = linker_current_file;
+ if (container) {
+ TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
+ newmod->file = container;
+ } else
+ newmod->file = 0;
+
+	if ((error = MOD_EVENT(newmod, MOD_LOAD)) != 0) {
+ MOD_EVENT(newmod, MOD_UNLOAD);
+ module_release(newmod);
+ return error;
+ }
+
+ return 0;
+}
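+
+/*
+ * Usage sketch (editor's note; the "foo" names are hypothetical).
+ * Most modules do not call module_register() directly; they supply a
+ * moduledata_t and let the DECLARE_MODULE() glue from <sys/module.h>
+ * arrange for module_register_init() to run, roughly:
+ *
+ *	static int
+ *	foo_handler(module_t mod, int what, void *arg)
+ *	{
+ *		switch (what) {
+ *		case MOD_LOAD:
+ *			return 0;
+ *		case MOD_UNLOAD:
+ *			return 0;
+ *		default:
+ *			return 0;
+ *		}
+ *	}
+ *
+ *	static moduledata_t foo_mod = { "foo", foo_handler, 0, 0 };
+ *	DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+ */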
+
+void
+module_reference(module_t mod)
+{
+ MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
+
+ mod->refs++;
+}
+
+void
+module_release(module_t mod)
+{
+ if (mod->refs <= 0)
+ panic("module_release: bad reference count");
+
+ MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
+
+ mod->refs--;
+ if (mod->refs == 0) {
+ TAILQ_REMOVE(&modules, mod, link);
+ if (mod->file) {
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ }
+ free(mod, M_MODULE);
+ }
+}
+
+module_t
+module_lookupbyname(const char* name)
+{
+ module_t mod;
+
+ for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) {
+ if (!strcmp(mod->name, name))
+ return mod;
+ }
+
+ return 0;
+}
+
+module_t
+module_lookupbyid(int modid)
+{
+ module_t mod;
+
+ for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) {
+ if (mod->id == modid)
+ return mod;
+ }
+
+ return 0;
+}
+
+int
+module_unload(module_t mod)
+{
+ return MOD_EVENT(mod, MOD_UNLOAD);
+}
+
+int
+module_getid(module_t mod)
+{
+ return mod->id;
+}
+
+module_t
+module_getfnext(module_t mod)
+{
+ return TAILQ_NEXT(mod, flink);
+}
+
+void
+module_setspecific(module_t mod, modspecific_t *datap)
+{
+ mod->data = *datap;
+}
+
+/*
+ * Syscalls.
+ */
+int
+modnext(struct proc* p, struct modnext_args* uap)
+{
+ module_t mod;
+
+ p->p_retval[0] = -1;
+ if (SCARG(uap, modid) == 0) {
+ mod = TAILQ_FIRST(&modules);
+ if (mod) {
+ p->p_retval[0] = mod->id;
+ return 0;
+ } else
+ return ENOENT;
+ }
+
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (!mod)
+ return ENOENT;
+
+ if (TAILQ_NEXT(mod, link))
+ p->p_retval[0] = TAILQ_NEXT(mod, link)->id;
+ else
+ p->p_retval[0] = 0;
+ return 0;
+}
+
+int
+modfnext(struct proc* p, struct modfnext_args* uap)
+{
+ module_t mod;
+
+ p->p_retval[0] = -1;
+
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (!mod)
+ return ENOENT;
+
+ if (TAILQ_NEXT(mod, flink))
+ p->p_retval[0] = TAILQ_NEXT(mod, flink)->id;
+ else
+ p->p_retval[0] = 0;
+ return 0;
+}
+
+struct module_stat_v1 {
+ int version; /* set to sizeof(struct module_stat) */
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+};
+
+int
+modstat(struct proc* p, struct modstat_args* uap)
+{
+ module_t mod;
+ int error = 0;
+ int namelen;
+ int version;
+ struct module_stat* stat;
+
+ mod = module_lookupbyid(SCARG(uap, modid));
+ if (!mod)
+ return ENOENT;
+
+ stat = SCARG(uap, stat);
+
+ /*
+ * Check the version of the user's structure.
+ */
+	if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
+ goto out;
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+	if ((error = copyout(mod->name, &stat->name[0], namelen)) != 0)
+ goto out;
+
+	if ((error = copyout(&mod->refs, &stat->refs, sizeof(int))) != 0)
+ goto out;
+	if ((error = copyout(&mod->id, &stat->id, sizeof(int))) != 0)
+ goto out;
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat)) {
+		if ((error = copyout(&mod->data, &stat->data,
+		    sizeof(mod->data))) != 0)
+ goto out;
+ }
+
+ p->p_retval[0] = 0;
+
+out:
+ return error;
+}
+
+int
+modfind(struct proc* p, struct modfind_args* uap)
+{
+ int error = 0;
+ char name[MAXMODNAME];
+ module_t mod;
+
+	if ((error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) != 0)
+ goto out;
+
+ mod = module_lookupbyname(name);
+ if (!mod)
+ error = ENOENT;
+ else
+ p->p_retval[0] = mod->id;
+
+out:
+ return error;
+}
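+
+/*
+ * Userland sketch (editor's note): the syscalls above back libc
+ * wrappers of the same names, so a status utility can walk every
+ * module roughly like this:
+ *
+ *	struct module_stat ms;
+ *	int id;
+ *
+ *	ms.version = sizeof(ms);
+ *	for (id = modnext(0); id > 0; id = modnext(id))
+ *		if (modstat(id, &ms) == 0)
+ *			printf("%d %s refs=%d\n", ms.id, ms.name, ms.refs);
+ */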
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
new file mode 100644
index 0000000..2f4114d
--- /dev/null
+++ b/sys/kern/kern_ntptime.c
@@ -0,0 +1,856 @@
+/******************************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993, 1994 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and its *
+ * documentation for any purpose and without fee is hereby granted, provided *
+ * that the above copyright notice appears in all copies and that both the *
+ * copyright notice and this permission notice appear in supporting *
+ * documentation, and that the name University of Delaware not be used in *
+ * advertising or publicity pertaining to distribution of the software *
+ * without specific, written prior permission. The University of Delaware *
+ * makes no representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied warranty. *
+ * *
+ ******************************************************************************/
+
+/*
+ * Modification history kern_ntptime.c
+ *
+ * 24 Sep 94 David L. Mills
+ * Tightened code at exits.
+ *
+ * 24 Mar 94 David L. Mills
+ * Revised syscall interface to include new variables for PPS
+ * time discipline.
+ *
+ * 14 Feb 94 David L. Mills
+ * Added code for external clock
+ *
+ * 28 Nov 93 David L. Mills
+ * Revised frequency scaling to conform with adjusted parameters
+ *
+ * 17 Sep 93 David L. Mills
+ * Created file
+ */
+/*
+ * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS
+ * V4.1.1 and V4.1.3
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by hardclock() to adjust the phase and
+ * frequency of the phase-lock loop which controls the system clock.
+ */
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/timex.h>
+#include <sys/timepps.h>
+#include <sys/sysctl.h>
+
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The following variables are read and set by the ntp_adjtime() system
+ * call.
+ *
+ * time_state shows the state of the system clock, with values defined
+ * in the timex.h header file.
+ *
+ * time_status shows the status of the system clock, with bits defined
+ * in the timex.h header file.
+ *
+ * time_offset is used by the PLL/FLL to adjust the system time in small
+ * increments.
+ *
+ * time_constant determines the bandwidth or "stiffness" of the PLL.
+ *
+ * time_tolerance determines maximum frequency error or tolerance of the
+ * CPU clock oscillator and is a property of the architecture; however,
+ * in principle it could change as result of the presence of external
+ * discipline signals, for instance.
+ *
+ * time_precision is usually equal to the kernel tick variable; however,
+ * in cases where a precision clock counter or external clock is
+ * available, the resolution can be much less than this and depend on
+ * whether the external clock is working or not.
+ *
+ * time_maxerror is initialized by an ntp_adjtime() call and increased by
+ * the kernel once each second to reflect the maximum error
+ * bound growth.
+ *
+ * time_esterror is set and read by the ntp_adjtime() call, but
+ * otherwise not used by the kernel.
+ */
+static int time_status = STA_UNSYNC; /* clock status bits */
+static int time_state = TIME_OK; /* clock state */
+static long time_offset = 0; /* time offset (us) */
+static long time_constant = 0; /* pll time constant */
+static long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */
+static long time_precision = 1; /* clock precision (us) */
+static long time_maxerror = MAXPHASE; /* maximum error (us) */
+static long time_esterror = MAXPHASE; /* estimated error (us) */
+static int time_daemon = 0; /* No timedaemon active */
+
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock. The scale
+ * factors are defined in the timex.h header file.
+ *
+ * time_phase and time_freq are the phase increment and the frequency
+ * increment, respectively, of the kernel time variable at each tick of
+ * the clock.
+ *
+ * time_freq is set via ntp_adjtime() from a value stored in a file when
+ * the synchronization daemon is first started. Its value is retrieved
+ * via ntp_adjtime() and written to the file about once per hour by the
+ * daemon.
+ *
+ * time_adj is the adjustment added to the value of tick at each timer
+ * interrupt and is recomputed from time_phase and time_freq at each
+ * seconds rollover.
+ *
+ * time_reftime is the seconds portion of the system time on the last
+ * call to ntp_adjtime(). It is used to adjust the time_freq variable
+ * and to increase the time_maxerror as the time since last update
+ * increases.
+ */
+long time_phase = 0; /* phase offset (scaled us) */
+static long time_freq = 0; /* frequency offset (scaled ppm) */
+long time_adj = 0; /* tick adjust (scaled 1 / hz) */
+static long time_reftime = 0; /* time at last adjustment (s) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used only if the kernel PPS discipline
+ * code is configured (PPS_SYNC). The scale factors are defined in the
+ * timex.h header file.
+ *
+ * pps_time contains the time at each calibration interval, as read by
+ * microtime(). pps_count counts the seconds of the calibration
+ * interval, the duration of which is nominally pps_shift in powers of
+ * two.
+ *
+ * pps_offset is the time offset produced by the time median filter
+ * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
+ * this filter.
+ *
+ * pps_freq is the frequency offset produced by the frequency median
+ * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
+ * by this filter.
+ *
+ * pps_usec is latched from a high resolution counter or external clock
+ * at pps_time. Here we want the hardware counter contents only, not the
+ * contents plus the time_tv.usec as usual.
+ *
+ * pps_valid counts the number of seconds since the last PPS update. It
+ * is used as a watchdog timer to disable the PPS discipline should the
+ * PPS signal be lost.
+ *
+ * pps_glitch counts the number of seconds since the beginning of an
+ * offset burst more than tick/2 from current nominal offset. It is used
+ * mainly to suppress error bursts due to priority conflicts between the
+ * PPS interrupt and timer interrupt.
+ *
+ * pps_intcnt counts the calibration intervals for use in the interval-
+ * adaptation algorithm. It's just too complicated for words.
+ */
+static struct timeval pps_time; /* kernel time at last interval */
+static long pps_offset = 0; /* pps time offset (us) */
+static long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */
+static long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */
+static long pps_freq = 0; /* frequency offset (scaled ppm) */
+static long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
+static long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */
+static long pps_usec = 0; /* microsec counter at last interval */
+static long pps_valid = PPS_VALID; /* pps signal watchdog counter */
+static int pps_glitch = 0; /* pps signal glitch counter */
+static int pps_count = 0; /* calibration interval counter (s) */
+static int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
+static int pps_intcnt = 0; /* intervals at current duration */
+
+/*
+ * PPS signal quality monitors
+ *
+ * pps_jitcnt counts the seconds that have been discarded because the
+ * jitter measured by the time median filter exceeds the limit MAXTIME
+ * (100 us).
+ *
+ * pps_calcnt counts the frequency calibration intervals, which are
+ * variable from 4 s to 256 s.
+ *
+ * pps_errcnt counts the calibration intervals which have been discarded
+ * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
+ * calibration interval jitter exceeds two ticks.
+ *
+ * pps_stbcnt counts the calibration intervals that have been discarded
+ * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
+ */
+static long pps_jitcnt = 0; /* jitter limit exceeded */
+static long pps_calcnt = 0; /* calibration intervals */
+static long pps_errcnt = 0; /* calibration errors */
+static long pps_stbcnt = 0; /* stability limit exceeded */
+#endif /* PPS_SYNC */
+
+static void hardupdate __P((int64_t offset, int prescaled));
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different from the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 1024 s, operation should be in phase-lock mode
+ * (STA_FLL = 0), where the loop is disciplined to phase. For update
+ * intervals greater than this, operation should be in frequency-lock
+ * mode (STA_FLL = 1), where the loop is disciplined to frequency.
+ *
+ * Note: splclock() is in effect.
+ */
+static void
+hardupdate(offset, prescaled)
+ int64_t offset;
+ int prescaled;
+{
+ long mtemp;
+ int64_t ltemp;
+
+ if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
+ return;
+ if (prescaled)
+ ltemp = offset;
+ else
+ ltemp = offset << SHIFT_UPDATE;
+#ifdef PPS_SYNC
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ ltemp = pps_offset << SHIFT_UPDATE;
+#endif /* PPS_SYNC */
+
+ /*
+ * Scale the phase adjustment and clamp to the operating range.
+ */
+ if (ltemp > (MAXPHASE << SHIFT_UPDATE))
+ time_offset = MAXPHASE << SHIFT_UPDATE;
+ else if (ltemp < -(MAXPHASE << SHIFT_UPDATE))
+ time_offset = -(MAXPHASE << SHIFT_UPDATE);
+ else
+ time_offset = ltemp;
+
+ /*
+ * Select whether the frequency is to be controlled and in which
+ * mode (PLL or FLL). Clamp to the operating range. Ugly
+ * multiply/divide should be replaced someday.
+ */
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time_second;
+ mtemp = time_second - time_reftime;
+ time_reftime = time_second;
+ if (time_status & STA_FLL) {
+ if (mtemp >= MINSEC) {
+ ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
+ SHIFT_UPDATE));
+ if (ltemp < 0)
+ time_freq -= -ltemp >> SHIFT_KH;
+ else
+ time_freq += ltemp >> SHIFT_KH;
+ }
+ } else {
+ if (mtemp < MAXSEC) {
+ ltemp = time_offset * mtemp;
+ if (ltemp < 0)
+ time_freq -= -ltemp >> ((int64_t)time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC + SHIFT_UPDATE);
+ else
+ time_freq += ltemp >> ((int64_t)time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC + SHIFT_UPDATE);
+ }
+ }
+ if (time_freq > time_tolerance)
+ time_freq = time_tolerance;
+ else if (time_freq < -time_tolerance)
+ time_freq = -time_tolerance;
+}
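+
+/*
+ * A worked reading of the mode split above (illustrative numbers, not
+ * normative): with STA_FLL clear and, say, 64 s between updates
+ * (mtemp = 64 < MAXSEC), the PLL arm accumulates time_offset * mtemp
+ * into time_freq, damped by twice the time constant; with STA_FLL set
+ * and mtemp >= MINSEC, the FLL arm instead folds in the average
+ * frequency error over the interval, time_offset / mtemp, directly.
+ */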
+
+/*
+ * On rollover of the second the phase adjustment to be used for
+ * the next second is calculated. Also, the maximum error is
+ * increased by the tolerance. If the PPS frequency discipline
+ * code is present, the phase is increased to compensate for the
+ * CPU clock oscillator frequency error.
+ *
+ * On a 32-bit machine and given parameters in the timex.h
+ * header file, the maximum phase adjustment is +-512 ms and
+ * maximum frequency offset is (a tad less than) +-512 ppm. On a
+ * 64-bit machine, you shouldn't need to ask.
+ */
+void
+ntp_update_second(struct timecounter *tc)
+{
+ u_int32_t *newsec;
+ long ltemp;
+
+ if (!time_daemon)
+ return;
+
+ newsec = &tc->tc_offset_sec;
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+#ifdef PPS_SYNC
+ pps_valid++;
+ if (pps_valid == PPS_VALID) {
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+#else
+ ltemp = time_freq;
+#endif /* PPS_SYNC */
+ if (ltemp < 0)
+ time_adj -= -ltemp << (SHIFT_SCALE - SHIFT_USEC);
+ else
+ time_adj += ltemp << (SHIFT_SCALE - SHIFT_USEC);
+
+ tc->tc_adjustment = time_adj;
+
+	/*
+	 * XXX - this is really bogus, but can't be fixed until xntpd's
+	 * idea of the system clock is fixed to know how the user wants
+	 * leap seconds handled; in the meantime, we assume that users
+	 * of NTP are running without proper leap second support (this
+	 * is now the default anyway).
+	 */
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+ * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if ((*newsec) % 86400 == 0) {
+ (*newsec)--;
+ time_state = TIME_OOP;
+ }
+ break;
+
+ case TIME_DEL:
+ if (((*newsec) + 1) % 86400 == 0) {
+ (*newsec)++;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ }
+}
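+
+/*
+ * Illustrative trace of the leap-insert path above (a sketch, not
+ * normative): with STA_INS set the machine moves TIME_OK -> TIME_INS;
+ * at the next midnight rollover (*newsec % 86400 == 0) the second
+ * counter is stepped back once, so 23:59:59 repeats, and the state
+ * advances TIME_INS -> TIME_OOP -> TIME_WAIT, returning to TIME_OK
+ * once the daemon clears STA_INS.
+ */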
+
+static int
+ntp_sysctl SYSCTL_HANDLER_ARGS
+{
+ struct timeval atv;
+ struct ntptimeval ntv;
+ int s;
+
+ s = splclock();
+ microtime(&atv);
+ ntv.time = atv;
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ splx(s);
+
+ ntv.time_state = time_state;
+
+ /*
+ * Status word error decode. If any of these conditions
+ * occur, an error is returned, instead of the status
+ * word. Most applications will care only about the fact
+ * the system clock may not be trusted, not about the
+ * details.
+ *
+ * Hardware or software error
+ */
+ if (time_status & (STA_UNSYNC | STA_CLOCKERR)) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS signal lost when either time or frequency
+ * synchronization requested
+ */
+ if (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS jitter exceeded when time synchronization
+ * requested
+ */
+ if (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS wander exceeded or calibration error when
+ * frequency synchronization requested
+ */
+ if (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)) {
+ ntv.time_state = TIME_ERROR;
+ }
+ return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req));
+}
+
+SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0,
+ "NTP kernel PLL related stuff");
+SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
+	0, sizeof(struct ntptimeval), ntp_sysctl, "S,ntptimeval", "");
+
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_adjtime_args {
+ struct timex *tp;
+};
+#endif
+
+int
+ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap)
+{
+ struct timex ntv;
+ int modes;
+ int s;
+ int error;
+
+ time_daemon = 1;
+
+ error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
+ if (error)
+ return error;
+
+ /*
+ * Update selected clock variables - only the superuser can
+ * change anything. Note that there is no error checking here on
+ * the assumption the superuser should know what it is doing.
+ */
+ modes = ntv.modes;
+ if ((modes != 0)
+ && (error = suser(p->p_cred->pc_ucred, &p->p_acflag)))
+ return error;
+
+ s = splclock();
+ if (modes & MOD_FREQUENCY)
+#ifdef PPS_SYNC
+ time_freq = ntv.freq - pps_freq;
+#else /* PPS_SYNC */
+ time_freq = ntv.freq;
+#endif /* PPS_SYNC */
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST)
+ time_constant = ntv.constant;
+ if (modes & MOD_OFFSET)
+ hardupdate(ntv.offset, modes & MOD_DOSCALE);
+
+ ntv.modes |= MOD_CANSCALE;
+ /*
+ * Retrieve all clock variables
+ */
+ if (modes & MOD_DOSCALE)
+ ntv.offset = time_offset;
+ else if (time_offset < 0)
+ ntv.offset = -(-time_offset >> SHIFT_UPDATE);
+ else
+ ntv.offset = time_offset >> SHIFT_UPDATE;
+#ifdef PPS_SYNC
+ ntv.freq = time_freq + pps_freq;
+#else /* PPS_SYNC */
+ ntv.freq = time_freq;
+#endif /* PPS_SYNC */
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ ntv.precision = time_precision;
+ ntv.tolerance = time_tolerance;
+#ifdef PPS_SYNC
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = pps_freq;
+ ntv.jitter = pps_jitter >> PPS_AVG;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+#endif /* PPS_SYNC */
+ (void)splx(s);
+
+ error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
+ if (!error) {
+ /*
+ * Status word error decode. See comments in
+ * ntp_gettime() routine.
+ */
+ p->p_retval[0] = time_state;
+ if (time_status & (STA_UNSYNC | STA_CLOCKERR))
+ p->p_retval[0] = TIME_ERROR;
+ if (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL))
+ p->p_retval[0] = TIME_ERROR;
+ if (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER)
+ p->p_retval[0] = TIME_ERROR;
+ if (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR))
+ p->p_retval[0] = TIME_ERROR;
+ }
+ return error;
+}
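+
+#if 0
+/*
+ * Illustrative userland sketch (not part of this change): an NTP
+ * daemon feeding a measured phase offset to the kernel loop.
+ * "measured_us" is a hypothetical variable; error handling is
+ * omitted.
+ */
+	struct timex tx;
+
+	tx.modes = MOD_OFFSET | MOD_STATUS;
+	tx.offset = measured_us;		/* microseconds */
+	tx.status = STA_PLL;
+	if (ntp_adjtime(&tx) == TIME_ERROR)
+		warnx("clock not synchronized");
+#endif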
+
+#ifdef PPS_SYNC
+
+/* We need this ugly monster twice, so let's macroize it. */
+
+#define MEDIAN3X(a, m, s, i1, i2, i3) \
+ do { \
+ m = a[i2]; \
+ s = a[i1] - a[i3]; \
+ } while (0)
+
+#define MEDIAN3(a, m, s) \
+ do { \
+ if (a[0] > a[1]) { \
+ if (a[1] > a[2]) \
+ MEDIAN3X(a, m, s, 0, 1, 2); \
+ else if (a[2] > a[0]) \
+ MEDIAN3X(a, m, s, 2, 0, 1); \
+ else \
+ MEDIAN3X(a, m, s, 0, 2, 1); \
+ } else { \
+ if (a[2] > a[1]) \
+ MEDIAN3X(a, m, s, 2, 1, 0); \
+ else if (a[0] > a[2]) \
+ MEDIAN3X(a, m, s, 1, 0, 2); \
+ else \
+ MEDIAN3X(a, m, s, 1, 2, 0); \
+ } \
+ } while (0)
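+
+/*
+ * Worked example (illustrative): for a[] = {7, 3, 5}, a[0] > a[1] but
+ * neither a[1] > a[2] nor a[2] > a[0], so MEDIAN3X(a, m, s, 0, 2, 1)
+ * fires: the median m = a[2] = 5, and the spread s = a[0] - a[1] = 4
+ * is the difference of the two non-median samples.
+ */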
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. It measures the PPS phase
+ * and leaves it in a handy spot for the hardclock() routine. It
+ * integrates successive PPS phase differences and calculates the
+ * frequency offset. This is used in hardclock() to discipline the CPU
+ * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter
+ * value at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, except for certain exceptions: The PPS frequency pps_freq
+ * and phase pps_offset variables are determined by this routine and
+ * updated atomically. The time_tolerance variable can be considered a
+ * constant, since it is infrequently changed, and then only when the
+ * PPS signal is disabled. The watchdog counter pps_valid is updated
+ * once per second by hardclock() and is atomically cleared in this
+ * routine.
+ */
+void
+hardpps(tvp, p_usec)
+ struct timeval *tvp; /* time at PPS */
+ long p_usec; /* hardware counter at PPS */
+{
+ long u_usec, v_usec, bigtick;
+ long cal_sec, cal_usec;
+
+ /*
+ * An occasional glitch can be produced when the PPS interrupt
+ * occurs in the hardclock() routine before the time variable is
+ * updated. Here the offset is discarded when the difference
+ * between it and the last one is greater than tick/2, but not
+ * if the interval since the first discard exceeds 30 s.
+ */
+ time_status |= STA_PPSSIGNAL;
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = 0;
+ u_usec = -tvp->tv_usec;
+ if (u_usec < -500000)
+ u_usec += 1000000;
+ v_usec = pps_offset - u_usec;
+ if (v_usec < 0)
+ v_usec = -v_usec;
+ if (v_usec > (tick >> 1)) {
+ if (pps_glitch > MAXGLITCH) {
+ pps_glitch = 0;
+ pps_tf[2] = u_usec;
+ pps_tf[1] = u_usec;
+ } else {
+ pps_glitch++;
+ u_usec = pps_offset;
+ }
+ } else
+ pps_glitch = 0;
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = u_usec;
+ MEDIAN3(pps_tf, pps_offset, v_usec);
+ if (v_usec > MAXTIME)
+ pps_jitcnt++;
+ v_usec = (v_usec << PPS_AVG) - pps_jitter;
+ if (v_usec < 0)
+ pps_jitter -= -v_usec >> PPS_AVG;
+ else
+ pps_jitter += v_usec >> PPS_AVG;
+ if (pps_jitter > (MAXTIME >> 1))
+ time_status |= STA_PPSJITTER;
+
+ /*
+ * During the calibration interval adjust the starting time when
+ * the tick overflows. At the end of the interval compute the
+ * duration of the interval and the difference of the hardware
+ * counters at the beginning and end of the interval. This code
+ * is deliciously complicated by the fact valid differences may
+ * exceed the value of tick when using long calibration
+ * intervals and small ticks. Note that the counter can be
+ * greater than tick if caught at just the wrong instant, but
+ * the values returned and used here are correct.
+ */
+ bigtick = (long)tick << SHIFT_USEC;
+ pps_usec -= pps_freq;
+ if (pps_usec >= bigtick)
+ pps_usec -= bigtick;
+ if (pps_usec < 0)
+ pps_usec += bigtick;
+ pps_time.tv_sec++;
+ pps_count++;
+ if (pps_count < (1 << pps_shift))
+ return;
+ pps_count = 0;
+ pps_calcnt++;
+ u_usec = p_usec << SHIFT_USEC;
+ v_usec = pps_usec - u_usec;
+ if (v_usec >= bigtick >> 1)
+ v_usec -= bigtick;
+ if (v_usec < -(bigtick >> 1))
+ v_usec += bigtick;
+ if (v_usec < 0)
+ v_usec = -(-v_usec >> pps_shift);
+ else
+ v_usec = v_usec >> pps_shift;
+ pps_usec = u_usec;
+ cal_sec = tvp->tv_sec;
+ cal_usec = tvp->tv_usec;
+ cal_sec -= pps_time.tv_sec;
+ cal_usec -= pps_time.tv_usec;
+ if (cal_usec < 0) {
+ cal_usec += 1000000;
+ cal_sec--;
+ }
+ pps_time = *tvp;
+
+ /*
+ * Check for lost interrupts, noise, excessive jitter and
+ * excessive frequency error. The number of timer ticks during
+ * the interval may vary +-1 tick. Add to this a margin of one
+ * tick for the PPS signal jitter and maximum frequency
+ * deviation. If the limits are exceeded, the calibration
+ * interval is reset to the minimum and we start over.
+ */
+ u_usec = (long)tick << 1;
+ if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec))
+ || (cal_sec == 0 && cal_usec < u_usec))
+ || v_usec > time_tolerance || v_usec < -time_tolerance) {
+ pps_errcnt++;
+ pps_shift = PPS_SHIFT;
+ pps_intcnt = 0;
+ time_status |= STA_PPSERROR;
+ return;
+ }
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * frequency. The median sample becomes the frequency offset
+ * estimate; the difference between the other two samples
+ * becomes the frequency dispersion (stability) estimate.
+ */
+ pps_ff[2] = pps_ff[1];
+ pps_ff[1] = pps_ff[0];
+ pps_ff[0] = v_usec;
+ MEDIAN3(pps_ff, u_usec, v_usec);
+
+ /*
+ * Here the frequency dispersion (stability) is updated. If it
+ * is less than one-fourth the maximum (MAXFREQ), the frequency
+ * offset is updated as well, but clamped to the tolerance. It
+ * will be processed later by the hardclock() routine.
+ */
+ v_usec = (v_usec >> 1) - pps_stabil;
+ if (v_usec < 0)
+ pps_stabil -= -v_usec >> PPS_AVG;
+ else
+ pps_stabil += v_usec >> PPS_AVG;
+ if (pps_stabil > MAXFREQ >> 2) {
+ pps_stbcnt++;
+ time_status |= STA_PPSWANDER;
+ return;
+ }
+ if (time_status & STA_PPSFREQ) {
+ if (u_usec < 0) {
+ pps_freq -= -u_usec >> PPS_AVG;
+ if (pps_freq < -time_tolerance)
+ pps_freq = -time_tolerance;
+ u_usec = -u_usec;
+ } else {
+ pps_freq += u_usec >> PPS_AVG;
+ if (pps_freq > time_tolerance)
+ pps_freq = time_tolerance;
+ }
+ }
+
+ /*
+ * Here the calibration interval is adjusted. If the maximum
+ * time difference is greater than tick / 4, reduce the interval
+ * by half. If this is not the case for four consecutive
+ * intervals, double the interval.
+ */
+ if (u_usec << pps_shift > bigtick >> 2) {
+ pps_intcnt = 0;
+ if (pps_shift > PPS_SHIFT)
+ pps_shift--;
+ } else if (pps_intcnt >= 4) {
+ pps_intcnt = 0;
+ if (pps_shift < PPS_SHIFTMAX)
+ pps_shift++;
+ } else
+ pps_intcnt++;
+}
+
+#endif /* PPS_SYNC */
+
+int
+std_pps_ioctl(u_long cmd, caddr_t data, pps_params_t *pp, pps_info_t *pi, int ppscap)
+{
+ pps_params_t *app;
+ pps_info_t *api;
+
+ switch (cmd) {
+ case PPS_IOC_CREATE:
+ return (0);
+ case PPS_IOC_DESTROY:
+ return (0);
+ case PPS_IOC_SETPARAMS:
+ app = (pps_params_t *)data;
+ if (app->mode & ~ppscap)
+ return (EINVAL);
+ *pp = *app;
+ return (0);
+ case PPS_IOC_GETPARAMS:
+ app = (pps_params_t *)data;
+ *app = *pp;
+ return (0);
+ case PPS_IOC_GETCAP:
+ *(int*)data = ppscap;
+ return (0);
+ case PPS_IOC_FETCH:
+ api = (pps_info_t *)data;
+ *api = *pi;
+ pi->current_mode = pp->mode;
+ return (0);
+ case PPS_IOC_WAIT:
+ return (EOPNOTSUPP);
+ default:
+ return (ENODEV);
+ }
+}
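+
+#if 0
+/*
+ * Illustrative sketch (not part of this change): a driver capturing
+ * PPS pulses can delegate the whole ioctl family above from its own
+ * ioctl routine.  "sc" and its pps fields are hypothetical.
+ */
+	case PPS_IOC_CREATE:
+	case PPS_IOC_DESTROY:
+	case PPS_IOC_SETPARAMS:
+	case PPS_IOC_GETPARAMS:
+	case PPS_IOC_GETCAP:
+	case PPS_IOC_FETCH:
+	case PPS_IOC_WAIT:
+		return (std_pps_ioctl(cmd, data, &sc->sc_pps_params,
+		    &sc->sc_pps_info, PPS_CAPTUREASSERT));
+#endif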
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
new file mode 100644
index 0000000..ad63a98
--- /dev/null
+++ b/sys/kern/kern_physio.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ *
+ * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static void physwakeup __P((struct buf *bp));
+static struct buf * phygetvpbuf(dev_t dev, int resid);
+
+int
+physio(strategy, bp, dev, rw, minp, uio)
+ d_strategy_t *strategy;
+ struct buf *bp;
+ dev_t dev;
+ int rw;
+ u_int (*minp) __P((struct buf *bp));
+ struct uio *uio;
+{
+ int i;
+	int bufflags = rw ? B_READ : 0;
+ int error;
+ int spl;
+ caddr_t sa;
+ int bp_alloc = (bp == 0);
+ struct buf *bpa;
+
+	/*
+	 * keep the process from being swapped
+	 */
+	curproc->p_flag |= P_PHYSIO;
+
+ /* create and build a buffer header for a transfer */
+ bpa = (struct buf *)phygetvpbuf(dev, uio->uio_resid);
+ if (!bp_alloc) {
+ spl = splbio();
+ while (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep((caddr_t)bp, PRIBIO, "physbw", 0);
+ }
+ bp->b_flags |= B_BUSY;
+ splx(spl);
+ } else {
+ bp = bpa;
+ }
+
+ /*
+ * get a copy of the kva from the physical buffer
+ */
+ sa = bpa->b_data;
+ bp->b_proc = curproc;
+ error = bp->b_error = 0;
+
+	for (i = 0; i < uio->uio_iovcnt; i++) {
+		while (uio->uio_iov[i].iov_len) {
+
+			bp->b_dev = dev;
+			bp->b_bcount = uio->uio_iov[i].iov_len;
+			bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags;
+			bp->b_iodone = physwakeup;
+			bp->b_data = uio->uio_iov[i].iov_base;
+			bp->b_bcount = minp(bp);
+			if (minp != minphys)
+				bp->b_bcount = minphys(bp);
+ bp->b_bufsize = bp->b_bcount;
+ /*
+ * pass in the kva from the physical buffer
+ * for the temporary kernel mapping.
+ */
+ bp->b_saveaddr = sa;
+ bp->b_blkno = btodb(uio->uio_offset);
+ bp->b_offset = uio->uio_offset;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
+ error = EFAULT;
+ goto doerror;
+ }
+
+ /* bring buffer into kernel space */
+ vmapbuf(bp);
+ }
+
+ /* perform transfer */
+ (*strategy)(bp);
+
+ spl = splbio();
+ while ((bp->b_flags & B_DONE) == 0)
+ tsleep((caddr_t)bp, PRIBIO, "physstr", 0);
+ splx(spl);
+
+ /* release mapping into kernel space */
+ if (uio->uio_segflg == UIO_USERSPACE)
+ vunmapbuf(bp);
+
+ /*
+ * update the uio data
+ */
+ {
+ int iolen = bp->b_bcount - bp->b_resid;
+
+ if (iolen == 0 && !(bp->b_flags & B_ERROR))
+ goto doerror; /* EOF */
+ uio->uio_iov[i].iov_len -= iolen;
+ uio->uio_iov[i].iov_base += iolen;
+ uio->uio_resid -= iolen;
+ uio->uio_offset += iolen;
+ }
+
+ /*
+ * check for an error
+ */
+			if (bp->b_flags & B_ERROR) {
+ error = bp->b_error;
+ goto doerror;
+ }
+ }
+ }
+
+
+doerror:
+ relpbuf(bpa, NULL);
+ if (!bp_alloc) {
+ bp->b_flags &= ~(B_BUSY|B_PHYS);
+		if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~B_WANTED;
+ wakeup((caddr_t)bp);
+ }
+ }
+	/*
+	 * allow the process to be swapped
+	 */
+	curproc->p_flag &= ~P_PHYSIO;
+
+ return (error);
+}
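+
+#if 0
+/*
+ * Illustrative sketch (not part of this change): the usual way a raw
+ * character device builds its read entry on physio().  The strategy
+ * routine "mydev_strategy" is hypothetical; rw = 1 requests a read
+ * (B_READ) and bp = NULL lets physio() allocate the buffer header.
+ */
+static int
+mydev_read(dev_t dev, struct uio *uio, int ioflag)
+{
+
+	return (physio(mydev_strategy, NULL, dev, 1, minphys, uio));
+}
+#endif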
+
+u_int
+minphys(bp)
+ struct buf *bp;
+{
+ u_int maxphys = DFLTPHYS;
+ struct cdevsw *bdsw;
+
+ bdsw = cdevsw[major(bp->b_dev)];
+
+ if (bdsw && bdsw->d_maxio) {
+ maxphys = bdsw->d_maxio;
+ }
+ if (bp->b_kvasize && (bp->b_kvasize < maxphys))
+ maxphys = bp->b_kvasize;
+
+	if (((vm_offset_t)bp->b_data) & PAGE_MASK) {
+		maxphys -= PAGE_SIZE;
+	}
+
+	if (bp->b_bcount > maxphys) {
+		bp->b_bcount = maxphys;
+	}
+
+ return bp->b_bcount;
+}
+
+struct buf *
+phygetvpbuf(dev_t dev, int resid)
+{
+ struct cdevsw *bdsw;
+ int maxio;
+
+ bdsw = cdevsw[major(dev)];
+ if ((bdsw == NULL) || (bdsw->d_bmaj == -1))
+ return getpbuf(NULL);
+
+ maxio = bdsw->d_maxio;
+ if (resid > maxio)
+ resid = maxio;
+
+ return getpbuf(NULL);
+}
+
+static void
+physwakeup(bp)
+ struct buf *bp;
+{
+ wakeup((caddr_t) bp);
+ bp->b_flags &= ~B_CALL;
+}
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
new file mode 100644
index 0000000..0c6feac
--- /dev/null
+++ b/sys/kern/kern_proc.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
+ * $Id: kern_proc.c,v 1.42 1999/01/10 01:58:24 eivind Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/tty.h>
+#include <sys/signalvar.h>
+#include <vm/vm.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <vm/vm_zone.h>
+
+static MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
+MALLOC_DEFINE(M_SESSION, "session", "session header");
+static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
+MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
+
+struct prochd qs[NQS]; /* as good a place as any... */
+struct prochd rtqs[NQS]; /* Space for REALTIME queues too */
+struct prochd idqs[NQS]; /* Space for IDLE queues too */
+
+static void pgdelete __P((struct pgrp *));
+
+/*
+ * Structure associated with user caching.
+ */
+struct uidinfo {
+ LIST_ENTRY(uidinfo) ui_hash;
+ uid_t ui_uid;
+ long ui_proccnt;
+};
+#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
+static u_long uihash; /* size of hash table - 1 */
+
+static void orphanpg __P((struct pgrp *pg));
+
+/*
+ * Other process lists
+ */
+struct pidhashhead *pidhashtbl;
+u_long pidhash;
+struct pgrphashhead *pgrphashtbl;
+u_long pgrphash;
+struct proclist allproc;
+struct proclist zombproc;
+vm_zone_t proc_zone;
+
+/*
+ * Initialize global process hashing structures.
+ */
+void
+procinit()
+{
+
+ LIST_INIT(&allproc);
+ LIST_INIT(&zombproc);
+ pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
+ pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
+ uihashtbl = hashinit(maxproc / 16, M_PROC, &uihash);
+ proc_zone = zinit("PROC", sizeof (struct proc), 0, 0, 5);
+}
+
+/*
+ * Change the count associated with number of processes
+ * a given user is using.
+ */
+int
+chgproccnt(uid, diff)
+ uid_t uid;
+ int diff;
+{
+ register struct uidinfo *uip;
+ register struct uihashhead *uipp;
+
+ uipp = UIHASH(uid);
+ for (uip = uipp->lh_first; uip != 0; uip = uip->ui_hash.le_next)
+ if (uip->ui_uid == uid)
+ break;
+ if (uip) {
+ uip->ui_proccnt += diff;
+ if (uip->ui_proccnt > 0)
+ return (uip->ui_proccnt);
+ if (uip->ui_proccnt < 0)
+ panic("chgproccnt: procs < 0");
+ LIST_REMOVE(uip, ui_hash);
+ FREE(uip, M_PROC);
+ return (0);
+ }
+ if (diff <= 0) {
+ if (diff == 0)
+ return(0);
+ panic("chgproccnt: lost user");
+ }
+ MALLOC(uip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK);
+ LIST_INSERT_HEAD(uipp, uip, ui_hash);
+ uip->ui_uid = uid;
+ uip->ui_proccnt = diff;
+ return (diff);
+}
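+
+#if 0
+/*
+ * Illustrative call pattern (not part of this change): fork charges
+ * the child to the parent's real uid and can compare the new count
+ * against a per-uid limit; exit releases the slot.  "uid" and
+ * "uidproclimit" are hypothetical names here.
+ */
+	if (chgproccnt(uid, 1) > uidproclimit) {
+		(void)chgproccnt(uid, -1);	/* back out, refuse the fork */
+		return (EAGAIN);
+	}
+	/* ... and at exit: */
+	(void)chgproccnt(uid, -1);
+#endif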
+
+/*
+ * Is p an inferior of the current process?
+ */
+int
+inferior(p)
+ register struct proc *p;
+{
+
+ for (; p != curproc; p = p->p_pptr)
+ if (p->p_pid == 0)
+ return (0);
+ return (1);
+}
+
+/*
+ * Locate a process by number
+ */
+struct proc *
+pfind(pid)
+ register pid_t pid;
+{
+ register struct proc *p;
+
+ for (p = PIDHASH(pid)->lh_first; p != 0; p = p->p_hash.le_next)
+ if (p->p_pid == pid)
+ return (p);
+ return (NULL);
+}
+
+/*
+ * Locate a process group by number
+ */
+struct pgrp *
+pgfind(pgid)
+ register pid_t pgid;
+{
+ register struct pgrp *pgrp;
+
+ for (pgrp = PGRPHASH(pgid)->lh_first; pgrp != 0;
+ pgrp = pgrp->pg_hash.le_next)
+ if (pgrp->pg_id == pgid)
+ return (pgrp);
+ return (NULL);
+}
+
+/*
+ * Move p to a new or existing process group (and session)
+ */
+int
+enterpgrp(p, pgid, mksess)
+ register struct proc *p;
+ pid_t pgid;
+ int mksess;
+{
+ register struct pgrp *pgrp = pgfind(pgid);
+
+ KASSERT(pgrp == NULL || !mksess,
+ ("enterpgrp: setsid into non-empty pgrp"));
+ KASSERT(!SESS_LEADER(p),
+ ("enterpgrp: session leader attempted setpgrp"));
+
+ if (pgrp == NULL) {
+ pid_t savepid = p->p_pid;
+ struct proc *np;
+ /*
+ * new process group
+ */
+ KASSERT(p->p_pid == pgid,
+ ("enterpgrp: new pgrp and pid != pgid"));
+ MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
+ M_WAITOK);
+		if ((np = pfind(savepid)) == NULL || np != p) {
+			FREE(pgrp, M_PGRP);
+			return (ESRCH);
+		}
+ if (mksess) {
+ register struct session *sess;
+
+ /*
+ * new session
+ */
+ MALLOC(sess, struct session *, sizeof(struct session),
+ M_SESSION, M_WAITOK);
+ sess->s_leader = p;
+ sess->s_sid = p->p_pid;
+ sess->s_count = 1;
+ sess->s_ttyvp = NULL;
+ sess->s_ttyp = NULL;
+ bcopy(p->p_session->s_login, sess->s_login,
+ sizeof(sess->s_login));
+ p->p_flag &= ~P_CONTROLT;
+ pgrp->pg_session = sess;
+ KASSERT(p == curproc,
+ ("enterpgrp: mksession and p != curproc"));
+ } else {
+ pgrp->pg_session = p->p_session;
+ pgrp->pg_session->s_count++;
+ }
+ pgrp->pg_id = pgid;
+ LIST_INIT(&pgrp->pg_members);
+ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
+ pgrp->pg_jobc = 0;
+ SLIST_INIT(&pgrp->pg_sigiolst);
+ } else if (pgrp == p->p_pgrp)
+ return (0);
+
+ /*
+ * Adjust eligibility of affected pgrps to participate in job control.
+ * Increment eligibility counts before decrementing, otherwise we
+ * could reach 0 spuriously during the first call.
+ */
+ fixjobc(p, pgrp, 1);
+ fixjobc(p, p->p_pgrp, 0);
+
+ LIST_REMOVE(p, p_pglist);
+ if (p->p_pgrp->pg_members.lh_first == 0)
+ pgdelete(p->p_pgrp);
+ p->p_pgrp = pgrp;
+ LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
+ return (0);
+}
+
+/*
+ * remove process from process group
+ */
+int
+leavepgrp(p)
+ register struct proc *p;
+{
+
+ LIST_REMOVE(p, p_pglist);
+ if (p->p_pgrp->pg_members.lh_first == 0)
+ pgdelete(p->p_pgrp);
+ p->p_pgrp = 0;
+ return (0);
+}
+
+/*
+ * delete a process group
+ */
+static void
+pgdelete(pgrp)
+ register struct pgrp *pgrp;
+{
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pgid.
+ */
+ funsetownlst(&pgrp->pg_sigiolst);
+
+ if (pgrp->pg_session->s_ttyp != NULL &&
+ pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
+ pgrp->pg_session->s_ttyp->t_pgrp = NULL;
+ LIST_REMOVE(pgrp, pg_hash);
+ if (--pgrp->pg_session->s_count == 0)
+ FREE(pgrp->pg_session, M_SESSION);
+ FREE(pgrp, M_PGRP);
+}
+
+/*
+ * Adjust pgrp jobc counters when specified process changes process group.
+ * We count the number of processes in each process group that "qualify"
+ * the group for terminal job control (those with a parent in a different
+ * process group of the same session). If that count reaches zero, the
+ * process group becomes orphaned. Check both the specified process'
+ * process group and that of its children.
+ * entering == 0 => p is leaving specified group.
+ * entering == 1 => p is entering specified group.
+ */
+void
+fixjobc(p, pgrp, entering)
+ register struct proc *p;
+ register struct pgrp *pgrp;
+ int entering;
+{
+ register struct pgrp *hispgrp;
+ register struct session *mysession = pgrp->pg_session;
+
+ /*
+ * Check p's parent to see whether p qualifies its own process
+ * group; if so, adjust count for p's process group.
+ */
+ if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession)
+ if (entering)
+ pgrp->pg_jobc++;
+ else if (--pgrp->pg_jobc == 0)
+ orphanpg(pgrp);
+
+ /*
+ * Check this process' children to see whether they qualify
+ * their process groups; if so, adjust counts for children's
+ * process groups.
+ */
+ for (p = p->p_children.lh_first; p != 0; p = p->p_sibling.le_next)
+ if ((hispgrp = p->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession &&
+ p->p_stat != SZOMB)
+ if (entering)
+ hispgrp->pg_jobc++;
+ else if (--hispgrp->pg_jobc == 0)
+ orphanpg(hispgrp);
+}
+
+/*
+ * A process group has become orphaned;
+ * if there are any stopped processes in the group,
+ * hang-up all process in that group.
+ */
+static void
+orphanpg(pg)
+ struct pgrp *pg;
+{
+ register struct proc *p;
+
+ for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) {
+ if (p->p_stat == SSTOP) {
+ for (p = pg->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next) {
+ psignal(p, SIGHUP);
+ psignal(p, SIGCONT);
+ }
+ return;
+ }
+ }
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(pgrpdump, pgrpdump)
+{
+ register struct pgrp *pgrp;
+ register struct proc *p;
+ register int i;
+
+ for (i = 0; i <= pgrphash; i++) {
+		if ((pgrp = pgrphashtbl[i].lh_first) != NULL) {
+ printf("\tindx %d\n", i);
+ for (; pgrp != 0; pgrp = pgrp->pg_hash.le_next) {
+ printf(
+ "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
+ (void *)pgrp, (long)pgrp->pg_id,
+ (void *)pgrp->pg_session,
+ pgrp->pg_session->s_count,
+ (void *)pgrp->pg_members.lh_first);
+ for (p = pgrp->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next) {
+ printf("\t\tpid %ld addr %p pgrp %p\n",
+ (long)p->p_pid, (void *)p,
+ (void *)p->p_pgrp);
+ }
+ }
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Fill in an eproc structure for the specified process.
+ */
+void
+fill_eproc(p, ep)
+ register struct proc *p;
+ register struct eproc *ep;
+{
+ register struct tty *tp;
+
+ bzero(ep, sizeof(*ep));
+
+ ep->e_paddr = p;
+ if (p->p_cred) {
+ ep->e_pcred = *p->p_cred;
+ if (p->p_ucred)
+ ep->e_ucred = *p->p_ucred;
+ }
+#ifdef COMPAT_LINUX_THREADS
+ if (p->p_procsig){
+ ep->e_procsig = *p->p_procsig;
+ }
+#endif
+ if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) {
+ register struct vmspace *vm = p->p_vmspace;
+
+#ifdef pmap_resident_count
+ ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/
+#else
+ ep->e_vm.vm_rssize = vm->vm_rssize;
+#endif
+ ep->e_vm.vm_tsize = vm->vm_tsize;
+ ep->e_vm.vm_dsize = vm->vm_dsize;
+ ep->e_vm.vm_ssize = vm->vm_ssize;
+ ep->e_vm.vm_taddr = vm->vm_taddr;
+ ep->e_vm.vm_daddr = vm->vm_daddr;
+ ep->e_vm.vm_minsaddr = vm->vm_minsaddr;
+ ep->e_vm.vm_maxsaddr = vm->vm_maxsaddr;
+ ep->e_vm.vm_map = vm->vm_map;
+#ifndef sparc
+ ep->e_vm.vm_pmap = vm->vm_pmap;
+#endif
+ }
+ if (p->p_pptr)
+ ep->e_ppid = p->p_pptr->p_pid;
+ if (p->p_pgrp) {
+ ep->e_pgid = p->p_pgrp->pg_id;
+ ep->e_jobc = p->p_pgrp->pg_jobc;
+ ep->e_sess = p->p_pgrp->pg_session;
+
+ if (ep->e_sess) {
+ bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login));
+ if (ep->e_sess->s_ttyvp)
+ ep->e_flag = EPROC_CTTY;
+ if (p->p_session && SESS_LEADER(p))
+ ep->e_flag |= EPROC_SLEADER;
+ }
+ }
+ if ((p->p_flag & P_CONTROLT) &&
+ (ep->e_sess != NULL) &&
+ ((tp = ep->e_sess->s_ttyp) != NULL)) {
+ ep->e_tdev = tp->t_dev;
+ ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ ep->e_tsess = tp->t_session;
+ } else
+ ep->e_tdev = NODEV;
+ if (p->p_wmesg) {
+ strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN);
+ ep->e_wmesg[WMESGLEN] = 0;
+ }
+}
+
+static struct proc *
+zpfind(pid_t pid)
+{
+ struct proc *p;
+
+ for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next)
+ if (p->p_pid == pid)
+ return (p);
+ return (NULL);
+}
+
+
+static int
+sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb)
+{
+ struct eproc eproc;
+ int error;
+ pid_t pid = p->p_pid;
+
+ fill_eproc(p, &eproc);
+	error = SYSCTL_OUT(req, (caddr_t)p, sizeof(struct proc));
+	if (error)
+		return (error);
+	error = SYSCTL_OUT(req, (caddr_t)&eproc, sizeof(eproc));
+	if (error)
+		return (error);
+	if (!doingzomb && pid && (pfind(pid) != p))
+		return (EAGAIN);
+	if (doingzomb && zpfind(pid) != p)
+		return (EAGAIN);
+ return (0);
+}
+
+static int
+sysctl_kern_proc SYSCTL_HANDLER_ARGS
+{
+ int *name = (int*) arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int doingzomb;
+ int error = 0;
+
+ if (oidp->oid_number == KERN_PROC_PID) {
+ if (namelen != 1)
+ return (EINVAL);
+ p = pfind((pid_t)name[0]);
+ if (!p)
+ return (0);
+ error = sysctl_out_proc(p, req, 0);
+ return (error);
+ }
+ if (oidp->oid_number == KERN_PROC_ALL && !namelen)
+ ;
+ else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1)
+ ;
+ else
+ return (EINVAL);
+
+ if (!req->oldptr) {
+ /* overestimate by 5 procs */
+ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
+ if (error)
+ return (error);
+ }
+ for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
+ if (!doingzomb)
+ p = allproc.lh_first;
+ else
+ p = zombproc.lh_first;
+ for (; p != 0; p = p->p_list.le_next) {
+ /*
+ * Skip embryonic processes.
+ */
+ if (p->p_stat == SIDL)
+ continue;
+ /*
+ * TODO - make more efficient (see notes below).
+ * do by session.
+ */
+ switch (oidp->oid_number) {
+
+ case KERN_PROC_PGRP:
+ /* could do this by traversing pgrp */
+ if (p->p_pgrp == NULL ||
+ p->p_pgrp->pg_id != (pid_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_TTY:
+ if ((p->p_flag & P_CONTROLT) == 0 ||
+ p->p_session == NULL ||
+ p->p_session->s_ttyp == NULL ||
+ p->p_session->s_ttyp->t_dev != (dev_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_UID:
+ if (p->p_ucred == NULL ||
+ p->p_ucred->cr_uid != (uid_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_RUID:
+ if (p->p_ucred == NULL ||
+ p->p_cred->p_ruid != (uid_t)name[0])
+ continue;
+ break;
+ }
+
+ error = sysctl_out_proc(p, req, doingzomb);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+
+SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
+
+SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
+ 0, 0, sysctl_kern_proc, "S,proc", "");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
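+
+#if 0
+/*
+ * Illustrative userland sketch (not part of this change): fetching a
+ * single process through the pid node above.  Each record is a
+ * struct proc followed by a struct eproc, exactly as written by
+ * sysctl_out_proc(); error handling beyond err(3) is omitted.
+ */
+	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid() };
+	struct kinfo_proc kp;
+	size_t len = sizeof(kp);
+
+	if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1)
+		err(1, "sysctl");
+#endif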
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
new file mode 100644
index 0000000..e5e1a3e
--- /dev/null
+++ b/sys/kern/kern_prot.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
+ * $Id: kern_prot.c,v 1.42 1998/11/10 09:16:29 peter Exp $
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/pioctl.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getpid(p, uap)
+ struct proc *p;
+ struct getpid_args *uap;
+{
+
+ p->p_retval[0] = p->p_pid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ p->p_retval[1] = p->p_pptr->p_pid;
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+getppid(p, uap)
+ struct proc *p;
+ struct getppid_args *uap;
+{
+
+ p->p_retval[0] = p->p_pptr->p_pid;
+ return (0);
+}
+
+/* Get process group ID; note that POSIX getpgrp takes no parameter */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+ int dummy;
+};
+#endif
+
+int
+getpgrp(p, uap)
+ struct proc *p;
+ struct getpgrp_args *uap;
+{
+
+ p->p_retval[0] = p->p_pgrp->pg_id;
+ return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+ pid_t pid;
+};
+#endif
+
+int
+getpgid(p, uap)
+ struct proc *p;
+ struct getpgid_args *uap;
+{
+ struct proc *pt;
+
+ pt = p;
+ if (uap->pid == 0)
+ goto found;
+
+ if ((pt = pfind(uap->pid)) == 0)
+ return ESRCH;
+found:
+ p->p_retval[0] = pt->p_pgrp->pg_id;
+ return 0;
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+ pid_t pid;
+};
+#endif
+
+int
+getsid(p, uap)
+ struct proc *p;
+ struct getsid_args *uap;
+{
+ struct proc *pt;
+
+ pt = p;
+ if (uap->pid == 0)
+ goto found;
+
+	if ((pt = pfind(uap->pid)) == 0)
+ return ESRCH;
+found:
+ p->p_retval[0] = pt->p_session->s_sid;
+ return 0;
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getuid(p, uap)
+ struct proc *p;
+ struct getuid_args *uap;
+{
+
+ p->p_retval[0] = p->p_cred->p_ruid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ p->p_retval[1] = p->p_ucred->cr_uid;
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+geteuid(p, uap)
+ struct proc *p;
+ struct geteuid_args *uap;
+{
+
+ p->p_retval[0] = p->p_ucred->cr_uid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getgid(p, uap)
+ struct proc *p;
+ struct getgid_args *uap;
+{
+
+ p->p_retval[0] = p->p_cred->p_rgid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ p->p_retval[1] = p->p_ucred->cr_groups[0];
+#endif
+ return (0);
+}
+
+/*
+ * Get effective group ID. The "egid" is groups[0], and could be obtained
+ * via getgroups. This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getegid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getegid(p, uap)
+ struct proc *p;
+ struct getegid_args *uap;
+{
+
+ p->p_retval[0] = p->p_ucred->cr_groups[0];
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+int
+getgroups(p, uap)
+ struct proc *p;
+ register struct getgroups_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register u_int ngrp;
+ int error;
+
+ if ((ngrp = uap->gidsetsize) == 0) {
+ p->p_retval[0] = pc->pc_ucred->cr_ngroups;
+ return (0);
+ }
+ if (ngrp < pc->pc_ucred->cr_ngroups)
+ return (EINVAL);
+ ngrp = pc->pc_ucred->cr_ngroups;
+ if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups,
+ (caddr_t)uap->gidset, ngrp * sizeof(gid_t))))
+ return (error);
+ p->p_retval[0] = ngrp;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setsid_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+setsid(p, uap)
+ register struct proc *p;
+ struct setsid_args *uap;
+{
+
+ if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) {
+ return (EPERM);
+ } else {
+ (void)enterpgrp(p, p->p_pid, 1);
+ p->p_retval[0] = p->p_pid;
+ return (0);
+ }
+}
+
+/*
+ * set process group (setpgid/old setpgrp)
+ *
+ * caller does setpgid(targpid, targpgid)
+ *
+ * pid must be caller or child of caller (ESRCH)
+ * if a child
+ * pid must be in same session (EPERM)
+ * pid can't have done an exec (EACCES)
+ * if pgid != pid
+ * there must exist some pid in same session having pgid (EPERM)
+ * pid must not be session leader (EPERM)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setpgid_args {
+ int pid; /* target process id */
+ int pgid; /* target pgrp id */
+};
+#endif
+/* ARGSUSED */
+int
+setpgid(curp, uap)
+ struct proc *curp;
+ register struct setpgid_args *uap;
+{
+ register struct proc *targp; /* target process */
+ register struct pgrp *pgrp; /* target pgrp */
+
+ if (uap->pgid < 0)
+ return (EINVAL);
+ if (uap->pid != 0 && uap->pid != curp->p_pid) {
+ if ((targp = pfind(uap->pid)) == 0 || !inferior(targp))
+ return (ESRCH);
+ if (targp->p_pgrp == NULL || targp->p_session != curp->p_session)
+ return (EPERM);
+ if (targp->p_flag & P_EXEC)
+ return (EACCES);
+ } else
+ targp = curp;
+ if (SESS_LEADER(targp))
+ return (EPERM);
+ if (uap->pgid == 0)
+ uap->pgid = targp->p_pid;
+ else if (uap->pgid != targp->p_pid)
+ if ((pgrp = pgfind(uap->pgid)) == 0 ||
+ pgrp->pg_session != curp->p_session)
+ return (EPERM);
+ return (enterpgrp(targp, uap->pgid, 0));
+}
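+
+#if 0
+/*
+ * Illustrative userland sketch (not part of this change): the classic
+ * job-control dance implied by the rules above.  Both parent and
+ * child call setpgid() so the new group exists whichever runs first.
+ */
+	pid_t pid;
+
+	if ((pid = fork()) == 0) {
+		(void)setpgid(0, 0);		/* child: pgid == pid */
+		/* exec the job here */
+	} else
+		(void)setpgid(pid, pid);	/* parent: same group */
+#endif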
+
+/*
+ * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
+ * compatible. It says that setting the uid/gid to euid/egid is a special
+ * case of "appropriate privilege". Once the rules are expanded out, this
+ * basically means that setuid(nnn) sets all three id's, in all permitted
+ * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
+ * does not set the saved id - this is dangerous for traditional BSD
+ * programs. For this reason, we *really* do not want to set
+ * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
+ */
+#define POSIX_APPENDIX_B_4_2_2
+
+#ifndef _SYS_SYSPROTO_H_
+struct setuid_args {
+ uid_t uid;
+};
+#endif
+/* ARGSUSED */
+int
+setuid(p, uap)
+ struct proc *p;
+ struct setuid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register uid_t uid;
+ int error;
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setuid(geteuid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+	 * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setuid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * Notes on the logic. We do things in three steps.
+ * 1: We determine if the euid is going to change, and do EPERM
+ * right away. We unconditionally change the euid later if this
+ * test is satisfied, simplifying that part of the logic.
+ * 2: We determine if the real and/or saved uid's are going to
+ * change. Determined by compile options.
+ * 3: Change euid last. (after tests in #2 for "appropriate privs")
+ */
+ uid = uap->uid;
+ if (uid != pc->p_ruid && /* allow setuid(getuid()) */
+#ifdef _POSIX_SAVED_IDS
+ uid != pc->p_svuid && /* allow setuid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ uid != pc->pc_ucred->cr_uid && /* allow setuid(geteuid()) */
+#endif
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or uid == euid)
+ * If so, we are changing the real uid and/or saved uid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
+ uid == pc->pc_ucred->cr_uid ||
+#endif
+ suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */
+#endif
+ {
+ /*
+ * Transfer proc count to new user.
+ */
+ if (uid != pc->p_ruid) {
+ (void)chgproccnt(pc->p_ruid, -1);
+ (void)chgproccnt(uid, 1);
+ }
+ /*
+ * Set real uid
+ */
+ if (uid != pc->p_ruid) {
+ pc->p_ruid = uid;
+ setsugid(p);
+ }
+ /*
+ * Set saved uid
+ *
+ * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
+	 * the security of seteuid() depends on it. B.4.2.2 says it
+	 * is important that we do this.
+ */
+ if (pc->p_svuid != uid) {
+ pc->p_svuid = uid;
+ setsugid(p);
+ }
+ }
+
+ /*
+ * In all permitted cases, we are changing the euid.
+ * Copy credentials so other references do not see our changes.
+ */
+ if (pc->pc_ucred->cr_uid != uid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_uid = uid;
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct seteuid_args {
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+seteuid(p, uap)
+ struct proc *p;
+ struct seteuid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register uid_t euid;
+ int error;
+
+ euid = uap->euid;
+ if (euid != pc->p_ruid && /* allow seteuid(getuid()) */
+ euid != pc->p_svuid && /* allow seteuid(saved uid) */
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+ /*
+ * Everything's okay, do it. Copy credentials so other references do
+ * not see our changes.
+ */
+ if (pc->pc_ucred->cr_uid != euid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_uid = euid;
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgid_args {
+ gid_t gid;
+};
+#endif
+/* ARGSUSED */
+int
+setgid(p, uap)
+ struct proc *p;
+ struct setgid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register gid_t gid;
+ int error;
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setgid(getegid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+	 * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setgid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * For notes on the logic here, see setuid() above.
+ */
+ gid = uap->gid;
+ if (gid != pc->p_rgid && /* allow setgid(getgid()) */
+#ifdef _POSIX_SAVED_IDS
+ gid != pc->p_svgid && /* allow setgid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ gid != pc->pc_ucred->cr_groups[0] && /* allow setgid(getegid()) */
+#endif
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or gid == egid)
+	 * If so, we are changing the real gid and/or saved gid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
+ gid == pc->pc_ucred->cr_groups[0] ||
+#endif
+ suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */
+#endif
+ {
+ /*
+ * Set real gid
+ */
+ if (pc->p_rgid != gid) {
+ pc->p_rgid = gid;
+ setsugid(p);
+ }
+ /*
+ * Set saved gid
+ *
+ * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
+ * the security of setegid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (pc->p_svgid != gid) {
+ pc->p_svgid = gid;
+ setsugid(p);
+ }
+ }
+ /*
+	 * In all permitted cases, we are changing the egid.
+ * Copy credentials so other references do not see our changes.
+ */
+ if (pc->pc_ucred->cr_groups[0] != gid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_groups[0] = gid;
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setegid_args {
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+setegid(p, uap)
+ struct proc *p;
+ struct setegid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register gid_t egid;
+ int error;
+
+ egid = uap->egid;
+ if (egid != pc->p_rgid && /* allow setegid(getgid()) */
+ egid != pc->p_svgid && /* allow setegid(saved gid) */
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+ if (pc->pc_ucred->cr_groups[0] != egid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_groups[0] = egid;
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+/* ARGSUSED */
+int
+setgroups(p, uap)
+ struct proc *p;
+ struct setgroups_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register u_int ngrp;
+ int error;
+
+ if ((error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+ ngrp = uap->gidsetsize;
+ if (ngrp > NGROUPS)
+ return (EINVAL);
+ /*
+ * XXX A little bit lazy here. We could test if anything has
+ * changed before crcopy() and setting P_SUGID.
+ */
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ if (ngrp < 1) {
+ /*
+ * setgroups(0, NULL) is a legitimate way of clearing the
+ * groups vector on non-BSD systems (which generally do not
+ * have the egid in the groups[0]). We risk security holes
+ * when running non-BSD software if we do not do the same.
+ */
+ pc->pc_ucred->cr_ngroups = 1;
+ } else {
+ if ((error = copyin((caddr_t)uap->gidset,
+ (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t))))
+ return (error);
+ pc->pc_ucred->cr_ngroups = ngrp;
+ }
+ setsugid(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setreuid_args {
+ uid_t ruid;
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+setreuid(p, uap)
+ register struct proc *p;
+ struct setreuid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register uid_t ruid, euid;
+ int error;
+
+ ruid = uap->ruid;
+ euid = uap->euid;
+	if (((ruid != (uid_t)-1 && ruid != pc->p_ruid &&
+	    ruid != pc->p_svuid) ||
+	    (euid != (uid_t)-1 && euid != pc->pc_ucred->cr_uid &&
+	    euid != pc->p_ruid && euid != pc->p_svuid)) &&
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+ if (euid != (uid_t)-1 && pc->pc_ucred->cr_uid != euid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_uid = euid;
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && pc->p_ruid != ruid) {
+ (void)chgproccnt(pc->p_ruid, -1);
+ (void)chgproccnt(ruid, 1);
+ pc->p_ruid = ruid;
+ setsugid(p);
+ }
+ if ((ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid) &&
+ pc->p_svuid != pc->pc_ucred->cr_uid) {
+ pc->p_svuid = pc->pc_ucred->cr_uid;
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setregid_args {
+ gid_t rgid;
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+setregid(p, uap)
+ register struct proc *p;
+ struct setregid_args *uap;
+{
+ register struct pcred *pc = p->p_cred;
+ register gid_t rgid, egid;
+ int error;
+
+ rgid = uap->rgid;
+ egid = uap->egid;
+	if (((rgid != (gid_t)-1 && rgid != pc->p_rgid &&
+	    rgid != pc->p_svgid) ||
+	    (egid != (gid_t)-1 && egid != pc->pc_ucred->cr_groups[0] &&
+	    egid != pc->p_rgid && egid != pc->p_svgid)) &&
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+ if (egid != (gid_t)-1 && pc->pc_ucred->cr_groups[0] != egid) {
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ pc->pc_ucred->cr_groups[0] = egid;
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && pc->p_rgid != rgid) {
+ pc->p_rgid = rgid;
+ setsugid(p);
+ }
+ if ((rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid) &&
+ pc->p_svgid != pc->pc_ucred->cr_groups[0]) {
+ pc->p_svgid = pc->pc_ucred->cr_groups[0];
+ setsugid(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct issetugid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+issetugid(p, uap)
+ register struct proc *p;
+ struct issetugid_args *uap;
+{
+ /*
+	 * Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time;
+	 * we use P_SUGID because we consider a change of owners to be
+	 * "tainting" as well.
+ * This is significant for procs that start as root and "become"
+ * a user without an exec - programs cannot know *everything*
+ * that libc *might* have put in their data segment.
+ */
+ if (p->p_flag & P_SUGID)
+ return (1);
+ return (0);
+}
+
+/*
+ * Check if gid is a member of the group set.
+ */
+int
+groupmember(gid, cred)
+ gid_t gid;
+ register struct ucred *cred;
+{
+ register gid_t *gp;
+ gid_t *egp;
+
+ egp = &(cred->cr_groups[cred->cr_ngroups]);
+ for (gp = cred->cr_groups; gp < egp; gp++)
+ if (*gp == gid)
+ return (1);
+ return (0);
+}
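+
+#if 0	/* usage sketch (editor's note, hypothetical caller): */
+	if (!groupmember(vap->va_gid, p->p_ucred) &&
+	    (error = suser(p->p_ucred, &p->p_acflag)))
+		return (error);	/* not in the group and not superuser */
+#endif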
+
+/*
+ * Test whether the specified credentials imply "super-user"
+ * privilege; if so, and we have accounting info, set the flag
+ * indicating use of super-powers.
+ * Returns 0 or error.
+ */
+int
+suser(cred, acflag)
+ struct ucred *cred;
+ u_short *acflag;
+{
+ if (cred->cr_uid == 0) {
+ if (acflag)
+ *acflag |= ASU;
+ return (0);
+ }
+ return (EPERM);
+}
+
+/*
+ * Allocate a zeroed cred structure.
+ */
+struct ucred *
+crget()
+{
+ register struct ucred *cr;
+
+ MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK);
+ bzero((caddr_t)cr, sizeof(*cr));
+ cr->cr_ref = 1;
+ return (cr);
+}
+
+/*
+ * Free a cred structure.
+ * Throws away space when ref count gets to 0.
+ */
+void
+crfree(cr)
+ struct ucred *cr;
+{
+ if (--cr->cr_ref == 0)
+ FREE((caddr_t)cr, M_CRED);
+}
+
+/*
+ * Copy cred structure to a new one and free the old one.
+ */
+struct ucred *
+crcopy(cr)
+ struct ucred *cr;
+{
+ struct ucred *newcr;
+
+ if (cr->cr_ref == 1)
+ return (cr);
+ newcr = crget();
+ *newcr = *cr;
+ crfree(cr);
+ newcr->cr_ref = 1;
+ return (newcr);
+}
+
+/*
+ * Dup cred struct to a new held one.
+ */
+struct ucred *
+crdup(cr)
+ struct ucred *cr;
+{
+ struct ucred *newcr;
+
+ newcr = crget();
+ *newcr = *cr;
+ newcr->cr_ref = 1;
+ return (newcr);
+}
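+
+/*
+ * Editor's note: crcopy() consumes the caller's reference (and is a
+ * no-op when that reference is the last one), while crdup() always
+ * allocates and leaves the original reference held; hence "a new
+ * held one" above.
+ */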
+
+/*
+ * Get login name, if available.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getlogin_args {
+ char *namebuf;
+ u_int namelen;
+};
+#endif
+/* ARGSUSED */
+int
+getlogin(p, uap)
+ struct proc *p;
+ struct getlogin_args *uap;
+{
+
+ if (uap->namelen > MAXLOGNAME)
+ uap->namelen = MAXLOGNAME;
+ return (copyout((caddr_t) p->p_pgrp->pg_session->s_login,
+ (caddr_t) uap->namebuf, uap->namelen));
+}
+
+/*
+ * Set login name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setlogin_args {
+ char *namebuf;
+};
+#endif
+/* ARGSUSED */
+int
+setlogin(p, uap)
+ struct proc *p;
+ struct setlogin_args *uap;
+{
+ int error;
+ char logintmp[MAXLOGNAME];
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp,
+ sizeof(logintmp), (size_t *)0);
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+ else if (!error)
+ (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp,
+ sizeof(logintmp));
+ return (error);
+}
+
+void
+setsugid(p)
+ struct proc *p;
+{
+ p->p_flag |= P_SUGID;
+ if (!(p->p_pfsflags & PF_ISUGID))
+ p->p_stops = 0;
+}
diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c
new file mode 100644
index 0000000..d635668
--- /dev/null
+++ b/sys/kern/kern_random.c
@@ -0,0 +1,379 @@
+/*
+ * random_machdep.c -- A strong random number generator
+ *
+ * $Id: random_machdep.c,v 1.28 1998/06/18 15:32:07 bde Exp $
+ *
+ * Version 0.95, last modified 18-Oct-95
+ *
+ * Copyright Theodore Ts'o, 1994, 1995. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions. (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/select.h>
+#include <sys/poll.h>
+#include <sys/md5.h>
+
+#include <machine/random.h>
+
+#include <i386/isa/icu.h>
+#include <i386/isa/intr_machdep.h>
+
+#define MAX_BLKDEV 4
+
+/*
+ * The pool is stirred with a primitive polynomial of degree 128
+ * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1.
+ * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1.
+ */
+#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */
+#define POOLBITS (POOLWORDS*32)
+
+#if POOLWORDS == 128
+#define TAP1 99 /* The polynomial taps */
+#define TAP2 59
+#define TAP3 31
+#define TAP4 9
+#define TAP5 7
+#elif POOLWORDS == 64
+#define TAP1 62 /* The polynomial taps */
+#define TAP2 38
+#define TAP3 10
+#define TAP4 6
+#define TAP5 1
+#else
+#error No primitive polynomial available for chosen POOLWORDS
+#endif
+
+#define WRITEBUFFER 512 /* size in bytes */
+
+/* There is actually only one of these, globally. */
+struct random_bucket {
+ u_int add_ptr;
+ u_int entropy_count;
+ int input_rotate;
+ u_int32_t *pool;
+ struct selinfo rsel;
+};
+
+/* There is one of these per entropy source */
+struct timer_rand_state {
+ u_long last_time;
+ int last_delta;
+ int nbits;
+};
+
+static struct random_bucket random_state;
+static u_int32_t random_pool[POOLWORDS];
+static struct timer_rand_state keyboard_timer_state;
+static struct timer_rand_state extract_timer_state;
+static struct timer_rand_state irq_timer_state[ICU_LEN];
+#ifdef notyet
+static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV];
+#endif
+static struct wait_queue *random_wait;
+
+#ifndef MIN
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+void
+rand_initialize(void)
+{
+ random_state.add_ptr = 0;
+ random_state.entropy_count = 0;
+ random_state.pool = random_pool;
+ random_wait = NULL;
+ random_state.rsel.si_flags = 0;
+ random_state.rsel.si_pid = 0;
+}
+
+/*
+ * This function adds an int into the entropy "pool". It does not
+ * update the entropy estimate. The caller must do this if appropriate.
+ *
+ * The pool is stirred with a primitive polynomial of degree 128
+ * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1.
+ * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1.
+ *
+ * We rotate the input word by a changing number of bits, to help
+ * assure that all bits in the entropy get toggled. Otherwise, if we
+ * consistently feed the entropy pool small numbers (like ticks and
+ * scancodes, for example), the upper bits of the entropy pool don't
+ * get affected. --- TYT, 10/11/95
+ */
+static __inline void
+add_entropy_word(struct random_bucket *r, const u_int32_t input)
+{
+ u_int i;
+ u_int32_t w;
+
+ w = (input << r->input_rotate) | (input >> (32 - r->input_rotate));
+ i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1);
+ if (i)
+ r->input_rotate = (r->input_rotate + 7) & 31;
+ else
+ /*
+ * At the beginning of the pool, add an extra 7 bits
+ * rotation, so that successive passes spread the
+ * input bits across the pool evenly.
+ */
+ r->input_rotate = (r->input_rotate + 14) & 31;
+
+ /* XOR in the various taps */
+ w ^= r->pool[(i+TAP1)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP2)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP3)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP4)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP5)&(POOLWORDS-1)];
+ w ^= r->pool[i];
+ /* Rotate w left 1 bit (stolen from SHA) and store */
+ r->pool[i] = (w << 1) | (w >> 31);
+}
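+
+/*
+ * Worked illustration (editor's note): with POOLWORDS == 128 and
+ * add_ptr == 0 on entry, the call above stores into slot i == 127
+ * and XORs the rotated input with pool[98], pool[58], pool[30],
+ * pool[8], pool[6] (the (i+TAPn) & 127 positions) and pool[127]
+ * itself before the final 1-bit left rotate.
+ */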
+
+/*
+ * This function adds entropy to the entropy "pool" by using timing
+ * delays. It uses the timer_rand_state structure to make an estimate
+ * of how many bits of entropy this call has added to the pool.
+ *
+ * The number "num" is also added to the pool - it should somehow describe
+ * the type of event which just happened. This is currently 0-255 for
+ * keyboard scan codes, and 256 upwards for interrupts.
+ * On the i386, this is assumed to be at most 16 bits, and the high bits
+ * are used for a high-resolution timer.
+ */
+static void
+add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state,
+ u_int num)
+{
+ int delta, delta2;
+ u_int nbits;
+ u_int32_t time;
+
+ num ^= timecounter->tc_get_timecount(timecounter) << 16;
+ r->entropy_count += 2;
+
+ time = ticks;
+
+ add_entropy_word(r, (u_int32_t) num);
+ add_entropy_word(r, time);
+
+ /*
+ * Calculate number of bits of randomness we probably
+ * added. We take into account the first and second order
+ * deltas in order to make our estimate.
+ */
+ delta = time - state->last_time;
+ state->last_time = time;
+
+ delta2 = delta - state->last_delta;
+ state->last_delta = delta;
+
+ if (delta < 0) delta = -delta;
+ if (delta2 < 0) delta2 = -delta2;
+ delta = MIN(delta, delta2) >> 1;
+ for (nbits = 0; delta; nbits++)
+ delta >>= 1;
+
+ r->entropy_count += nbits;
+
+ /* Prevent overflow */
+ if (r->entropy_count > POOLBITS)
+ r->entropy_count = POOLBITS;
+
+ if (r->entropy_count >= 8)
+ selwakeup(&random_state.rsel);
+}
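+
+/*
+ * Worked estimate (editor's note, hypothetical timings): if the last
+ * two inter-event gaps were 13 and 3 ticks, then delta == 3 and
+ * delta2 == 10, so MIN(3, 10) >> 1 == 1 and nbits == 1; the event is
+ * credited one bit of entropy on top of the flat 2 counted above.
+ */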
+
+void
+add_keyboard_randomness(u_char scancode)
+{
+ add_timer_randomness(&random_state, &keyboard_timer_state, scancode);
+}
+
+void
+add_interrupt_randomness(void *vsc)
+{
+ int intr;
+ struct random_softc *sc = vsc;
+
+ (sc->sc_handler)(sc->sc_arg);
+ intr = sc->sc_intr;
+ add_timer_randomness(&random_state, &irq_timer_state[intr], intr);
+}
+
+#ifdef notused
+void
+add_blkdev_randomness(int major)
+{
+ if (major >= MAX_BLKDEV)
+ return;
+
+ add_timer_randomness(&random_state, &blkdev_timer_state[major],
+ 0x200+major);
+}
+#endif /* notused */
+
+#if POOLWORDS % 16
+#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words.
+#endif
+/*
+ * This function extracts randomness from the "entropy pool", and
+ * returns it in a buffer. This function computes how many remaining
+ * bits of entropy are left in the pool, but it does not restrict the
+ * number of bytes that are actually obtained.
+ */
+static __inline int
+extract_entropy(struct random_bucket *r, char *buf, int nbytes)
+{
+ int ret, i;
+ u_int32_t tmp[4];
+
+ add_timer_randomness(r, &extract_timer_state, nbytes);
+
+ /* Redundant, but just in case... */
+ if (r->entropy_count > POOLBITS)
+ r->entropy_count = POOLBITS;
+ /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */
+ if (nbytes > 32768)
+ nbytes = 32768;
+
+ ret = nbytes;
+ if (r->entropy_count / 8 >= nbytes)
+ r->entropy_count -= nbytes*8;
+ else
+ r->entropy_count = 0;
+
+ while (nbytes) {
+ /* Hash the pool to get the output */
+ tmp[0] = 0x67452301;
+ tmp[1] = 0xefcdab89;
+ tmp[2] = 0x98badcfe;
+ tmp[3] = 0x10325476;
+ for (i = 0; i < POOLWORDS; i += 16)
+ MD5Transform(tmp, (char *)(r->pool+i));
+ /* Modify pool so next hash will produce different results */
+ add_entropy_word(r, tmp[0]);
+ add_entropy_word(r, tmp[1]);
+ add_entropy_word(r, tmp[2]);
+ add_entropy_word(r, tmp[3]);
+ /*
+ * Run the MD5 Transform one more time, since we want
+ * to add at least minimal obscuring of the inputs to
+ * add_entropy_word(). --- TYT
+ */
+ MD5Transform(tmp, (char *)(r->pool));
+
+ /* Copy data to destination buffer */
+ i = MIN(nbytes, 16);
+ bcopy(tmp, buf, i);
+ nbytes -= i;
+ buf += i;
+ }
+
+ /* Wipe data from memory */
+ bzero(tmp, sizeof(tmp));
+
+ return ret;
+}
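+
+/*
+ * Editor's note: each pass of the loop above runs POOLWORDS/16 calls
+ * to MD5Transform() over the whole pool, feeds the 16-byte result
+ * back in via add_entropy_word(), runs one further transform, and
+ * emits at most 16 bytes, so a request for N bytes costs roughly
+ * (N/16) * (POOLWORDS/16 + 1) transforms.
+ */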
+
+#ifdef notused /* XXX NOT the exported kernel interface */
+/*
+ * This function is the exported kernel interface. It returns some
+ * number of good random numbers, suitable for seeding TCP sequence
+ * numbers, etc.
+ */
+void
+get_random_bytes(void *buf, u_int nbytes)
+{
+ extract_entropy(&random_state, (char *) buf, nbytes);
+}
+#endif /* notused */
+
+u_int
+read_random(void *buf, u_int nbytes)
+{
+ if ((nbytes * 8) > random_state.entropy_count)
+ nbytes = random_state.entropy_count / 8;
+
+ return extract_entropy(&random_state, (char *)buf, nbytes);
+}
+
+u_int
+read_random_unlimited(void *buf, u_int nbytes)
+{
+ return extract_entropy(&random_state, (char *)buf, nbytes);
+}
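+
+#if 0	/* usage sketch (editor's note, not compiled): */
+	char seed[16];
+	u_int got;
+
+	/* Take at most what the entropy estimate will cover. */
+	got = read_random(seed, sizeof(seed));
+	/* Or insist on sizeof(seed) bytes even if the estimate is low. */
+	got = read_random_unlimited(seed, sizeof(seed));
+#endif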
+
+#ifdef notused
+u_int
+write_random(const char *buf, u_int nbytes)
+{
+ u_int i;
+ u_int32_t word, *p;
+
+ for (i = nbytes, p = (u_int32_t *)buf;
+ i >= sizeof(u_int32_t);
+ i-= sizeof(u_int32_t), p++)
+ add_entropy_word(&random_state, *p);
+ if (i) {
+ word = 0;
+ bcopy(p, &word, i);
+ add_entropy_word(&random_state, word);
+ }
+ return nbytes;
+}
+#endif /* notused */
+
+int
+random_poll(dev_t dev, int events, struct proc *p)
+{
+ int s;
+ int revents = 0;
+
+ s = splhigh();
+	if (events & (POLLIN | POLLRDNORM)) {
+		if (random_state.entropy_count >= 8)
+			revents |= events & (POLLIN | POLLRDNORM);
+		else
+			selrecord(p, &random_state.rsel);
+	}
+
+ splx(s);
+ if (events & (POLLOUT | POLLWRNORM))
+ revents |= events & (POLLOUT | POLLWRNORM); /* heh */
+
+ return (revents);
+}
+
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
new file mode 100644
index 0000000..1bad1d2
--- /dev/null
+++ b/sys/kern/kern_resource.c
@@ -0,0 +1,623 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_resource.c,v 1.37 1998/05/28 09:30:18 phk Exp $
+ */
+
+#include "opt_compat.h"
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/resourcevar.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static int donice __P((struct proc *curp, struct proc *chgp, int n));
+static int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp));
+
+/*
+ * Resource controls and accounting.
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpriority_args {
+ int which;
+ int who;
+};
+#endif
+int
+getpriority(curp, uap)
+ struct proc *curp;
+ register struct getpriority_args *uap;
+{
+ register struct proc *p;
+ register int low = PRIO_MAX + 1;
+
+ switch (uap->which) {
+
+ case PRIO_PROCESS:
+ if (uap->who == 0)
+ p = curp;
+ else
+ p = pfind(uap->who);
+ if (p == 0)
+ break;
+ low = p->p_nice;
+ break;
+
+ case PRIO_PGRP: {
+ register struct pgrp *pg;
+
+ if (uap->who == 0)
+ pg = curp->p_pgrp;
+ else if ((pg = pgfind(uap->who)) == NULL)
+ break;
+ for (p = pg->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next) {
+ if (p->p_nice < low)
+ low = p->p_nice;
+ }
+ break;
+ }
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = curp->p_ucred->cr_uid;
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next)
+ if (p->p_ucred->cr_uid == uap->who &&
+ p->p_nice < low)
+ low = p->p_nice;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (low == PRIO_MAX + 1)
+ return (ESRCH);
+ curp->p_retval[0] = low;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setpriority_args {
+ int which;
+ int who;
+ int prio;
+};
+#endif
+/* ARGSUSED */
+int
+setpriority(curp, uap)
+ struct proc *curp;
+ register struct setpriority_args *uap;
+{
+ register struct proc *p;
+ int found = 0, error = 0;
+
+ switch (uap->which) {
+
+ case PRIO_PROCESS:
+ if (uap->who == 0)
+ p = curp;
+ else
+ p = pfind(uap->who);
+ if (p == 0)
+ break;
+ error = donice(curp, p, uap->prio);
+ found++;
+ break;
+
+ case PRIO_PGRP: {
+ register struct pgrp *pg;
+
+ if (uap->who == 0)
+ pg = curp->p_pgrp;
+ else if ((pg = pgfind(uap->who)) == NULL)
+ break;
+ for (p = pg->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next) {
+ error = donice(curp, p, uap->prio);
+ found++;
+ }
+ break;
+ }
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = curp->p_ucred->cr_uid;
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next)
+ if (p->p_ucred->cr_uid == uap->who) {
+ error = donice(curp, p, uap->prio);
+ found++;
+ }
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (found == 0)
+ return (ESRCH);
+ return (error);
+}
+
+static int
+donice(curp, chgp, n)
+ register struct proc *curp, *chgp;
+ register int n;
+{
+ register struct pcred *pcred = curp->p_cred;
+
+ if (pcred->pc_ucred->cr_uid && pcred->p_ruid &&
+ pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid &&
+ pcred->p_ruid != chgp->p_ucred->cr_uid)
+ return (EPERM);
+ if (n > PRIO_MAX)
+ n = PRIO_MAX;
+ if (n < PRIO_MIN)
+ n = PRIO_MIN;
+ if (n < chgp->p_nice && suser(pcred->pc_ucred, &curp->p_acflag))
+ return (EACCES);
+ chgp->p_nice = n;
+ (void)resetpriority(chgp);
+ return (0);
+}
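+
+/*
+ * Editor's note: the net effect of the checks above is that an
+ * unprivileged process may only move a nice value upward (toward
+ * PRIO_MAX); lowering one requires superuser privilege, and the uid
+ * checks restrict which processes may be reniced at all.
+ */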
+
+/* rtprio system call */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_args {
+ int function;
+ pid_t pid;
+ struct rtprio *rtp;
+};
+#endif
+
+/*
+ * Set or look up a process's realtime priority
+ */
+
+/* ARGSUSED */
+int
+rtprio(curp, uap)
+ struct proc *curp;
+ register struct rtprio_args *uap;
+{
+ register struct proc *p;
+ register struct pcred *pcred = curp->p_cred;
+ struct rtprio rtp;
+ int error;
+
+ error = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ if (error)
+ return (error);
+
+ if (uap->pid == 0)
+ p = curp;
+ else
+ p = pfind(uap->pid);
+
+ if (p == 0)
+ return (ESRCH);
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if (pcred->pc_ucred->cr_uid && pcred->p_ruid &&
+ pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid &&
+ pcred->p_ruid != p->p_ucred->cr_uid)
+ return (EPERM);
+ /* disallow setting rtprio in most cases if not superuser */
+ if (suser(pcred->pc_ucred, &curp->p_acflag)) {
+ /* can't set someone else's */
+ if (uap->pid)
+ return (EPERM);
+ /* can't set realtime priority */
+/*
+ * Realtime priority has to be restricted for reasons which should be
+ * obvious. However, for idle priority, there is a potential for
+ * system deadlock if an idleprio process gains a lock on a resource
+ * that other processes need (and the idleprio process can't run
+ * due to a CPU-bound normal process). Fix me! XXX
+ */
+#if 0
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+#endif
+ if (rtp.type != RTP_PRIO_NORMAL)
+ return (EPERM);
+ }
+ switch (rtp.type) {
+#ifdef RTP_PRIO_FIFO
+ case RTP_PRIO_FIFO:
+#endif
+ case RTP_PRIO_REALTIME:
+ case RTP_PRIO_NORMAL:
+ case RTP_PRIO_IDLE:
+ if (rtp.prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ p->p_rtprio = rtp;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+
+ default:
+ return (EINVAL);
+ }
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+osetrlimit(p, uap)
+ struct proc *p;
+ register struct osetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit lim;
+ int error;
+
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit))))
+ return (error);
+ lim.rlim_cur = olim.rlim_cur;
+ lim.rlim_max = olim.rlim_max;
+ return (dosetrlimit(p, uap->which, &lim));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+ogetrlimit(p, uap)
+ struct proc *p;
+ register struct ogetrlimit_args *uap;
+{
+ struct orlimit olim;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur;
+ if (olim.rlim_cur == -1)
+ olim.rlim_cur = 0x7fffffff;
+ olim.rlim_max = p->p_rlimit[uap->which].rlim_max;
+ if (olim.rlim_max == -1)
+ olim.rlim_max = 0x7fffffff;
+ return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim)));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifndef _SYS_SYSPROTO_H_
+struct __setrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+setrlimit(p, uap)
+ struct proc *p;
+ register struct __setrlimit_args *uap;
+{
+ struct rlimit alim;
+ int error;
+
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit))))
+ return (error);
+ return (dosetrlimit(p, uap->which, &alim));
+}
+
+static int
+dosetrlimit(p, which, limp)
+ struct proc *p;
+ u_int which;
+ struct rlimit *limp;
+{
+ register struct rlimit *alimp;
+ int error;
+
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+ alimp = &p->p_rlimit[which];
+
+ /*
+ * Preserve historical bugs by treating negative limits as unsigned.
+ */
+ if (limp->rlim_cur < 0)
+ limp->rlim_cur = RLIM_INFINITY;
+ if (limp->rlim_max < 0)
+ limp->rlim_max = RLIM_INFINITY;
+
+ if (limp->rlim_cur > alimp->rlim_max ||
+ limp->rlim_max > alimp->rlim_max)
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ if (limp->rlim_cur > limp->rlim_max)
+ limp->rlim_cur = limp->rlim_max;
+ if (p->p_limit->p_refcnt > 1 &&
+ (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
+ p->p_limit->p_refcnt--;
+ p->p_limit = limcopy(p->p_limit);
+ alimp = &p->p_rlimit[which];
+ }
+
+ switch (which) {
+
+ case RLIMIT_CPU:
+ if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000)
+ p->p_limit->p_cpulimit = RLIM_INFINITY;
+ else
+ p->p_limit->p_cpulimit =
+ (rlim_t)1000000 * limp->rlim_cur;
+ break;
+ case RLIMIT_DATA:
+ if (limp->rlim_cur > MAXDSIZ)
+ limp->rlim_cur = MAXDSIZ;
+ if (limp->rlim_max > MAXDSIZ)
+ limp->rlim_max = MAXDSIZ;
+ break;
+
+ case RLIMIT_STACK:
+ if (limp->rlim_cur > MAXSSIZ)
+ limp->rlim_cur = MAXSSIZ;
+ if (limp->rlim_max > MAXSSIZ)
+ limp->rlim_max = MAXSSIZ;
+ /*
+ * Stack is allocated to the max at exec time with only
+ * "rlim_cur" bytes accessible. If stack limit is going
+ * up make more accessible, if going down make inaccessible.
+ */
+ if (limp->rlim_cur != alimp->rlim_cur) {
+ vm_offset_t addr;
+ vm_size_t size;
+ vm_prot_t prot;
+
+ if (limp->rlim_cur > alimp->rlim_cur) {
+ prot = VM_PROT_ALL;
+ size = limp->rlim_cur - alimp->rlim_cur;
+ addr = USRSTACK - limp->rlim_cur;
+ } else {
+ prot = VM_PROT_NONE;
+ size = alimp->rlim_cur - limp->rlim_cur;
+ addr = USRSTACK - alimp->rlim_cur;
+ }
+ addr = trunc_page(addr);
+ size = round_page(size);
+ (void) vm_map_protect(&p->p_vmspace->vm_map,
+ addr, addr+size, prot, FALSE);
+ }
+ break;
+
+ case RLIMIT_NOFILE:
+ if (limp->rlim_cur > maxfilesperproc)
+ limp->rlim_cur = maxfilesperproc;
+ if (limp->rlim_max > maxfilesperproc)
+ limp->rlim_max = maxfilesperproc;
+ break;
+
+ case RLIMIT_NPROC:
+ if (limp->rlim_cur > maxprocperuid)
+ limp->rlim_cur = maxprocperuid;
+ if (limp->rlim_max > maxprocperuid)
+ limp->rlim_max = maxprocperuid;
+ break;
+ }
+ *alimp = *limp;
+ return (0);
+}
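+
+/*
+ * Worked example (editor's note): a setrlimit(RLIMIT_CPU) call with
+ * rlim_cur == 2 stores p_cpulimit == 2000000; the CPU limit is kept
+ * internally in microseconds, and values whose conversion would
+ * overflow are clamped to RLIM_INFINITY above.
+ */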
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+getrlimit(p, uap)
+ struct proc *p;
+ register struct __getrlimit_args *uap;
+{
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp,
+ sizeof (struct rlimit)));
+}
+
+/*
+ * Transform the running time and tick information in proc p into user,
+ * system, and interrupt time usage.
+ */
+void
+calcru(p, up, sp, ip)
+ struct proc *p;
+ struct timeval *up;
+ struct timeval *sp;
+ struct timeval *ip;
+{
+ int64_t totusec;
+ u_int64_t u, st, ut, it, tot;
+ int s;
+ struct timeval tv;
+
+ /* XXX: why spl-protect ? worst case is an off-by-one report */
+ s = splstatclock();
+ st = p->p_sticks;
+ ut = p->p_uticks;
+ it = p->p_iticks;
+ splx(s);
+
+ tot = st + ut + it;
+ if (tot == 0) {
+ st = 1;
+ tot = 1;
+ }
+
+ totusec = p->p_runtime;
+#ifdef SMP
+ if (p->p_oncpu != (char)0xff) {
+#else
+ if (p == curproc) {
+#endif
+ /*
+ * Adjust for the current time slice. This is actually fairly
+ * important since the error here is on the order of a time
+ * quantum, which is much greater than the sampling error.
+ */
+ microuptime(&tv);
+ totusec += (tv.tv_usec - p->p_switchtime.tv_usec) +
+ (tv.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000;
+
+ /*
+ * Copy the time that was just read to `switchtime' in case
+ * we are being called from exit1(). Exits don't go through
+ * mi_switch(), so `switchtime' doesn't get set in the normal
+ * way. We set it here instead of more cleanly in exit1()
+ * to avoid losing track of the time between the calls to
+ * microuptime().
+ */
+ switchtime = tv;
+ }
+ if (totusec < 0) {
+ /* XXX no %qd in kernel. Truncate. */
+ printf("calcru: negative time of %ld usec for pid %d (%s)\n",
+ (long)totusec, p->p_pid, p->p_comm);
+ totusec = 0;
+ }
+ u = totusec;
+ st = (u * st) / tot;
+ sp->tv_sec = st / 1000000;
+ sp->tv_usec = st % 1000000;
+ ut = (u * ut) / tot;
+ up->tv_sec = ut / 1000000;
+ up->tv_usec = ut % 1000000;
+ if (ip != NULL) {
+ it = (u * it) / tot;
+ ip->tv_sec = it / 1000000;
+ ip->tv_usec = it % 1000000;
+ }
+}
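+
+/*
+ * Worked example (editor's note, hypothetical counts): with
+ * p_runtime == 5000000 usec, st == 2, ut == 3 and it == 0, tot == 5
+ * and the five seconds are split pro rata: 2 seconds of system time
+ * and 3 seconds of user time.
+ */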
+
+#ifndef _SYS_SYSPROTO_H_
+struct getrusage_args {
+ int who;
+ struct rusage *rusage;
+};
+#endif
+/* ARGSUSED */
+int
+getrusage(p, uap)
+ register struct proc *p;
+ register struct getrusage_args *uap;
+{
+ register struct rusage *rup;
+
+ switch (uap->who) {
+
+ case RUSAGE_SELF:
+ rup = &p->p_stats->p_ru;
+ calcru(p, &rup->ru_utime, &rup->ru_stime, NULL);
+ break;
+
+ case RUSAGE_CHILDREN:
+ rup = &p->p_stats->p_cru;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ return (copyout((caddr_t)rup, (caddr_t)uap->rusage,
+ sizeof (struct rusage)));
+}
+
+void
+ruadd(ru, ru2)
+ register struct rusage *ru, *ru2;
+{
+ register long *ip, *ip2;
+ register int i;
+
+ timevaladd(&ru->ru_utime, &ru2->ru_utime);
+ timevaladd(&ru->ru_stime, &ru2->ru_stime);
+ if (ru->ru_maxrss < ru2->ru_maxrss)
+ ru->ru_maxrss = ru2->ru_maxrss;
+ ip = &ru->ru_first; ip2 = &ru2->ru_first;
+ for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
+ *ip++ += *ip2++;
+}
+
+/*
+ * Make a copy of the plimit structure.
+ * We share these structures copy-on-write after fork,
+ * and copy when a limit is changed.
+ */
+struct plimit *
+limcopy(lim)
+ struct plimit *lim;
+{
+ register struct plimit *copy;
+
+ MALLOC(copy, struct plimit *, sizeof(struct plimit),
+ M_SUBPROC, M_WAITOK);
+	bcopy(lim->pl_rlimit, copy->pl_rlimit,
+	    sizeof(struct rlimit) * RLIM_NLIMITS);
+ copy->p_lflags = 0;
+ copy->p_refcnt = 1;
+ return (copy);
+}
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
new file mode 100644
index 0000000..4d6db41
--- /dev/null
+++ b/sys/kern/kern_shutdown.c
@@ -0,0 +1,530 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
+ * $Id: kern_shutdown.c,v 1.43 1998/12/04 22:54:51 archie Exp $
+ */
+
+#include "opt_ddb.h"
+#include "opt_hw_wdog.h"
+#include "opt_panic.h"
+#include "opt_show_busybufs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/reboot.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/sysproto.h>
+
+#include <machine/pcb.h>
+#include <machine/clock.h>
+#include <machine/cons.h>
+#include <machine/md_var.h>
+#ifdef SMP
+#include <machine/smp.h> /* smp_active, cpuid */
+#endif
+
+#include <sys/signalvar.h>
+
+#ifndef PANIC_REBOOT_WAIT_TIME
+#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro is used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#ifdef DDB
+#ifdef DDB_UNATTENDED
+int debugger_on_panic = 0;
+#else
+int debugger_on_panic = 1;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW,
+ &debugger_on_panic, 0, "");
+#endif
+
+#ifdef HW_WDOG
+/*
+ * If there is a hardware watchdog, point this at the function needed to
+ * hold it off.
+ * It's needed when the kernel needs to do some lengthy operations.
+ * e.g. in wd.c when dumping core.. It's most annoying to have
+ * your precious core-dump only half written because the wdog kicked in.
+ */
+watchdog_tickle_fn wdog_tickler = NULL;
+#endif /* HW_WDOG */
+
+/*
+ * Variable panicstr contains argument to first call to panic; used as flag
+ * to indicate that the kernel has already called panic.
+ */
+const char *panicstr;
+
+/*
+ * callout list for things to do a shutdown
+ */
+typedef struct shutdown_list_element {
+ LIST_ENTRY(shutdown_list_element) links;
+ bootlist_fn function;
+ void *arg;
+ int priority;
+} *sle_p;
+
+/*
+ * There are three shutdown lists. Some things need to be shut down
+ * earlier than others.
+ */
+LIST_HEAD(shutdown_list, shutdown_list_element);
+
+static struct shutdown_list shutdown_lists[SHUTDOWN_FINAL + 1];
+
+static void boot __P((int)) __dead2;
+static void dumpsys __P((void));
+
+#ifndef _SYS_SYSPROTO_H_
+struct reboot_args {
+ int opt;
+};
+#endif
+/* ARGSUSED */
+
+/*
+ * The system call that results in a reboot
+ */
+int
+reboot(p, uap)
+ struct proc *p;
+ struct reboot_args *uap;
+{
+ int error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+
+ boot(uap->opt);
+ return (0);
+}
+
+/*
+ * Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC
+ */
+void
+shutdown_nice()
+{
+	/* Send a signal to init(8) and have it shut down the world */
+ if (initproc != NULL) {
+ psignal(initproc, SIGINT);
+ } else {
+ /* No init(8) running, so simply reboot */
+ boot(RB_NOSYNC);
+ }
+ return;
+}
+
+static int waittime = -1;
+static struct pcb dumppcb;
+
+/*
+ * Go through the rigmarole of shutting down...
+ * This used to be in machdep.c, but I'll be damned if I could see
+ * anything machine dependent in it.
+ */
+static void
+boot(howto)
+ int howto;
+{
+ sle_p ep;
+
+#ifdef SMP
+ if (smp_active) {
+ printf("boot() called on cpu#%d\n", cpuid);
+ }
+#endif
+ /*
+ * Do any callouts that should be done BEFORE syncing the filesystems.
+ */
+ LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_PRE_SYNC], links)
+ (*ep->function)(howto, ep->arg);
+
+ /*
+ * Now sync filesystems
+ */
+ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
+ register struct buf *bp;
+ int iter, nbusy;
+
+ waittime = 0;
+ printf("\nsyncing disks... ");
+
+ sync(&proc0, NULL);
+
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ for (iter = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if ((bp->b_flags & (B_BUSY | B_INVAL))
+ == B_BUSY) {
+ nbusy++;
+ } else if ((bp->b_flags & (B_DELWRI | B_INVAL))
+ == B_DELWRI) {
+ /* bawrite(bp);*/
+ nbusy++;
+ }
+ }
+ if (nbusy == 0)
+ break;
+ printf("%d ", nbusy);
+ sync(&proc0, NULL);
+ DELAY(50000 * iter);
+ }
+ /*
+ * Count only busy local buffers to prevent forcing
+ * a fsck if we're just a client of a wedged NFS server
+ */
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if (((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY)
+ ||((bp->b_flags & (B_DELWRI | B_INVAL))== B_DELWRI))
+ if(bp->b_dev == NODEV)
+ CIRCLEQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list);
+ else
+ nbusy++;
+
+
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("giving up\n");
+#ifdef SHOW_BUSYBUFS
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if ((bp->b_flags & (B_BUSY | B_INVAL))
+ == B_BUSY) {
+ nbusy++;
+ printf(
+ "%d: dev:%08lx, flags:%08lx, blkno:%ld, lblkno:%ld\n",
+ nbusy, (u_long)bp->b_dev,
+ bp->b_flags, (long)bp->b_blkno,
+ (long)bp->b_lblkno);
+ }
+ }
+ DELAY(5000000); /* 5 seconds */
+#endif
+ } else {
+ printf("done\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == 0)
+ vfs_unmountall();
+ }
+ DELAY(100000); /* wait for console output to finish */
+ }
+
+ /*
+ * Ok, now do things that assume all filesystem activity has
+ * been completed.
+ */
+ LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_POST_SYNC], links)
+ (*ep->function)(howto, ep->arg);
+ splhigh();
+ if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold) {
+ savectx(&dumppcb);
+#ifdef __i386__
+ dumppcb.pcb_cr3 = rcr3();
+#endif
+ dumpsys();
+ }
+
+ /* Now that we're going to really halt the system... */
+ LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_FINAL], links)
+ (*ep->function)(howto, ep->arg);
+
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+ switch (cngetc()) {
+ case -1: /* No console, just die */
+ cpu_halt();
+ /* NOTREACHED */
+ default:
+ howto &= ~RB_HALT;
+ break;
+ }
+ } else if (howto & RB_DUMP) {
+		/* System panicked */
+
+ if (PANIC_REBOOT_WAIT_TIME != 0) {
+ if (PANIC_REBOOT_WAIT_TIME != -1) {
+ int loop;
+ printf("Automatic reboot in %d seconds - "
+ "press a key on the console to abort\n",
+ PANIC_REBOOT_WAIT_TIME);
+ for (loop = PANIC_REBOOT_WAIT_TIME * 10;
+ loop > 0; --loop) {
+ DELAY(1000 * 100); /* 1/10th second */
+ /* Did user type a key? */
+ if (cncheckc() != -1)
+ break;
+ }
+ if (!loop)
+ goto die;
+ }
+ } else { /* zero time specified - reboot NOW */
+ goto die;
+ }
+ printf("--> Press a key on the console to reboot <--\n");
+ cngetc();
+ }
+die:
+ printf("Rebooting...\n");
+ DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
+ /* cpu_boot(howto); */ /* doesn't do anything at the moment */
+ cpu_reset();
+ for(;;) ;
+ /* NOTREACHED */
+}
+
+/*
+ * Magic number for savecore
+ *
+ * exported (symorder) and used at least by savecore(8)
+ *
+ */
+static u_long const dumpmag = 0x8fca0101UL;
+
+static int dumpsize = 0; /* also for savecore */
+
+static int dodump = 1;
+SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, "");
+
+/* ARGSUSED */
+static void dump_conf __P((void *dummy));
+static void
+dump_conf(dummy)
+ void *dummy;
+{
+ cpu_dumpconf();
+}
+SYSINIT(dump_conf, SI_SUB_DUMP_CONF, SI_ORDER_FIRST, dump_conf, NULL)
+
+/*
+ * Doadump comes here after turning off memory management and
+ * getting on the dump stack, either when called above, or by
+ * the auto-restart code.
+ */
+static void
+dumpsys(void)
+{
+
+ if (!dodump)
+ return;
+ if (dumpdev == NODEV)
+ return;
+ if (!(bdevsw[major(dumpdev)]))
+ return;
+ if (!(bdevsw[major(dumpdev)]->d_dump))
+ return;
+ dumpsize = Maxmem;
+ printf("\ndumping to dev %lx, offset %ld\n", (u_long)dumpdev, dumplo);
+ printf("dump ");
+ switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) {
+
+ case ENXIO:
+ printf("device bad\n");
+ break;
+
+ case EFAULT:
+ printf("device not ready\n");
+ break;
+
+ case EINVAL:
+ printf("area improper\n");
+ break;
+
+ case EIO:
+ printf("i/o error\n");
+ break;
+
+ case EINTR:
+ printf("aborted from console\n");
+ break;
+
+ default:
+ printf("succeeded\n");
+ break;
+ }
+}
+
+/*
+ * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
+ * and then reboots. If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ */
+void
+panic(const char *fmt, ...)
+{
+ int bootopt;
+ va_list ap;
+ static char buf[256];
+
+ bootopt = RB_AUTOBOOT | RB_DUMP;
+ if (panicstr)
+ bootopt |= RB_NOSYNC;
+ else
+ panicstr = fmt;
+
+ va_start(ap, fmt);
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ if (panicstr == fmt)
+ panicstr = buf;
+ va_end(ap);
+ printf("panic: %s\n", buf);
+#ifdef SMP
+	/* three separate prints in case of an unmapped page and trap */
+ printf("mp_lock = %08x; ", mp_lock);
+ printf("cpuid = %d; ", cpuid);
+ printf("lapic.id = %08x\n", lapic.id);
+#endif
+
+#if defined(DDB)
+ if (debugger_on_panic)
+ Debugger ("panic");
+#endif
+ boot(bootopt);
+}
+
+/*
+ * Three routines to handle adding/deleting items on the
+ * shutdown callout lists
+ *
+ * at_shutdown():
+ * Take the arguments given and put them onto the shutdown callout list.
+ * However first make sure that it's not already there.
+ * returns 0 on success.
+ */
+int
+at_shutdown(bootlist_fn function, void *arg, int queue)
+{
+ return(at_shutdown_pri(function, arg, queue, SHUTDOWN_PRI_DEFAULT));
+}
+
+/*
+ * at_shutdown_pri():
+ * Take the arguments given and put them onto the shutdown callout list
+ * with the given execution priority.
+ * returns 0 on success.
+ */
+int
+at_shutdown_pri(bootlist_fn function, void *arg, int queue, int pri)
+{
+ sle_p ep, ip;
+
+ if (queue < SHUTDOWN_PRE_SYNC
+ || queue > SHUTDOWN_FINAL) {
+ printf("at_shutdown: bad exit callout queue %d specified\n",
+ queue);
+ return (EINVAL);
+ }
+ if (rm_at_shutdown(function, arg))
+ printf("at_shutdown: exit callout entry was already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->function = function;
+ ep->arg = arg;
+ ep->priority = pri;
+
+	/* Sort into the list of items on this queue, by priority */
+	ip = LIST_FIRST(&shutdown_lists[queue]);
+	if (ip == NULL) {
+		LIST_INSERT_HEAD(&shutdown_lists[queue], ep, links);
+	} else {
+		for (;;) {
+			if (ep->priority < ip->priority) {
+				LIST_INSERT_BEFORE(ip, ep, links);
+				ep = NULL;
+				break;
+			}
+			/* Compare the last element too before appending */
+			if (LIST_NEXT(ip, links) == NULL)
+				break;
+			ip = LIST_NEXT(ip, links);
+		}
+		if (ep != NULL)
+			LIST_INSERT_AFTER(ip, ep, links);
+	}
+ return (0);
+}
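+
+#if 0	/* usage sketch (editor's note, hypothetical handler): */
+	extern void example_shutdown __P((int howto, void *arg));
+
+	/* Run after filesystems are synced, at the default priority. */
+	(void) at_shutdown((bootlist_fn)example_shutdown, NULL,
+	    SHUTDOWN_POST_SYNC);
+#endif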
+
+/*
+ * Scan the exit callout lists for the given items and remove them.
+ * Returns the number of items removed.
+ */
+int
+rm_at_shutdown(bootlist_fn function, void *arg)
+{
+	sle_p ep, next;
+ int count;
+ int queue;
+
+ count = 0;
+	for (queue = SHUTDOWN_PRE_SYNC; queue <= SHUTDOWN_FINAL; queue++) {
+		ep = LIST_FIRST(&shutdown_lists[queue]);
+		while (ep != NULL) {
+			/* Save the successor before possibly freeing ep */
+			next = LIST_NEXT(ep, links);
+			if ((ep->function == function) && (ep->arg == arg)) {
+				LIST_REMOVE(ep, links);
+				free(ep, M_TEMP);
+				count++;
+			}
+			ep = next;
+		}
+	}
+ return (count);
+}
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
new file mode 100644
index 0000000..bf89d8a
--- /dev/null
+++ b/sys/kern/kern_sig.c
@@ -0,0 +1,1455 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
+ * $Id: kern_sig.c,v 1.52 1999/01/08 17:31:10 eivind Exp $
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#define SIGPROP /* include signal properties table */
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/fcntl.h>
+#include <sys/wait.h>
+#include <sys/ktrace.h>
+#include <sys/syslog.h>
+#include <sys/stat.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+
+#include <machine/cpu.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+
+static int killpg1 __P((struct proc *cp, int signum, int pgid, int all));
+static void setsigvec __P((struct proc *p, int signum, struct sigaction *sa));
+static void stop __P((struct proc *));
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "");
+
+/*
+ * Can process p, with pcred pc, send the signal signum to process q?
+ */
+#define CANSIGNAL(p, pc, q, signum) \
+ ((pc)->pc_ucred->cr_uid == 0 || \
+ (pc)->p_ruid == (q)->p_cred->p_ruid || \
+ (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \
+ (pc)->p_ruid == (q)->p_ucred->cr_uid || \
+ (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \
+ ((signum) == SIGCONT && (q)->p_session == (p)->p_session))
+
+/*
+ * Policy -- Can real uid ruid with ucred uc send a signal to process q?
+ */
+#define CANSIGIO(ruid, uc, q) \
+ ((uc)->cr_uid == 0 || \
+ (ruid) == (q)->p_cred->p_ruid || \
+ (uc)->cr_uid == (q)->p_cred->p_ruid || \
+ (ruid) == (q)->p_ucred->cr_uid || \
+ (uc)->cr_uid == (q)->p_ucred->cr_uid)
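+
+/*
+ * Worked illustration (editor's note, hypothetical ids): a process
+ * with ruid == euid == 100 may signal any process whose real or
+ * effective uid is 100; the SIGCONT clause additionally lets it
+ * continue any member of its own session, e.g. a setuid child whose
+ * ids no longer match.
+ */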
+
+int sugid_coredump;
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "");
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaction_args {
+ int signum;
+ struct sigaction *nsa;
+ struct sigaction *osa;
+};
+#endif
+/* ARGSUSED */
+int
+sigaction(p, uap)
+ struct proc *p;
+ register struct sigaction_args *uap;
+{
+ struct sigaction vec;
+ register struct sigaction *sa;
+ register struct sigacts *ps = p->p_sigacts;
+ register int signum;
+ int bit, error;
+
+ signum = uap->signum;
+ if (signum <= 0 || signum >= NSIG)
+ return (EINVAL);
+ sa = &vec;
+ if (uap->osa) {
+ sa->sa_handler = ps->ps_sigact[signum];
+ sa->sa_mask = ps->ps_catchmask[signum];
+ bit = sigmask(signum);
+ sa->sa_flags = 0;
+ if ((ps->ps_sigonstack & bit) != 0)
+ sa->sa_flags |= SA_ONSTACK;
+ if ((ps->ps_sigintr & bit) == 0)
+ sa->sa_flags |= SA_RESTART;
+ if ((ps->ps_sigreset & bit) != 0)
+ sa->sa_flags |= SA_RESETHAND;
+ if ((ps->ps_signodefer & bit) != 0)
+ sa->sa_flags |= SA_NODEFER;
+#ifndef COMPAT_LINUX_THREADS
+ if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP)
+#else
+ if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP)
+#endif /* COMPAT_LINUX_THREADS */
+ sa->sa_flags |= SA_NOCLDSTOP;
+#ifndef COMPAT_LINUX_THREADS
+ if (signum == SIGCHLD && p->p_flag & P_NOCLDWAIT)
+#else
+ if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDWAIT)
+#endif /* COMPAT_LINUX_THREADS */
+ sa->sa_flags |= SA_NOCLDWAIT;
+ if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa,
+ sizeof (vec))))
+ return (error);
+ }
+ if (uap->nsa) {
+ if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa,
+ sizeof (vec))))
+ return (error);
+ if ((signum == SIGKILL || signum == SIGSTOP) &&
+ sa->sa_handler != SIG_DFL)
+ return (EINVAL);
+ setsigvec(p, signum, sa);
+ }
+ return (0);
+}
+
+static void
+setsigvec(p, signum, sa)
+ register struct proc *p;
+ int signum;
+ register struct sigaction *sa;
+{
+ register struct sigacts *ps = p->p_sigacts;
+ register int bit;
+
+ bit = sigmask(signum);
+ /*
+ * Change setting atomically.
+ */
+ (void) splhigh();
+ ps->ps_sigact[signum] = sa->sa_handler;
+ ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask;
+ if ((sa->sa_flags & SA_RESTART) == 0)
+ ps->ps_sigintr |= bit;
+ else
+ ps->ps_sigintr &= ~bit;
+ if (sa->sa_flags & SA_ONSTACK)
+ ps->ps_sigonstack |= bit;
+ else
+ ps->ps_sigonstack &= ~bit;
+ if (sa->sa_flags & SA_RESETHAND)
+ ps->ps_sigreset |= bit;
+ else
+ ps->ps_sigreset &= ~bit;
+ if (sa->sa_flags & SA_NODEFER)
+ ps->ps_signodefer |= bit;
+ else
+ ps->ps_signodefer &= ~bit;
+#ifdef COMPAT_SUNOS
+ if (sa->sa_flags & SA_USERTRAMP)
+ ps->ps_usertramp |= bit;
+ else
+ ps->ps_usertramp &= ~bit;
+#endif
+ if (signum == SIGCHLD) {
+ if (sa->sa_flags & SA_NOCLDSTOP)
+#ifndef COMPAT_LINUX_THREADS
+ p->p_flag |= P_NOCLDSTOP;
+ else
+ p->p_flag &= ~P_NOCLDSTOP;
+#else
+ p->p_procsig->ps_flag |= P_NOCLDSTOP;
+ else
+ p->p_procsig->ps_flag &= ~P_NOCLDSTOP;
+#endif /* COMPAT_LINUX_THREADS */
+ if (sa->sa_flags & SA_NOCLDWAIT) {
+ /*
+ * Paranoia: since SA_NOCLDWAIT is implemented by
+ * reparenting the dying child to PID 1 (and
+			 * trusting it to reap the zombie), PID 1 itself is
+ * forbidden to set SA_NOCLDWAIT.
+ */
+ if (p->p_pid == 1)
+#ifndef COMPAT_LINUX_THREADS
+ p->p_flag &= ~P_NOCLDWAIT;
+ else
+ p->p_flag |= P_NOCLDWAIT;
+#else
+ p->p_procsig->ps_flag &= ~P_NOCLDWAIT;
+ else
+ p->p_procsig->ps_flag |= P_NOCLDWAIT;
+#endif /* COMPAT_LINUX_THREADS */
+ } else
+#ifndef COMPAT_LINUX_THREADS
+ p->p_flag &= ~P_NOCLDWAIT;
+#else
+ p->p_procsig->ps_flag &= ~P_NOCLDWAIT;
+#endif /* COMPAT_LINUX_THREADS */
+ }
+ /*
+ * Set bit in p_sigignore for signals that are set to SIG_IGN,
+ * and for signals set to SIG_DFL where the default is to ignore.
+ * However, don't put SIGCONT in p_sigignore,
+ * as we have to restart the process.
+ */
+ if (sa->sa_handler == SIG_IGN ||
+ (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) {
+ p->p_siglist &= ~bit; /* never to be seen again */
+ if (signum != SIGCONT)
+ p->p_sigignore |= bit; /* easier in psignal */
+ p->p_sigcatch &= ~bit;
+ } else {
+ p->p_sigignore &= ~bit;
+ if (sa->sa_handler == SIG_DFL)
+ p->p_sigcatch &= ~bit;
+ else
+ p->p_sigcatch |= bit;
+ }
+ (void) spl0();
+}
+
+/*
+ * Initialize signal state for process 0;
+ * set to ignore signals that are ignored by default.
+ */
+void
+siginit(p)
+ struct proc *p;
+{
+ register int i;
+
+ for (i = 0; i < NSIG; i++)
+ if (sigprop[i] & SA_IGNORE && i != SIGCONT)
+ p->p_sigignore |= sigmask(i);
+}
+
+/*
+ * Reset signals for an exec of the specified process.
+ */
+void
+execsigs(p)
+ register struct proc *p;
+{
+ register struct sigacts *ps = p->p_sigacts;
+ register int nc, mask;
+
+ /*
+ * Reset caught signals. Held signals remain held
+ * through p_sigmask (unless they were caught,
+ * and are now ignored by default).
+ */
+ while (p->p_sigcatch) {
+ nc = ffs((long)p->p_sigcatch);
+ mask = sigmask(nc);
+ p->p_sigcatch &= ~mask;
+ if (sigprop[nc] & SA_IGNORE) {
+ if (nc != SIGCONT)
+ p->p_sigignore |= mask;
+ p->p_siglist &= ~mask;
+ }
+ ps->ps_sigact[nc] = SIG_DFL;
+ }
+ /*
+ * Reset stack state to the user stack.
+ * Clear set of signals caught on the signal stack.
+ */
+ ps->ps_sigstk.ss_flags = SS_DISABLE;
+ ps->ps_sigstk.ss_size = 0;
+ ps->ps_sigstk.ss_sp = 0;
+ ps->ps_flags = 0;
+}
+
+/*
+ * Manipulate signal mask.
+ * Note that we receive new mask, not pointer,
+ * and return old mask as return value;
+ * the library stub does the rest.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sigprocmask_args {
+ int how;
+ sigset_t mask;
+};
+#endif
+int
+sigprocmask(p, uap)
+ register struct proc *p;
+ struct sigprocmask_args *uap;
+{
+ int error = 0;
+
+ p->p_retval[0] = p->p_sigmask;
+ (void) splhigh();
+
+ switch (uap->how) {
+ case SIG_BLOCK:
+ p->p_sigmask |= uap->mask &~ sigcantmask;
+ break;
+
+ case SIG_UNBLOCK:
+ p->p_sigmask &= ~uap->mask;
+ break;
+
+ case SIG_SETMASK:
+ p->p_sigmask = uap->mask &~ sigcantmask;
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ (void) spl0();
+ return (error);
+}
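+
+/*
+ * Illustrative sketch (not part of the original source): the effect of
+ * each "how" value on a process whose mask currently blocks SIGUSR1,
+ * assuming uap->mask == sigmask(SIGUSR2):
+ *
+ *	SIG_BLOCK:   p_sigmask |= mask   -> SIGUSR1 and SIGUSR2 blocked
+ *	SIG_UNBLOCK: p_sigmask &= ~mask  -> only SIGUSR1 stays blocked
+ *	SIG_SETMASK: p_sigmask  = mask   -> only SIGUSR2 blocked
+ *
+ * sigcantmask is stripped from the new mask, so SIGKILL and SIGSTOP
+ * can never be blocked.
+ */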
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sigpending(p, uap)
+ struct proc *p;
+ struct sigpending_args *uap;
+{
+
+ p->p_retval[0] = p->p_siglist;
+ return (0);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Generalized interface signal handler, 4.3-compatible.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigvec_args {
+ int signum;
+ struct sigvec *nsv;
+ struct sigvec *osv;
+};
+#endif
+/* ARGSUSED */
+int
+osigvec(p, uap)
+ struct proc *p;
+ register struct osigvec_args *uap;
+{
+ struct sigvec vec;
+ register struct sigacts *ps = p->p_sigacts;
+ register struct sigvec *sv;
+ register int signum;
+ int bit, error;
+
+ signum = uap->signum;
+ if (signum <= 0 || signum >= NSIG)
+ return (EINVAL);
+ sv = &vec;
+ if (uap->osv) {
+ *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum];
+ sv->sv_mask = ps->ps_catchmask[signum];
+ bit = sigmask(signum);
+ sv->sv_flags = 0;
+ if ((ps->ps_sigonstack & bit) != 0)
+ sv->sv_flags |= SV_ONSTACK;
+ if ((ps->ps_sigintr & bit) != 0)
+ sv->sv_flags |= SV_INTERRUPT;
+ if ((ps->ps_sigreset & bit) != 0)
+ sv->sv_flags |= SV_RESETHAND;
+ if ((ps->ps_signodefer & bit) != 0)
+ sv->sv_flags |= SV_NODEFER;
+#ifndef COMPAT_SUNOS
+#ifndef COMPAT_LINUX_THREADS
+ if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP)
+#else
+ if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP)
+#endif /* COMPAT_LINUX_THREADS */
+ sv->sv_flags |= SV_NOCLDSTOP;
+#endif
+ if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv,
+ sizeof (vec))))
+ return (error);
+ }
+ if (uap->nsv) {
+ if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv,
+ sizeof (vec))))
+ return (error);
+ if ((signum == SIGKILL || signum == SIGSTOP) &&
+ sv->sv_handler != SIG_DFL)
+ return (EINVAL);
+#ifdef COMPAT_SUNOS
+ sv->sv_flags |= SA_USERTRAMP;
+#endif
+ sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
+ setsigvec(p, signum, (struct sigaction *)sv);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigblock_args {
+ int mask;
+};
+#endif
+int
+osigblock(p, uap)
+ register struct proc *p;
+ struct osigblock_args *uap;
+{
+
+ (void) splhigh();
+ p->p_retval[0] = p->p_sigmask;
+ p->p_sigmask |= uap->mask &~ sigcantmask;
+ (void) spl0();
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigsetmask_args {
+ int mask;
+};
+#endif
+int
+osigsetmask(p, uap)
+ struct proc *p;
+ struct osigsetmask_args *uap;
+{
+
+ (void) splhigh();
+ p->p_retval[0] = p->p_sigmask;
+ p->p_sigmask = uap->mask &~ sigcantmask;
+ (void) spl0();
+ return (0);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Suspend process until signal, providing mask to be set
+ * in the meantime. Note nonstandard calling convention:
+ * libc stub passes mask, not pointer, to save a copyin.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sigsuspend_args {
+ sigset_t mask;
+};
+#endif
+/* ARGSUSED */
+int
+sigsuspend(p, uap)
+ register struct proc *p;
+ struct sigsuspend_args *uap;
+{
+ register struct sigacts *ps = p->p_sigacts;
+
+ /*
+ * When returning from sigpause, we want
+ * the old mask to be restored after the
+ * signal handler has finished. Thus, we
+ * save it here and mark the sigacts structure
+ * to indicate this.
+ */
+#ifndef COMPAT_LINUX_THREADS
+ ps->ps_oldmask = p->p_sigmask;
+ ps->ps_flags |= SAS_OLDMASK;
+#else
+ p->p_oldsigmask = p->p_sigmask;
+#endif /* COMPAT_LINUX_THREADS */
+ p->p_sigmask = uap->mask &~ sigcantmask;
+ while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0)
+ /* void */;
+ /* always return EINTR rather than ERESTART... */
+ return (EINTR);
+}
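+
+/*
+ * Usage sketch (illustrative, not part of the original source), using
+ * the classic 4.3BSD idiom; "child_done" is a hypothetical flag set by
+ * a SIGCHLD handler:
+ *
+ *	int omask = sigblock(sigmask(SIGCHLD));
+ *	while (!child_done)
+ *		sigpause(omask);	(atomically unblock and wait)
+ *	sigsetmask(omask);
+ *
+ * The mask saved in ps_oldmask (flagged by SAS_OLDMASK) is what
+ * postsig() restores once the handler has run.
+ */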
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osigstack_args {
+ struct sigstack *nss;
+ struct sigstack *oss;
+};
+#endif
+/* ARGSUSED */
+int
+osigstack(p, uap)
+ struct proc *p;
+ register struct osigstack_args *uap;
+{
+ struct sigstack ss;
+ struct sigacts *psp;
+ int error = 0;
+
+ psp = p->p_sigacts;
+ ss.ss_sp = psp->ps_sigstk.ss_sp;
+ ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
+ if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss,
+ sizeof (struct sigstack))))
+ return (error);
+ if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss,
+ sizeof (ss))) == 0) {
+ psp->ps_sigstk.ss_sp = ss.ss_sp;
+ psp->ps_sigstk.ss_size = 0;
+ psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK;
+ psp->ps_flags |= SAS_ALTSTACK;
+ }
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaltstack_args {
+ struct sigaltstack *nss;
+ struct sigaltstack *oss;
+};
+#endif
+/* ARGSUSED */
+int
+sigaltstack(p, uap)
+ struct proc *p;
+ register struct sigaltstack_args *uap;
+{
+ struct sigacts *psp;
+ struct sigaltstack ss;
+ int error;
+
+ psp = p->p_sigacts;
+ if ((psp->ps_flags & SAS_ALTSTACK) == 0)
+ psp->ps_sigstk.ss_flags |= SS_DISABLE;
+ if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk,
+ (caddr_t)uap->oss, sizeof (struct sigaltstack))))
+ return (error);
+ if (uap->nss == 0)
+ return (0);
+ if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss))))
+ return (error);
+ if (ss.ss_flags & SS_DISABLE) {
+ if (psp->ps_sigstk.ss_flags & SS_ONSTACK)
+ return (EINVAL);
+ psp->ps_flags &= ~SAS_ALTSTACK;
+ psp->ps_sigstk.ss_flags = ss.ss_flags;
+ return (0);
+ }
+ if (ss.ss_size < MINSIGSTKSZ)
+ return (ENOMEM);
+ psp->ps_flags |= SAS_ALTSTACK;
+	psp->ps_sigstk = ss;
+ return (0);
+}
+
+/*
+ * Common code for kill process group/broadcast kill.
+ * cp is calling process.
+ */
+int
+killpg1(cp, signum, pgid, all)
+ register struct proc *cp;
+ int signum, pgid, all;
+{
+ register struct proc *p;
+ register struct pcred *pc = cp->p_cred;
+ struct pgrp *pgrp;
+ int nfound = 0;
+
+ if (all)
+ /*
+ * broadcast
+ */
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p == cp || !CANSIGNAL(cp, pc, p, signum))
+ continue;
+ nfound++;
+ if (signum)
+ psignal(p, signum);
+ }
+ else {
+ if (pgid == 0)
+ /*
+ * zero pgid means send to my process group.
+ */
+ pgrp = cp->p_pgrp;
+ else {
+ pgrp = pgfind(pgid);
+ if (pgrp == NULL)
+ return (ESRCH);
+ }
+ for (p = pgrp->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next) {
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p->p_stat == SZOMB ||
+ !CANSIGNAL(cp, pc, p, signum))
+ continue;
+ nfound++;
+ if (signum)
+ psignal(p, signum);
+ }
+ }
+ return (nfound ? 0 : ESRCH);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kill_args {
+ int pid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+kill(cp, uap)
+ register struct proc *cp;
+ register struct kill_args *uap;
+{
+ register struct proc *p;
+ register struct pcred *pc = cp->p_cred;
+
+ if ((u_int)uap->signum >= NSIG)
+ return (EINVAL);
+ if (uap->pid > 0) {
+ /* kill single process */
+ if ((p = pfind(uap->pid)) == NULL)
+ return (ESRCH);
+ if (!CANSIGNAL(cp, pc, p, uap->signum))
+ return (EPERM);
+ if (uap->signum)
+ psignal(p, uap->signum);
+ return (0);
+ }
+ switch (uap->pid) {
+ case -1: /* broadcast signal */
+ return (killpg1(cp, uap->signum, 0, 1));
+ case 0: /* signal own process group */
+ return (killpg1(cp, uap->signum, 0, 0));
+ default: /* negative explicit process group */
+ return (killpg1(cp, uap->signum, -uap->pid, 0));
+ }
+ /* NOTREACHED */
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct okillpg_args {
+ int pgid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+okillpg(p, uap)
+ struct proc *p;
+ register struct okillpg_args *uap;
+{
+
+ if ((u_int)uap->signum >= NSIG)
+ return (EINVAL);
+ return (killpg1(p, uap->signum, uap->pgid, 0));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Send a signal to a process group.
+ */
+void
+gsignal(pgid, signum)
+ int pgid, signum;
+{
+ struct pgrp *pgrp;
+
+ if (pgid && (pgrp = pgfind(pgid)))
+ pgsignal(pgrp, signum, 0);
+}
+
+/*
+ * Send a signal to a process group. If checkctty is 1,
+ * limit to members which have a controlling terminal.
+ */
+void
+pgsignal(pgrp, signum, checkctty)
+ struct pgrp *pgrp;
+ int signum, checkctty;
+{
+ register struct proc *p;
+
+ if (pgrp)
+ for (p = pgrp->pg_members.lh_first; p != 0;
+ p = p->p_pglist.le_next)
+ if (checkctty == 0 || p->p_flag & P_CONTROLT)
+ psignal(p, signum);
+}
+
+/*
+ * Send a signal caused by a trap to the current process.
+ * If it will be caught immediately, deliver it with correct code.
+ * Otherwise, post it normally.
+ */
+void
+trapsignal(p, signum, code)
+ struct proc *p;
+ register int signum;
+ u_long code;
+{
+ register struct sigacts *ps = p->p_sigacts;
+ int mask;
+
+ mask = sigmask(signum);
+ if ((p->p_flag & P_TRACED) == 0 && (p->p_sigcatch & mask) != 0 &&
+ (p->p_sigmask & mask) == 0) {
+ p->p_stats->p_ru.ru_nsignals++;
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_PSIG))
+ ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum],
+ p->p_sigmask, code);
+#endif
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum,
+ p->p_sigmask, code);
+ p->p_sigmask |= ps->ps_catchmask[signum] |
+ (mask & ~ps->ps_signodefer);
+ if ((ps->ps_sigreset & mask) != 0) {
+ /*
+ * See setsigvec() for origin of this code.
+ */
+ p->p_sigcatch &= ~mask;
+ if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
+ p->p_sigignore |= mask;
+ ps->ps_sigact[signum] = SIG_DFL;
+ }
+ } else {
+#ifndef COMPAT_LINUX_THREADS
+ ps->ps_code = code; /* XXX for core dump/debugger */
+ ps->ps_sig = signum; /* XXX to verify code */
+#else
+ p->p_code = code; /* XXX for core dump/debugger */
+ p->p_sig = signum; /* XXX to verify code */
+#endif /* COMPAT_LINUX_THREADS */
+ psignal(p, signum);
+ }
+}
+
+/*
+ * Send the signal to the process. If the signal has an action, the action
+ * is usually performed by the target process rather than the caller; we add
+ * the signal to the set of pending signals for the process.
+ *
+ * Exceptions:
+ * o When a stop signal is sent to a sleeping process that takes the
+ * default action, the process is stopped without awakening it.
+ * o SIGCONT restarts stopped processes (or puts them back to sleep)
+ * regardless of the signal action (eg, blocked or ignored).
+ *
+ * Other ignored signals are discarded immediately.
+ */
+void
+psignal(p, signum)
+ register struct proc *p;
+ register int signum;
+{
+ register int s, prop;
+ register sig_t action;
+ int mask;
+
+ if ((u_int)signum >= NSIG || signum == 0) {
+ printf("psignal: signum %d\n", signum);
+ panic("psignal signal number");
+ }
+ mask = sigmask(signum);
+ prop = sigprop[signum];
+
+ /*
+ * If proc is traced, always give parent a chance;
+ * if signal event is tracked by procfs, give *that*
+ * a chance, as well.
+ */
+ if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG))
+ action = SIG_DFL;
+ else {
+ /*
+ * If the signal is being ignored,
+ * then we forget about it immediately.
+ * (Note: we don't set SIGCONT in p_sigignore,
+ * and if it is set to SIG_IGN,
+ * action will be SIG_DFL here.)
+ */
+#ifndef COMPAT_LINUX_THREADS
+ if (p->p_sigignore & mask)
+#else
+ if ((p->p_sigignore & mask) || (p->p_flag & P_WEXIT))
+#endif /* COMPAT_LINUX_THREADS */
+ return;
+ if (p->p_sigmask & mask)
+ action = SIG_HOLD;
+ else if (p->p_sigcatch & mask)
+ action = SIG_CATCH;
+ else
+ action = SIG_DFL;
+ }
+
+ if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) &&
+ (p->p_flag & P_TRACED) == 0)
+ p->p_nice = NZERO;
+
+ if (prop & SA_CONT)
+ p->p_siglist &= ~stopsigmask;
+
+ if (prop & SA_STOP) {
+ /*
+ * If sending a tty stop signal to a member of an orphaned
+ * process group, discard the signal here if the action
+ * is default; don't stop the process below if sleeping,
+ * and don't clear any pending SIGCONT.
+ */
+ if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
+ action == SIG_DFL)
+ return;
+ p->p_siglist &= ~contsigmask;
+ }
+ p->p_siglist |= mask;
+
+ /*
+ * Defer further processing for signals which are held,
+ * except that stopped processes must be continued by SIGCONT.
+ */
+ if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP))
+ return;
+ s = splhigh();
+ switch (p->p_stat) {
+
+ case SSLEEP:
+ /*
+ * If process is sleeping uninterruptibly
+ * we can't interrupt the sleep... the signal will
+ * be noticed when the process returns through
+ * trap() or syscall().
+ */
+ if ((p->p_flag & P_SINTR) == 0)
+ goto out;
+ /*
+ * Process is sleeping and traced... make it runnable
+ * so it can discover the signal in issignal() and stop
+ * for the parent.
+ */
+ if (p->p_flag & P_TRACED)
+ goto run;
+ /*
+ * If SIGCONT is default (or ignored) and process is
+ * asleep, we are finished; the process should not
+ * be awakened.
+ */
+ if ((prop & SA_CONT) && action == SIG_DFL) {
+ p->p_siglist &= ~mask;
+ goto out;
+ }
+ /*
+ * When a sleeping process receives a stop
+ * signal, process immediately if possible.
+ * All other (caught or default) signals
+ * cause the process to run.
+ */
+ if (prop & SA_STOP) {
+ if (action != SIG_DFL)
+ goto runfast;
+ /*
+ * If a child holding parent blocked,
+ * stopping could cause deadlock.
+ */
+ if (p->p_flag & P_PPWAIT)
+ goto out;
+ p->p_siglist &= ~mask;
+ p->p_xstat = signum;
+#ifndef COMPAT_LINUX_THREADS
+ if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0)
+#else
+ if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0)
+#endif /* COMPAT_LINUX_THREADS */
+ psignal(p->p_pptr, SIGCHLD);
+ stop(p);
+ goto out;
+ } else
+ goto runfast;
+ /*NOTREACHED*/
+
+ case SSTOP:
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+
+ /*
+ * Kill signal always sets processes running.
+ */
+ if (signum == SIGKILL)
+ goto runfast;
+
+ if (prop & SA_CONT) {
+ /*
+ * If SIGCONT is default (or ignored), we continue the
+ * process but don't leave the signal in p_siglist, as
+ * it has no further action. If SIGCONT is held, we
+ * continue the process and leave the signal in
+ * p_siglist. If the process catches SIGCONT, let it
+ * handle the signal itself. If it isn't waiting on
+ * an event, then it goes back to run state.
+ * Otherwise, process goes back to sleep state.
+ */
+ if (action == SIG_DFL)
+ p->p_siglist &= ~mask;
+ if (action == SIG_CATCH)
+ goto runfast;
+ if (p->p_wchan == 0)
+ goto run;
+ p->p_stat = SSLEEP;
+ goto out;
+ }
+
+ if (prop & SA_STOP) {
+ /*
+ * Already stopped, don't need to stop again.
+ * (If we did the shell could get confused.)
+ */
+ p->p_siglist &= ~mask; /* take it away */
+ goto out;
+ }
+
+ /*
+ * If process is sleeping interruptibly, then simulate a
+ * wakeup so that when it is continued, it will be made
+ * runnable and can look at the signal. But don't make
+ * the process runnable, leave it stopped.
+ */
+ if (p->p_wchan && p->p_flag & P_SINTR)
+ unsleep(p);
+ goto out;
+
+ default:
+ /*
+ * SRUN, SIDL, SZOMB do nothing with the signal,
+ * other than kicking ourselves if we are running.
+ * It will either never be noticed, or noticed very soon.
+ */
+ if (p == curproc)
+ signotify(p);
+#ifdef SMP
+ else if (p->p_stat == SRUN)
+ forward_signal(p);
+#endif
+ goto out;
+ }
+ /*NOTREACHED*/
+
+runfast:
+ /*
+ * Raise priority to at least PUSER.
+ */
+ if (p->p_priority > PUSER)
+ p->p_priority = PUSER;
+run:
+ setrunnable(p);
+out:
+ splx(s);
+}
+
+/*
+ * If the current process has received a signal (should be caught or cause
+ * termination, should interrupt current syscall), return the signal number.
+ * Stop signals with default action are processed immediately, then cleared;
+ * they aren't returned. This is checked after each entry to the system for
+ * a syscall or trap (though this can usually be done without calling issignal
+ * by checking the pending signal masks in the CURSIG macro.) The normal call
+ * sequence is
+ *
+ * while (signum = CURSIG(curproc))
+ * postsig(signum);
+ */
+int
+issignal(p)
+ register struct proc *p;
+{
+ register int signum, mask, prop;
+
+ for (;;) {
+ int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
+
+ mask = p->p_siglist & ~p->p_sigmask;
+ if (p->p_flag & P_PPWAIT)
+ mask &= ~stopsigmask;
+ if (mask == 0) /* no signal to send */
+ return (0);
+ signum = ffs((long)mask);
+ mask = sigmask(signum);
+ prop = sigprop[signum];
+
+ STOPEVENT(p, S_SIG, signum);
+
+ /*
+ * We should see pending but ignored signals
+ * only if P_TRACED was on when they were posted.
+ */
+ if ((mask & p->p_sigignore) && (traced == 0)) {
+ p->p_siglist &= ~mask;
+ continue;
+ }
+ if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
+ /*
+ * If traced, always stop, and stay
+ * stopped until released by the parent.
+ */
+ p->p_xstat = signum;
+ psignal(p->p_pptr, SIGCHLD);
+ do {
+ stop(p);
+ mi_switch();
+ } while (!trace_req(p)
+ && p->p_flag & P_TRACED);
+
+ /*
+ * If the traced bit got turned off, go back up
+ * to the top to rescan signals. This ensures
+ * that p_sig* and ps_sigact are consistent.
+ */
+ if ((p->p_flag & P_TRACED) == 0)
+ continue;
+
+ /*
+ * If parent wants us to take the signal,
+ * then it will leave it in p->p_xstat;
+ * otherwise we just look for signals again.
+ */
+ p->p_siglist &= ~mask; /* clear the old signal */
+ signum = p->p_xstat;
+ if (signum == 0)
+ continue;
+
+ /*
+ * Put the new signal into p_siglist. If the
+ * signal is being masked, look for other signals.
+ */
+ mask = sigmask(signum);
+ p->p_siglist |= mask;
+ if (p->p_sigmask & mask)
+ continue;
+ }
+
+ /*
+ * Decide whether the signal should be returned.
+ * Return the signal's number, or fall through
+ * to clear it from the pending mask.
+ */
+ switch ((int)(intptr_t)p->p_sigacts->ps_sigact[signum]) {
+
+ case (int)SIG_DFL:
+ /*
+ * Don't take default actions on system processes.
+ */
+ if (p->p_pid <= 1) {
+#ifdef DIAGNOSTIC
+ /*
+ * Are you sure you want to ignore SIGSEGV
+ * in init? XXX
+ */
+ printf("Process (pid %lu) got signal %d\n",
+ (u_long)p->p_pid, signum);
+#endif
+ break; /* == ignore */
+ }
+ /*
+ * If there is a pending stop signal to process
+ * with default action, stop here,
+ * then clear the signal. However,
+ * if process is member of an orphaned
+ * process group, ignore tty stop signals.
+ */
+ if (prop & SA_STOP) {
+ if (p->p_flag & P_TRACED ||
+ (p->p_pgrp->pg_jobc == 0 &&
+ prop & SA_TTYSTOP))
+ break; /* == ignore */
+ p->p_xstat = signum;
+ stop(p);
+#ifndef COMPAT_LINUX_THREADS
+ if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0)
+#else
+ if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0)
+#endif /* COMPAT_LINUX_THREADS */
+ psignal(p->p_pptr, SIGCHLD);
+ mi_switch();
+ break;
+ } else if (prop & SA_IGNORE) {
+ /*
+ * Except for SIGCONT, shouldn't get here.
+ * Default action is to ignore; drop it.
+ */
+ break; /* == ignore */
+ } else
+ return (signum);
+ /*NOTREACHED*/
+
+ case (int)SIG_IGN:
+ /*
+ * Masking above should prevent us ever trying
+ * to take action on an ignored signal other
+ * than SIGCONT, unless process is traced.
+ */
+ if ((prop & SA_CONT) == 0 &&
+ (p->p_flag & P_TRACED) == 0)
+ printf("issignal\n");
+ break; /* == ignore */
+
+ default:
+ /*
+ * This signal has an action, let
+ * postsig() process it.
+ */
+ return (signum);
+ }
+ p->p_siglist &= ~mask; /* take the signal! */
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Put the argument process into the stopped state and notify the parent
+ * via wakeup. Signals are handled elsewhere. The process must not be
+ * on the run queue.
+ */
+void
+stop(p)
+ register struct proc *p;
+{
+
+ p->p_stat = SSTOP;
+ p->p_flag &= ~P_WAITED;
+ wakeup((caddr_t)p->p_pptr);
+}
+
+/*
+ * Take the action for the specified signal
+ * from the current set of pending signals.
+ */
+void
+postsig(signum)
+ register int signum;
+{
+ register struct proc *p = curproc;
+ register struct sigacts *ps = p->p_sigacts;
+ register sig_t action;
+ int code, mask, returnmask;
+
+ KASSERT(signum != 0, ("postsig"));
+
+ mask = sigmask(signum);
+ p->p_siglist &= ~mask;
+ action = ps->ps_sigact[signum];
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_PSIG))
+ ktrpsig(p->p_tracep,
+#ifndef COMPAT_LINUX_THREADS
+ signum, action, ps->ps_flags & SAS_OLDMASK ?
+ ps->ps_oldmask : p->p_sigmask, 0);
+#else
+ signum, action, p->p_oldsigmask ?
+ p->p_oldsigmask : p->p_sigmask, 0);
+#endif /* COMPAT_LINUX_THREADS */
+#endif
+ STOPEVENT(p, S_SIG, signum);
+
+ if (action == SIG_DFL) {
+ /*
+ * Default action, where the default is to kill
+ * the process. (Other cases were ignored above.)
+ */
+ sigexit(p, signum);
+ /* NOTREACHED */
+ } else {
+ /*
+ * If we get here, the signal must be caught.
+ */
+ KASSERT(action != SIG_IGN && (p->p_sigmask & mask) == 0,
+ ("postsig action"));
+ /*
+ * Set the new mask value and also defer further
+		 * occurrences of this signal.
+ *
+ * Special case: user has done a sigpause. Here the
+ * current mask is not of interest, but rather the
+ * mask from before the sigpause is what we want
+ * restored after the signal processing is completed.
+ */
+ (void) splhigh();
+#ifndef COMPAT_LINUX_THREADS
+ if (ps->ps_flags & SAS_OLDMASK) {
+ returnmask = ps->ps_oldmask;
+ ps->ps_flags &= ~SAS_OLDMASK;
+#else
+ if (p->p_oldsigmask) {
+ returnmask = p->p_oldsigmask;
+ p->p_oldsigmask = 0;
+#endif /* COMPAT_LINUX_THREADS */
+ } else
+ returnmask = p->p_sigmask;
+ p->p_sigmask |= ps->ps_catchmask[signum] |
+ (mask & ~ps->ps_signodefer);
+ if ((ps->ps_sigreset & mask) != 0) {
+ /*
+ * See setsigvec() for origin of this code.
+ */
+ p->p_sigcatch &= ~mask;
+ if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
+ p->p_sigignore |= mask;
+ ps->ps_sigact[signum] = SIG_DFL;
+ }
+ (void) spl0();
+ p->p_stats->p_ru.ru_nsignals++;
+#ifndef COMPAT_LINUX_THREADS
+ if (ps->ps_sig != signum) {
+#else
+ if (p->p_sig != signum) {
+#endif /* COMPAT_LINUX_THREADS */
+ code = 0;
+ } else {
+#ifndef COMPAT_LINUX_THREADS
+ code = ps->ps_code;
+ ps->ps_code = 0;
+ ps->ps_sig = 0;
+#else
+ code = p->p_code;
+ p->p_code = 0;
+ p->p_sig = 0;
+#endif /* COMPAT_LINUX_THREADS */
+ }
+ (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code);
+ }
+}
+
+/*
+ * Kill the current process for stated reason.
+ */
+void
+killproc(p, why)
+ struct proc *p;
+ char *why;
+{
+ log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
+ p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why);
+ psignal(p, SIGKILL);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(p, signum)
+ register struct proc *p;
+ int signum;
+{
+
+ p->p_acflag |= AXSIG;
+ if (sigprop[signum] & SA_CORE) {
+#ifndef COMPAT_LINUX_THREADS
+ p->p_sigacts->ps_sig = signum;
+#else
+ p->p_sig = signum;
+#endif /* COMPAT_LINUX_THREADS */
+ /*
+ * Log signals which would cause core dumps
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+		 * XXX: TODO: log the ruid as well as the euid.
+ */
+ if (p->p_sysent->sv_coredump != NULL &&
+ (*p->p_sysent->sv_coredump)(p) == 0)
+ signum |= WCOREFLAG;
+ if (kern_logsigexit)
+ log(LOG_INFO,
+ "pid %d (%s), uid %d: exited on signal %d%s\n",
+ p->p_pid, p->p_comm,
+ p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1,
+ signum &~ WCOREFLAG,
+ signum & WCOREFLAG ? " (core dumped)" : "");
+ }
+ exit1(p, W_EXITCODE(0, signum));
+ /* NOTREACHED */
+}
+
+static char corefilename[MAXPATHLEN+1] = {"%N.core"};
+SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
+ sizeof(corefilename), "process corefile name format string");
+
+/*
+ * expand_name(name, uid, pid)
+ * Expand the name described in corefilename, using name, uid, and pid.
+ * corefilename is a printf-like string, with three format specifiers:
+ * %N name of process ("name")
+ * %P process id (pid)
+ * %U user id (uid)
+ * For example, "%N.core" is the default; they can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
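+/*
+ * Illustrative expansion (not part of the original source): with
+ * kern.corefile set to "/cores/%U/%N-%P", a process named "vi" with
+ * pid 120 running as uid 1001 dumps core to "/cores/1001/vi-120";
+ * with the default "%N.core" it dumps to "vi.core".
+ */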
+
+char *
+expand_name(name, uid, pid)
+const char *name; int uid; int pid; {
+ char *temp;
+	char buf[11];		/* pid/uid in decimal: 10 digits (32-bit max) + NUL */
+ int i, n;
+ char *format = corefilename;
+
+ temp = malloc(MAXPATHLEN + 3, M_TEMP, M_NOWAIT);
+ if (temp == NULL)
+ return NULL;
+ bzero(temp, MAXPATHLEN+3);
+ for (i = 0, n = 0; i < MAXPATHLEN && format[i]; i++) {
+ int l;
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ temp[n++] = '%';
+ break;
+ case 'N': /* process name */
+ l = strlen(name);
+ if ((n + l) > MAXPATHLEN) {
+ log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n",
+ pid, name, uid, temp, name);
+ free(temp, M_TEMP);
+ return NULL;
+ }
+ memcpy(temp+n, name, l);
+ n += l;
+ break;
+ case 'P': /* process id */
+ sprintf(buf, "%u", pid);
+ l = strlen(buf);
+ if ((n + l) > MAXPATHLEN) {
+ log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n",
+ pid, name, uid, temp, name);
+ free(temp, M_TEMP);
+ return NULL;
+ }
+ memcpy(temp+n, buf, l);
+ n += l;
+ break;
+ case 'U': /* user id */
+ sprintf(buf, "%u", uid);
+ l = strlen(buf);
+ if ((n + l) > MAXPATHLEN) {
+ log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n",
+ pid, name, uid, temp, name);
+ free(temp, M_TEMP);
+ return NULL;
+ }
+ memcpy(temp+n, buf, l);
+ n += l;
+ break;
+ default:
+ log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format);
+ }
+ break;
+ default:
+ temp[n++] = format[i];
+ }
+ }
+ return temp;
+}
+
+/*
+ * Nonexistent system call-- signal process (may want to handle it).
+ * Flag error in case process won't see signal immediately (blocked or ignored).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nosys_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+nosys(p, args)
+ struct proc *p;
+ struct nosys_args *args;
+{
+
+ psignal(p, SIGSYS);
+ return (EINVAL);
+}
+
+/*
+ * Send a SIGIO or SIGURG signal to a process or process group using
+ * stored credentials rather than those of the current process.
+ */
+void
+pgsigio(sigio, signum, checkctty)
+ struct sigio *sigio;
+ int signum, checkctty;
+{
+ if (sigio == NULL)
+ return;
+
+ if (sigio->sio_pgid > 0) {
+ if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred,
+ sigio->sio_proc))
+ psignal(sigio->sio_proc, signum);
+ } else if (sigio->sio_pgid < 0) {
+ struct proc *p;
+
+ for (p = sigio->sio_pgrp->pg_members.lh_first; p != NULL;
+ p = p->p_pglist.le_next)
+ if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, p) &&
+ (checkctty == 0 || (p->p_flag & P_CONTROLT)))
+ psignal(p, signum);
+ }
+}
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
new file mode 100644
index 0000000..a96d554
--- /dev/null
+++ b/sys/kern/kern_subr.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ * $Id: kern_subr.c,v 1.23 1999/01/08 17:31:10 eivind Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+
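+/*
+ * uiomove: move n bytes between the kernel buffer cp and the address
+ * space described by uio.  UIO_READ copies out of the kernel, UIO_WRITE
+ * copies into it; uio_segflg selects user vs. system space.  Exhausted
+ * iovecs are consumed, and uio_offset/uio_resid are advanced.
+ */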
+int
+uiomove(cp, n, uio)
+ register caddr_t cp;
+ register int n;
+ register struct uio *uio;
+{
+ register struct iovec *iov;
+ u_int cnt;
+ int error;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomove: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
+ ("uiomove proc"));
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ case UIO_USERISPACE:
+ if (uio->uio_rw == UIO_READ)
+ error = copyout(cp, iov->iov_base, cnt);
+ else
+ error = copyin(iov->iov_base, cp, cnt);
+ if (error)
+ return (error);
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy((caddr_t)cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, (caddr_t)cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp += cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
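+/*
+ * uiomoveco: like uiomove(), but for suitably page-aligned reads it can
+ * use vm_uiomove() to map pages of the backing object obj into the user
+ * address space instead of copying (the vfs_ioopt optimization).
+ */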
+int
+uiomoveco(cp, n, uio, obj)
+ caddr_t cp;
+ int n;
+ struct uio *uio;
+ struct vm_object *obj;
+{
+ struct iovec *iov;
+ u_int cnt;
+ int error;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomoveco: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
+ ("uiomoveco proc"));
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ case UIO_USERISPACE:
+ if (uio->uio_rw == UIO_READ) {
+ if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
+ ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
+ ((uio->uio_offset & PAGE_MASK) == 0) &&
+ ((((intptr_t) cp) & PAGE_MASK) == 0)) {
+ error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
+ uio->uio_offset, cnt,
+ (vm_offset_t) iov->iov_base, NULL);
+ } else {
+ error = copyout(cp, iov->iov_base, cnt);
+ }
+ } else {
+ error = copyin(iov->iov_base, cp, cnt);
+ }
+ if (error)
+ return (error);
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy((caddr_t)cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, (caddr_t)cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp += cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
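+/*
+ * uioread: page-flipping read path, used only when vfs_ioopt >= 2.
+ * Moves whole pages from obj into page-aligned user iovecs via
+ * vm_uiomove() and reports the number of bytes moved in *nread.
+ */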
+int
+uioread(n, uio, obj, nread)
+ int n;
+ struct uio *uio;
+ struct vm_object *obj;
+ int *nread;
+{
+ int npagesmoved;
+ struct iovec *iov;
+ u_int cnt, tcnt;
+ int error;
+
+ *nread = 0;
+ if (vfs_ioopt < 2)
+ return 0;
+
+ error = 0;
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ if ((uio->uio_segflg == UIO_USERSPACE) &&
+ ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
+ ((uio->uio_offset & PAGE_MASK) == 0) ) {
+
+ if (cnt < PAGE_SIZE)
+ break;
+
+ cnt &= ~PAGE_MASK;
+
+ error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
+ uio->uio_offset, cnt,
+ (vm_offset_t) iov->iov_base, &npagesmoved);
+
+ if (npagesmoved == 0)
+ break;
+
+ tcnt = npagesmoved * PAGE_SIZE;
+ cnt = tcnt;
+
+ if (error)
+ break;
+
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ *nread += cnt;
+ n -= cnt;
+ } else {
+ break;
+ }
+ }
+ return error;
+}
+
+/*
+ * Give next character to user as result of read.
+ */
+int
+ureadc(c, uio)
+ register int c;
+ register struct uio *uio;
+{
+ register struct iovec *iov;
+
+again:
+ if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
+ panic("ureadc");
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iovcnt--;
+ uio->uio_iov++;
+ goto again;
+ }
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (subyte(iov->iov_base, c) < 0)
+ return (EFAULT);
+ break;
+
+ case UIO_SYSSPACE:
+ *iov->iov_base = c;
+ break;
+
+ case UIO_USERISPACE:
+ if (suibyte(iov->iov_base, c) < 0)
+ return (EFAULT);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base++;
+ iov->iov_len--;
+ uio->uio_resid--;
+ uio->uio_offset++;
+ return (0);
+}
+
+#ifdef vax /* unused except by ct.c, other oddities XXX */
+/*
+ * Get next character written in by user from uio.
+ */
+int
+uwritec(uio)
+ struct uio *uio;
+{
+ register struct iovec *iov;
+ register int c;
+
+ if (uio->uio_resid <= 0)
+ return (-1);
+again:
+ if (uio->uio_iovcnt <= 0)
+ panic("uwritec");
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iov++;
+ if (--uio->uio_iovcnt == 0)
+ return (-1);
+ goto again;
+ }
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ c = fubyte(iov->iov_base);
+ break;
+
+ case UIO_SYSSPACE:
+ c = *(u_char *) iov->iov_base;
+ break;
+
+ case UIO_USERISPACE:
+ c = fuibyte(iov->iov_base);
+ break;
+ }
+ if (c < 0)
+ return (-1);
+ iov->iov_base++;
+ iov->iov_len--;
+ uio->uio_resid--;
+ uio->uio_offset++;
+ return (c);
+}
+#endif /* vax */
+
+/*
+ * General routine to allocate a hash table.
+ */
+void *
+hashinit(elements, type, hashmask)
+ int elements;
+ struct malloc_type *type;
+ u_long *hashmask;
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ if (elements <= 0)
+ panic("hashinit: bad elements");
+ for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
+ continue;
+ hashsize >>= 1;
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *hashmask = hashsize - 1;
+ return (hashtbl);
+}
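+
+/*
+ * Usage sketch (illustrative, not part of the original source): the
+ * table size is the largest power of two <= elements, so the returned
+ * hashmask can be ANDed directly into a bucket index; "myhead" and
+ * "key" are hypothetical:
+ *
+ *	u_long mask;
+ *	struct myhead *tbl = hashinit(desired, M_TEMP, &mask);
+ *	bucket = &tbl[key & mask];
+ */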
+
+static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
+ 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
+ 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
+#define NPRIMES (sizeof(primes) / sizeof(primes[0]))
+
+/*
+ * General routine to allocate a prime number sized hash table.
+ */
+void *
+phashinit(elements, type, nentries)
+ int elements;
+ struct malloc_type *type;
+ u_long *nentries;
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ if (elements <= 0)
+ panic("phashinit: bad elements");
+ for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
+ i++;
+ if (i == NPRIMES)
+ break;
+ hashsize = primes[i];
+ }
+ hashsize = primes[i - 1];
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *nentries = hashsize;
+ return (hashtbl);
+}
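+
+/*
+ * Illustrative note (not part of the original source): unlike
+ * hashinit(), the prime size returned in *nentries is meant to be used
+ * with a modulus; e.g. for elements == 100 the size chosen is 61, and a
+ * bucket is &hashtbl[key % 61].
+ */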
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
new file mode 100644
index 0000000..f8baf85
--- /dev/null
+++ b/sys/kern/kern_synch.c
@@ -0,0 +1,923 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ * $Id: kern_synch.c,v 1.71 1999/01/08 17:31:10 eivind Exp $
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
+#include <sys/sysctl.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/cpu.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */
+
+static void rqinit __P((void *));
+SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
+
+u_char curpriority; /* usrpri of curproc */
+int lbolt; /* once a second sleep address */
+
+static void endtsleep __P((void *));
+static void roundrobin __P((void *arg));
+static void schedcpu __P((void *arg));
+static void updatepri __P((struct proc *p));
+
+#define MAXIMUM_SCHEDULE_QUANTUM (1000000) /* arbitrary limit */
+#ifndef DEFAULT_SCHEDULE_QUANTUM
+#define DEFAULT_SCHEDULE_QUANTUM 10
+#endif
+static int quantum = DEFAULT_SCHEDULE_QUANTUM; /* default value */
+
+static int
+sysctl_kern_quantum SYSCTL_HANDLER_ARGS
+{
+ int error;
+	int new_val = quantum;
+
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error == 0) {
+ if ((new_val > 0) && (new_val < MAXIMUM_SCHEDULE_QUANTUM)) {
+ quantum = new_val;
+ } else {
+ error = EINVAL;
+ }
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof quantum, sysctl_kern_quantum, "I", "");
+
+/*
+ * maybe_resched: decide whether a reschedule is needed, taking the
+ * priorities and schedulers into account.
+ */
+static void maybe_resched(struct proc *chk)
+{
+ struct proc *p = curproc; /* XXX */
+
+ /*
+ * Compare priorities if the new process is on the same scheduler,
+ * otherwise the one on the more realtimeish scheduler wins.
+ *
+	 * XXX idle scheduler still broken because a process stays on the
+	 * idle scheduler during waits (such as when getting FS locks).  If a
+	 * standard process becomes runaway cpu-bound, the system can lock up
+ * due to idle-scheduler processes in wakeup never getting any cpu.
+ */
+ if (p == 0 ||
+ (chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) ||
+ RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type)
+ ) {
+ need_resched();
+ }
+}
+
+#define ROUNDROBIN_INTERVAL (hz / quantum)
+int roundrobin_interval(void)
+{
+ return ROUNDROBIN_INTERVAL;
+}
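+
+/*
+ * Worked example (illustrative): with the traditional hz == 100 and the
+ * default quantum of 10, ROUNDROBIN_INTERVAL is 100 / 10 == 10 ticks,
+ * i.e. the 100ms mentioned below.
+ */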
+
+/*
+ * Force switch among equal priority processes every 100ms.
+ */
+/* ARGSUSED */
+static void
+roundrobin(arg)
+ void *arg;
+{
+#ifndef SMP
+ struct proc *p = curproc; /* XXX */
+#endif
+
+#ifdef SMP
+ need_resched();
+ forward_roundrobin();
+#else
+ if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
+ need_resched();
+#endif
+
+ timeout(roundrobin, NULL, ROUNDROBIN_INTERVAL);
+}
+
+/*
+ * Constants for digital decay and forget:
+ * 90% of (p_estcpu) usage in 5 * loadav time
+ * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
+ * Note that, as ps(1) mentions, this can let percentages
+ * total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that statclock() updates p_estcpu and p_cpticks asynchronously.
+ *
+ * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * for (i = 0; i < (5 * loadavg); i++)
+ * p_estcpu *= decay;
+ * will compute
+ * p_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * b = 2 * loadavg
+ * then
+ * decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ * For x close to zero, exp(x) =~ 1 + x, since
+ *	exp(x) = x**0/0! + x**1/1! + x**2/2! + ... .
+ * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ * For x close to zero, ln(1+x) =~ x, since
+ * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
+ * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ * ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ * Solve (factor)**(power) =~ .1 given power (5*loadav):
+ * solving for factor,
+ * ln(factor) =~ (-2.30/5*loadav), or
+ * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
+ *
+ * Proof of (2):
+ * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ * solving for power,
+ * power*ln(b/(b+1)) =~ -2.30, or
+ * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ * loadav: 1 2 3 4
+ * power: 5.68 10.32 14.94 19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define loadfactor(loadav) (2 * (loadav))
+#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
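+
+/*
+ * Worked example (illustrative): for loadavg == 1.0, loadfactor() gives
+ * loadfac == 2*FSCALE, so decay_cpu() scales cpu by
+ * (2*FSCALE) / (2*FSCALE + FSCALE) == 2/3, i.e. b/(b+1) with b == 2,
+ * matching the derivation above.
+ */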
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
+static int fscale __unused = FSCALE;
+SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT 11
+
+/*
+ * Recompute process priorities, every hz ticks.
+ */
+/* ARGSUSED */
+static void
+schedcpu(arg)
+ void *arg;
+{
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ register struct proc *p;
+ register int realstathz, s;
+ register unsigned int newcpu;
+
+ realstathz = stathz ? stathz : hz;
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ /*
+ * Increment time in/out of memory and sleep time
+ * (if sleeping). We ignore overflow; with 16-bit int's
+ * (remember them?) overflow takes 45 days.
+ */
+ p->p_swtime++;
+ if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
+ p->p_slptime++;
+ p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+ /*
+ * If the process has slept the entire second,
+ * stop recalculating its priority until it wakes up.
+ */
+ if (p->p_slptime > 1)
+ continue;
+ s = splhigh(); /* prevent state changes and protect run queue */
+ /*
+ * p_pctcpu is only for ps.
+ */
+#if (FSHIFT >= CCPU_SHIFT)
+ p->p_pctcpu += (realstathz == 100)?
+ ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+ 100 * (((fixpt_t) p->p_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ p->p_pctcpu += ((FSCALE - ccpu) *
+ (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
+#endif
+ p->p_cpticks = 0;
+ newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
+ p->p_estcpu = min(newcpu, UCHAR_MAX);
+ resetpriority(p);
+ if (p->p_priority >= PUSER) {
+#define PPQ (128 / NQS) /* priorities per queue */
+ if ((p != curproc) &&
+#ifdef SMP
+ (u_char)p->p_oncpu == 0xff && /* idle */
+#endif
+ p->p_stat == SRUN &&
+ (p->p_flag & P_INMEM) &&
+ (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
+ remrq(p);
+ p->p_priority = p->p_usrpri;
+ setrunqueue(p);
+ } else
+ p->p_priority = p->p_usrpri;
+ }
+ splx(s);
+ }
+ vmmeter();
+ wakeup((caddr_t)&lbolt);
+ timeout(schedcpu, (void *)0, hz);
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay p_estcpu to zero.
+ */
+static void
+updatepri(p)
+ register struct proc *p;
+{
+ register unsigned int newcpu = p->p_estcpu;
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+ if (p->p_slptime > 5 * loadfac)
+ p->p_estcpu = 0;
+ else {
+ p->p_slptime--; /* the first time was done in schedcpu */
+ while (newcpu && --p->p_slptime)
+ newcpu = (int) decay_cpu(loadfac, newcpu);
+ p->p_estcpu = min(newcpu, UCHAR_MAX);
+ }
+ resetpriority(p);
+}
+
+/*
+ * We're only looking at 7 bits of the address; everything is
+ * aligned to 4, lots of things are aligned to greater powers
+ * of 2. Shift right by 8, i.e. drop the bottom 256 worth.
+ */
+#define TABLESIZE 128
+static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
+#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
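+
+/*
+ * Worked example (illustrative): for a wait channel address of
+ * 0xc0a1b200, LOOKUP() yields (0xc0a1b200 >> 8) & 127 ==
+ * 0xc0a1b2 & 0x7f == 0x32, i.e. slot 50 of slpque[].
+ */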
+
+/*
+ * During autoconfiguration or after a panic, a sleep will simply
+ * lower the priority briefly to allow interrupts, then return.
+ * The priority to be used (safepri) is machine-dependent, thus this
+ * value is initialized and maintained in the machine-dependent layers.
+ * This priority will typically be 0, or the lowest priority
+ * that is safe for use on the interrupt stack; it can be made
+ * higher to block network software interrupts after panics.
+ */
+int safepri;
+
+void
+sleepinit()
+{
+ int i;
+
+ for (i = 0; i < TABLESIZE; i++)
+ TAILQ_INIT(&slpque[i]);
+}
+
+/*
+ * General sleep call. Suspends the current process until a wakeup is
+ * performed on the specified identifier. The process will then be made
+ * runnable with the specified priority. Sleeps at most timo/hz seconds
+ * (0 means no timeout). If pri includes PCATCH flag, signals are checked
+ * before and after sleeping, else signals are not checked. Returns 0 if
+ * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
+ * signal needs to be delivered, ERESTART is returned if the current system
+ * call should be restarted if possible, and EINTR is returned if the system
+ * call should be interrupted by the signal.
+ */
+int
+tsleep(ident, priority, wmesg, timo)
+ void *ident;
+ int priority, timo;
+ const char *wmesg;
+{
+ struct proc *p = curproc;
+ int s, sig, catch = priority & PCATCH;
+ struct callout_handle thandle;
+
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 1, 0);
+#endif
+ s = splhigh();
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration,
+ * just give interrupts a chance, then just return;
+ * don't run any other procs or panic below,
+ * in case this is the idle process and already asleep.
+ */
+ splx(safepri);
+ splx(s);
+ return (0);
+ }
+ KASSERT(p != NULL, ("tsleep1"));
+ KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
+ /*
+ * Process may be sitting on a slpque if asleep() was called, remove
+ * it before re-adding.
+ */
+ if (p->p_wchan != NULL)
+ unsleep(p);
+
+ p->p_wchan = ident;
+ p->p_wmesg = wmesg;
+ p->p_slptime = 0;
+ p->p_priority = priority & PRIMASK;
+ TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
+ if (timo)
+ thandle = timeout(endtsleep, (void *)p, timo);
+ /*
+ * We put ourselves on the sleep queue and start our timeout
+ * before calling CURSIG, as we could stop there, and a wakeup
+ * or a SIGCONT (or both) could occur while we were stopped.
+ * A SIGCONT would cause us to be marked as SSLEEP
+ * without resuming us, thus we must be ready for sleep
+ * when CURSIG is called. If the wakeup happens while we're
+ * stopped, p->p_wchan will be 0 upon return from CURSIG.
+ */
+ if (catch) {
+ p->p_flag |= P_SINTR;
+ if ((sig = CURSIG(p))) {
+ if (p->p_wchan)
+ unsleep(p);
+ p->p_stat = SRUN;
+ goto resume;
+ }
+ if (p->p_wchan == 0) {
+ catch = 0;
+ goto resume;
+ }
+ } else
+ sig = 0;
+ p->p_stat = SSLEEP;
+ p->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+resume:
+ curpriority = p->p_usrpri;
+ splx(s);
+ p->p_flag &= ~P_SINTR;
+ if (p->p_flag & P_TIMEOUT) {
+ p->p_flag &= ~P_TIMEOUT;
+ if (sig == 0) {
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ return (EWOULDBLOCK);
+ }
+ } else if (timo)
+ untimeout(endtsleep, (void *)p, thandle);
+ if (catch && (sig != 0 || (sig = CURSIG(p)))) {
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ if (p->p_sigacts->ps_sigintr & sigmask(sig))
+ return (EINTR);
+ return (ERESTART);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ return (0);
+}
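+
+/*
+ * Usage sketch (illustrative, not part of the original source): the
+ * typical pairing of tsleep() with wakeup(); "sc" and SC_READY are
+ * hypothetical driver names:
+ *
+ *	s = splhigh();
+ *	while ((sc->flags & SC_READY) == 0) {
+ *		error = tsleep(sc, PZERO | PCATCH, "scready", hz);
+ *		if (error)
+ *			break;
+ *	}
+ *	splx(s);
+ *
+ * A nonzero error is EWOULDBLOCK (the hz-tick timeout expired) or
+ * EINTR/ERESTART (a signal arrived; PCATCH).  The interrupt side does:
+ *
+ *	sc->flags |= SC_READY;
+ *	wakeup(sc);
+ */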
+
+/*
+ * asleep() - async sleep call. Place process on wait queue and return
+ * immediately without blocking. The process stays runnable until await()
+ * is called. If ident is NULL, remove process from wait queue if it is still
+ * on one.
+ *
+ * Only the most recent sleep condition is effective when making successive
+ * calls to asleep() or when calling tsleep().
+ *
+ * The timeout, if any, is not initiated until await() is called. The sleep
+ * priority, signal, and timeout are specified in the asleep() call but may be
+ * overridden in the await() call.
+ *
+ * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
+ */
+
+int
+asleep(void *ident, int priority, const char *wmesg, int timo)
+{
+ struct proc *p = curproc;
+ int s;
+
+ /*
+ * splhigh() while manipulating sleep structures and slpque.
+ *
+ * Remove preexisting wait condition (if any) and place process
+ * on appropriate slpque, but do not put process to sleep.
+ */
+
+ s = splhigh();
+
+ if (p->p_wchan != NULL)
+ unsleep(p);
+
+ if (ident) {
+ p->p_wchan = ident;
+ p->p_wmesg = wmesg;
+ p->p_slptime = 0;
+ p->p_asleep.as_priority = priority;
+ p->p_asleep.as_timo = timo;
+ TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
+ }
+
+ splx(s);
+
+ return(0);
+}
+
+/*
+ * await() - wait for async condition to occur. The process blocks until
+ * wakeup() is called on the most recent asleep() address. If wakeup is called
+ * prior to await(), await() winds up being a NOP.
+ *
+ * If await() is called more than once (without an intervening asleep() call),
+ * await() is still effectively a NOP but it calls mi_switch() to give other
+ * processes some cpu before returning. The process is left runnable.
+ *
+ * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
+ */
+
+int
+await(int priority, int timo)
+{
+ struct proc *p = curproc;
+ int s;
+
+ s = splhigh();
+
+ if (p->p_wchan != NULL) {
+ struct callout_handle thandle;
+ int sig;
+ int catch;
+
+ /*
+ * The call to await() can override defaults specified in
+ * the original asleep().
+ */
+ if (priority < 0)
+ priority = p->p_asleep.as_priority;
+ if (timo < 0)
+ timo = p->p_asleep.as_timo;
+
+ /*
+ * Install timeout
+ */
+
+ if (timo)
+ thandle = timeout(endtsleep, (void *)p, timo);
+
+ sig = 0;
+ catch = priority & PCATCH;
+
+ if (catch) {
+ p->p_flag |= P_SINTR;
+ if ((sig = CURSIG(p))) {
+ if (p->p_wchan)
+ unsleep(p);
+ p->p_stat = SRUN;
+ goto resume;
+ }
+ if (p->p_wchan == NULL) {
+ catch = 0;
+ goto resume;
+ }
+ }
+ p->p_stat = SSLEEP;
+ p->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+resume:
+ curpriority = p->p_usrpri;
+
+ splx(s);
+ p->p_flag &= ~P_SINTR;
+ if (p->p_flag & P_TIMEOUT) {
+ p->p_flag &= ~P_TIMEOUT;
+ if (sig == 0) {
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ return (EWOULDBLOCK);
+ }
+ } else if (timo)
+ untimeout(endtsleep, (void *)p, thandle);
+ if (catch && (sig != 0 || (sig = CURSIG(p)))) {
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ if (p->p_sigacts->ps_sigintr & sigmask(sig))
+ return (EINTR);
+ return (ERESTART);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_CSW))
+ ktrcsw(p->p_tracep, 0, 0);
+#endif
+ } else {
+ /*
+ * If as_priority is 0, await() has been called without an
+ * intervening asleep(). We are still effectively a NOP,
+ * but we call mi_switch() for safety.
+ */
+
+ if (p->p_asleep.as_priority == 0) {
+ p->p_stats->p_ru.ru_nvcsw++;
+ mi_switch();
+ }
+ splx(s);
+ }
+
+ /*
+ * clear p_asleep.as_priority as an indication that await() has been
+ * called. If await() is called again without an intervening asleep(),
+ * await() is still effectively a NOP but the above mi_switch() code
+ * is triggered as a safety.
+ */
+ p->p_asleep.as_priority = 0;
+
+ return (0);
+}
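+
+/*
+ * Usage sketch (illustrative, not part of the original source): the
+ * asleep()/await() pair lets a process queue itself before releasing a
+ * resource, so a wakeup() in the window between the two calls is not
+ * lost (await() then simply returns, as noted above):
+ *
+ *	asleep(chan, PZERO | PCATCH, "wait", 0);
+ *	release_lock();		("release_lock" is hypothetical)
+ *	error = await(-1, -1);	(-1, -1: keep asleep()'s values)
+ */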
+
+/*
+ * Implement timeout for tsleep or asleep()/await()
+ *
+ * If process hasn't been awakened (wchan non-zero),
+ * set timeout flag and undo the sleep. If proc
+ * is stopped, just unsleep so it will remain stopped.
+ */
+static void
+endtsleep(arg)
+ void *arg;
+{
+ register struct proc *p;
+ int s;
+
+ p = (struct proc *)arg;
+ s = splhigh();
+ if (p->p_wchan) {
+ if (p->p_stat == SSLEEP)
+ setrunnable(p);
+ else
+ unsleep(p);
+ p->p_flag |= P_TIMEOUT;
+ }
+ splx(s);
+}
+
+/*
+ * Remove a process from its wait queue
+ */
+void
+unsleep(p)
+ register struct proc *p;
+{
+ int s;
+
+ s = splhigh();
+ if (p->p_wchan) {
+ TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
+ p->p_wchan = 0;
+ }
+ splx(s);
+}
+
+/*
+ * Make all processes sleeping on the specified identifier runnable.
+ */
+void
+wakeup(ident)
+ register void *ident;
+{
+ register struct slpquehead *qp;
+ register struct proc *p;
+ int s;
+
+ s = splhigh();
+ qp = &slpque[LOOKUP(ident)];
+restart:
+ for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
+ if (p->p_wchan == ident) {
+ TAILQ_REMOVE(qp, p, p_procq);
+ p->p_wchan = 0;
+ if (p->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(p); */
+ if (p->p_slptime > 1)
+ updatepri(p);
+ p->p_slptime = 0;
+ p->p_stat = SRUN;
+ if (p->p_flag & P_INMEM) {
+ setrunqueue(p);
+ maybe_resched(p);
+ } else {
+ p->p_flag |= P_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ /* END INLINE EXPANSION */
+ goto restart;
+ }
+ }
+ }
+ splx(s);
+}
+
+/*
+ * Make a process sleeping on the specified identifier runnable.
+ * May wake more than one process if a target process is currently
+ * swapped out.
+ */
+void
+wakeup_one(ident)
+ register void *ident;
+{
+ register struct slpquehead *qp;
+ register struct proc *p;
+ int s;
+
+ s = splhigh();
+ qp = &slpque[LOOKUP(ident)];
+
+ for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
+ if (p->p_wchan == ident) {
+ TAILQ_REMOVE(qp, p, p_procq);
+ p->p_wchan = 0;
+ if (p->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(p); */
+ if (p->p_slptime > 1)
+ updatepri(p);
+ p->p_slptime = 0;
+ p->p_stat = SRUN;
+ if (p->p_flag & P_INMEM) {
+ setrunqueue(p);
+ maybe_resched(p);
+ break;
+ } else {
+ p->p_flag |= P_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ /* END INLINE EXPANSION */
+ }
+ }
+ }
+ splx(s);
+}
+
+/*
+ * The machine independent parts of mi_switch().
+ * Must be called at splstatclock() or higher.
+ */
+void
+mi_switch()
+{
+ register struct proc *p = curproc; /* XXX */
+ register struct rlimit *rlim;
+ int x;
+
+ /*
+ * XXX this spl is almost unnecessary. It is partly to allow for
+ * sloppy callers that don't do it (issignal() via CURSIG() is the
+ * main offender). It is partly to work around a bug in the i386
+ * cpu_switch() (the ipl is not preserved). We ran for years
+ * without it. I think there was only an interrupt latency problem.
+ * The main caller, tsleep(), does an splx() a couple of instructions
+ * after calling here. The buggy caller, issignal(), usually calls
+ * here at spl0() and sometimes returns at splhigh(). The process
+ * then runs for a little too long at splhigh(). The ipl gets fixed
+ * when the process returns to user mode (or earlier).
+ *
+ * It would probably be better to always call here at spl0(). Callers
+ * are prepared to give up control to another process, so they must
+ * be prepared to be interrupted. The clock stuff here may not
+ * actually need splstatclock().
+ */
+ x = splstatclock();
+
+#ifdef SIMPLELOCK_DEBUG
+ if (p->p_simple_locks)
+ printf("sleep: holding simple lock\n");
+#endif
+ /*
+ * Compute the amount of time during which the current
+ * process was running, and add that to its total so far.
+ */
+ microuptime(&switchtime);
+ p->p_runtime += (switchtime.tv_usec - p->p_switchtime.tv_usec) +
+ (switchtime.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000;
+
+ /*
+ * Check if the process exceeds its cpu resource allocation.
+ * If over max, kill it.
+ */
+ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
+ p->p_runtime > p->p_limit->p_cpulimit) {
+ rlim = &p->p_rlimit[RLIMIT_CPU];
+ if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
+ killproc(p, "exceeded maximum CPU limit");
+ } else {
+ psignal(p, SIGXCPU);
+ if (rlim->rlim_cur < rlim->rlim_max) {
+ /* XXX: we should make a private copy */
+ rlim->rlim_cur += 5;
+ }
+ }
+ }
+
+ /*
+ * Pick a new current process and record its start time.
+ */
+ cnt.v_swtch++;
+ cpu_switch(p);
+ if (switchtime.tv_sec)
+ p->p_switchtime = switchtime;
+ else
+ microuptime(&p->p_switchtime);
+ splx(x);
+}
+
+/*
+ * Initialize the (doubly-linked) run queues
+ * to be empty.
+ */
+/* ARGSUSED*/
+static void
+rqinit(dummy)
+ void *dummy;
+{
+ register int i;
+
+ for (i = 0; i < NQS; i++) {
+ qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
+ rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i];
+ idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i];
+ }
+}
+
+/*
+ * Change process state to be runnable,
+ * placing it on the run queue if it is in memory,
+ * and awakening the swapper if it isn't in memory.
+ */
+void
+setrunnable(p)
+ register struct proc *p;
+{
+ register int s;
+
+ s = splhigh();
+ switch (p->p_stat) {
+ case 0:
+ case SRUN:
+ case SZOMB:
+ default:
+ panic("setrunnable");
+ case SSTOP:
+ case SSLEEP:
+ unsleep(p); /* e.g. when sending signals */
+ break;
+
+ case SIDL:
+ break;
+ }
+ p->p_stat = SRUN;
+ if (p->p_flag & P_INMEM)
+ setrunqueue(p);
+ splx(s);
+ if (p->p_slptime > 1)
+ updatepri(p);
+ p->p_slptime = 0;
+ if ((p->p_flag & P_INMEM) == 0) {
+ p->p_flag |= P_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ else
+ maybe_resched(p);
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+void
+resetpriority(p)
+ register struct proc *p;
+{
+ register unsigned int newpriority;
+
+ if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
+ newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
+ newpriority = min(newpriority, MAXPRI);
+ p->p_usrpri = newpriority;
+ }
+ maybe_resched(p);
+}
+
+/* ARGSUSED */
+static void sched_setup __P((void *dummy));
+static void
+sched_setup(dummy)
+ void *dummy;
+{
+ /* Kick off timeout driven events by calling first time. */
+ roundrobin(NULL);
+ schedcpu(NULL);
+}
+SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
+
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
new file mode 100644
index 0000000..e1192a9
--- /dev/null
+++ b/sys/kern/kern_syscalls.c
@@ -0,0 +1,109 @@
+/*-
+ * Copyright (c) 1999 Assar Westerlund
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_syscalls.c,v 1.2 1999/01/09 14:59:50 dfr Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/syscall.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+#include <sys/proc.h>
+
+/*
+ * Acts like "nosys" but can be identified in sysent for dynamic call
+ * number assignment for a limited number of calls.
+ *
+ * Placeholder for system call slots reserved for loadable modules.
+ */
+int
+lkmnosys(struct proc *p, struct nosys_args *args)
+{
+ return(nosys(p, args));
+}
+
+int
+syscall_register(int *offset, struct sysent *new_sysent,
+ struct sysent *old_sysent)
+{
+ if (*offset == NO_SYSCALL) {
+ int i;
+
+ for (i = 1; i < SYS_MAXSYSCALL; ++i)
+ if (sysent[i].sy_call == (sy_call_t *)lkmnosys)
+ break;
+ if (i == SYS_MAXSYSCALL)
+ return ENFILE;
+ *offset = i;
+ } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
+ return EINVAL;
+ else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys)
+ return EEXIST;
+
+ *old_sysent = sysent[*offset];
+ sysent[*offset] = *new_sysent;
+ return 0;
+}
+
+int
+syscall_deregister(int *offset, struct sysent *old_sysent)
+{
+ if (*offset)
+ sysent[*offset] = *old_sysent;
+ return 0;
+}
+
+int
+syscall_module_handler(struct module *mod, int what, void *arg)
+{
+ struct syscall_module_data *data = (struct syscall_module_data*)arg;
+ modspecific_t ms;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD :
+ error = syscall_register(data->offset, data->new_sysent,
+ &data->old_sysent);
+ if (error)
+ return error;
+ ms.intval = *data->offset;
+ module_setspecific(mod, &ms);
+ break;
+ case MOD_UNLOAD :
+ error = syscall_deregister(data->offset, &data->old_sysent);
+ if (error)
+ return error;
+ break;
+ }
+ if (data->chainevh)
+ return data->chainevh(mod, what, data->chainarg);
+ else
+ return 0;
+}
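+
+/*
+ * Usage sketch (hypothetical module code, not compiled in): a loadable
+ * module claims one of the lkmnosys slots. With *offset == NO_SYSCALL,
+ * syscall_register() scans sysent[] for a free slot and returns its number.
+ * example_syscall() and the example_* variables are illustrative names.
+ */
+#if 0
+static int
+example_syscall(struct proc *p, void *uap)
+{
+ return (0);
+}
+
+static struct sysent example_sysent = { 0, (sy_call_t *)example_syscall };
+static struct sysent example_old_sysent;
+static int example_offset = NO_SYSCALL; /* ask for dynamic assignment */
+
+static int
+example_load(void)
+{
+ int error;
+
+ error = syscall_register(&example_offset, &example_sysent,
+ &example_old_sysent);
+ if (error == 0)
+ printf("example syscall installed as %d\n", example_offset);
+ return (error);
+}
+#endif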
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
new file mode 100644
index 0000000..fbf2f6a
--- /dev/null
+++ b/sys/kern/kern_sysctl.c
@@ -0,0 +1,1122 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $Id: kern_sysctl.c,v 1.81 1998/12/27 18:03:29 dfr Exp $
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
+
+/*
+ * Locking and stats
+ */
+static struct sysctl_lock {
+ int sl_lock;
+ int sl_want;
+ int sl_locked;
+} memlock;
+
+static int sysctl_root SYSCTL_HANDLER_ARGS;
+
+extern struct linker_set sysctl_;
+
+/*
+ * Initialization of the MIB tree.
+ *
+ * Order by number in each linker_set.
+ */
+
+static int
+sysctl_order_cmp(const void *a, const void *b)
+{
+ struct sysctl_oid const * const *pa;
+ struct sysctl_oid const * const *pb;
+
+ pa = (struct sysctl_oid const * const *)a;
+ pb = (struct sysctl_oid const * const *)b;
+ if (*pa == NULL && *pb == NULL)
+ return 0;
+ if (*pa == NULL)
+ return (1);
+ if (*pb == NULL)
+ return (-1);
+ return ((*pa)->oid_number - (*pb)->oid_number);
+}
+
+static void
+sysctl_order(void *arg)
+{
+ int j, k;
+ struct linker_set *l = (struct linker_set *) arg;
+ struct sysctl_oid **oidpp;
+
+ /* First, find the highest oid we have */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (k = 0; j--; oidpp++) {
+ if (!*oidpp)
+ continue;
+ if ((*oidpp)->oid_arg1 == arg) {
+ *oidpp = 0;
+ continue;
+ }
+ if ((*oidpp)->oid_number > k)
+ k = (*oidpp)->oid_number;
+ }
+
+ /* Next, replace all OID_AUTO oids with new numbers */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ k += 100;
+ for (; j--; oidpp++)
+ if (*oidpp && (*oidpp)->oid_number == OID_AUTO)
+ (*oidpp)->oid_number = k++;
+
+ /* Finally: sort by oid */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (; j--; oidpp++) {
+ if (!*oidpp)
+ continue;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE)
+ if (!(*oidpp)->oid_handler)
+ sysctl_order((*oidpp)->oid_arg1);
+ }
+ qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0],
+ sysctl_order_cmp);
+}
+
+SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_);
+
+void
+sysctl_order_all(void)
+{
+ sysctl_order(&sysctl_);
+}
+
+/*
+ * "Staff-functions"
+ *
+ * These functions implement a presently undocumented interface
+ * used by the sysctl program to walk the tree, and get the type
+ * so it can print the value.
+ * This interface is under work and consideration, and should probably
+ * be killed with a big axe by the first person who can find the time.
+ * (Be aware, though, that the proper interface isn't as obvious as it
+ * may seem; there are various conflicting requirements.)
+ *
+ * {0,0} printf the entire MIB-tree.
+ * {0,1,...} return the name of the "..." OID.
+ * {0,2,...} return the next OID.
+ * {0,3} return the OID of the name in "new"
+ * {0,4,...} return the kind & format info for the "..." OID.
+ */
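+
+/*
+ * Userland sketch (hypothetical, not kernel code): stepping through the
+ * tree with the {0,2,...} "next" node. Appending an existing OID to {0,2}
+ * returns the numeric name of the OID that follows it.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <string.h>
+
+int
+example_next_oid(int *name, u_int namelen, int *next, size_t *lenp)
+{
+ int oid[CTL_MAXNAME + 2];
+
+ oid[0] = 0; /* sysctl internal magic */
+ oid[1] = 2; /* the "next" function */
+ memcpy(oid + 2, name, namelen * sizeof(int));
+ return (sysctl(oid, namelen + 2, next, lenp, NULL, 0));
+}
+#endif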
+
+static void
+sysctl_sysctl_debug_dump_node(struct linker_set *l, int i)
+{
+ int j, k;
+ struct sysctl_oid **oidpp;
+
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (; j--; oidpp++) {
+
+ if (!*oidpp)
+ continue;
+
+ for (k=0; k<i; k++)
+ printf(" ");
+
+ printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name);
+
+ printf("%c%c",
+ (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ',
+ (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' ');
+
+ if ((*oidpp)->oid_handler)
+ printf(" *Handler");
+
+ switch ((*oidpp)->oid_kind & CTLTYPE) {
+ case CTLTYPE_NODE:
+ printf(" Node\n");
+ if (!(*oidpp)->oid_handler) {
+ sysctl_sysctl_debug_dump_node(
+ (*oidpp)->oid_arg1, i+2);
+ }
+ break;
+ case CTLTYPE_INT: printf(" Int\n"); break;
+ case CTLTYPE_STRING: printf(" String\n"); break;
+ case CTLTYPE_QUAD: printf(" Quad\n"); break;
+ case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
+ default: printf("\n");
+ }
+
+ }
+}
+
+static int
+sysctl_sysctl_debug SYSCTL_HANDLER_ARGS
+{
+ sysctl_sysctl_debug_dump_node(&sysctl_, 0);
+ return ENOENT;
+}
+
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+ 0, 0, sysctl_sysctl_debug, "-", "");
+
+static int
+sysctl_sysctl_name SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error = 0;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+ char buf[10];
+
+ while (namelen) {
+ if (!lsp) {
+ snprintf(buf,sizeof(buf),"%d",*name);
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ if (error)
+ return (error);
+ namelen--;
+ name++;
+ continue;
+ }
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+ j = lsp->ls_length;
+ lsp = 0;
+ for (i = 0; i < j; i++, oidpp++) {
+ if (!*oidpp || ((*oidpp)->oid_number != *name))
+ continue;
+
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, (*oidpp)->oid_name,
+ strlen((*oidpp)->oid_name));
+ if (error)
+ return (error);
+
+ namelen--;
+ name++;
+
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if ((*oidpp)->oid_handler)
+ break;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ break;
+ }
+ }
+ return (SYSCTL_OUT(req, "", 1));
+}
+
+SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
+
+static int
+sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen,
+ int *next, int *len, int level, struct sysctl_oid **oidp)
+{
+ int i, j;
+ struct sysctl_oid **oidpp;
+
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+ j = lsp->ls_length;
+ *len = level;
+ for (i = 0; i < j; i++, oidpp++) {
+ if (!*oidpp)
+ continue;
+
+ *next = (*oidpp)->oid_number;
+ *oidp = *oidpp;
+
+ if (!namelen) {
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if ((*oidpp)->oid_handler)
+ /* We really should call the handler here...*/
+ return 0;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1,
+ len, level+1, oidp))
+ return 0;
+ goto next;
+ }
+
+ if ((*oidpp)->oid_number < *name)
+ continue;
+
+ if ((*oidpp)->oid_number > *name) {
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if ((*oidpp)->oid_handler)
+ return 0;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1,
+ next+1, len, level+1, oidp))
+ return (0);
+ goto next;
+ }
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ continue;
+
+ if ((*oidpp)->oid_handler)
+ continue;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1,
+ len, level+1, oidp))
+ return (0);
+ next:
+ namelen = 1;
+ *len = level;
+ }
+ return 1;
+}
+
+static int
+sysctl_sysctl_next SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error;
+ struct sysctl_oid *oid;
+ struct linker_set *lsp = &sysctl_;
+ int newoid[CTL_MAXNAME];
+
+ i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid);
+ if (i)
+ return ENOENT;
+ error = SYSCTL_OUT(req, newoid, j * sizeof (int));
+ return (error);
+}
+
+SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
+
+static int
+name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp)
+{
+ int i, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+ char *p;
+
+ if (!*name)
+ return ENOENT;
+
+ p = name + strlen(name) - 1;
+ if (*p == '.')
+ *p = '\0';
+
+ *len = 0;
+
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+
+ while (j-- && *len < CTL_MAXNAME) {
+ if (!*oidpp)
+ continue;
+ if (strcmp(name, (*oidpp)->oid_name)) {
+ oidpp++;
+ continue;
+ }
+ *oid++ = (*oidpp)->oid_number;
+ (*len)++;
+
+ if (!i) {
+ if (oidp)
+ *oidp = *oidpp;
+ return (0);
+ }
+
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if ((*oidpp)->oid_handler)
+ break;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ name = p+1;
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+ }
+ return ENOENT;
+}
+
+static int
+sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS
+{
+ char *p;
+ int error, oid[CTL_MAXNAME], len;
+ struct sysctl_oid *op = 0;
+
+ if (!req->newlen)
+ return ENOENT;
+
+ p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
+
+ error = SYSCTL_IN(req, p, req->newlen);
+ if (error) {
+ free(p, M_SYSCTL);
+ return (error);
+ }
+
+ p[req->newlen] = '\0';
+
+ error = name2oid(p, oid, &len, &op);
+
+ free(p, M_SYSCTL);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_OUT(req, oid, len * sizeof *oid);
+ return (error);
+}
+
+SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0,
+ sysctl_sysctl_name2oid, "I", "");
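+
+/*
+ * Userland sketch (hypothetical): resolving a dotted name such as
+ * "kern.maxproc" to its numeric OID through the {0,3} node. The string is
+ * passed as the "new" value; the integer vector comes back as "old"
+ * (*lenp is in bytes, as with any sysctl(2) "old" buffer).
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <string.h>
+
+int
+example_name2oid(char *name, int *oid, size_t *lenp)
+{
+ int req[2] = { 0, 3 };
+
+ return (sysctl(req, 2, oid, lenp, name, strlen(name)));
+}
+#endif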
+
+static int
+sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1, error;
+ u_int namelen = arg2;
+ int indx, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+
+ indx = 0;
+ while (j-- && indx < CTL_MAXNAME) {
+ if (*oidpp && ((*oidpp)->oid_number == name[indx])) {
+ indx++;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if ((*oidpp)->oid_handler)
+ goto found;
+ if (indx == namelen)
+ goto found;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ } else {
+ if (indx != namelen)
+ return EISDIR;
+ goto found;
+ }
+ } else {
+ oidpp++;
+ }
+ }
+ return ENOENT;
+found:
+ if (!(*oidpp)->oid_fmt)
+ return ENOENT;
+ error = SYSCTL_OUT(req,
+ &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind));
+ if (!error)
+ error = SYSCTL_OUT(req, (*oidpp)->oid_fmt,
+ strlen((*oidpp)->oid_fmt)+1);
+ return (error);
+}
+
+SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, "");
+
+/*
+ * Default "handler" functions.
+ */
+
+/*
+ * Handle an int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_int SYSCTL_HANDLER_ARGS
+{
+ int error = 0;
+
+ if (arg1)
+ error = SYSCTL_OUT(req, arg1, sizeof(int));
+ else
+ error = SYSCTL_OUT(req, &arg2, sizeof(int));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(int));
+ return (error);
+}
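+
+/*
+ * Sketch of the two cases (hypothetical declarations): a variable passes
+ * its address in arg1 and may be writable; a constant passes 0 in arg1 and
+ * the value in arg2, which is why a write attempt returns EPERM above.
+ */
+#if 0
+static int example_knob = 10;
+SYSCTL_INT(_kern, OID_AUTO, knob, CTLFLAG_RW, &example_knob, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, fortytwo, CTLFLAG_RD, 0, 42, "");
+#endif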
+
+/*
+ * Handle a long, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_long SYSCTL_HANDLER_ARGS
+{
+ int error = 0;
+
+ error = SYSCTL_OUT(req, arg1, sizeof(long));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(long));
+ return (error);
+}
+
+/*
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
+ */
+
+int
+sysctl_handle_string SYSCTL_HANDLER_ARGS
+{
+ int error=0;
+
+ error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1);
+
+ if (error || !req->newptr || !arg2)
+ return (error);
+
+ if ((req->newlen - req->newidx) > arg2) {
+ error = E2BIG;
+ } else {
+ arg2 = (req->newlen - req->newidx);
+ error = SYSCTL_IN(req, arg1, arg2);
+ ((char *)arg1)[arg2] = '\0';
+ }
+
+ return (error);
+}
+
+/*
+ * Handle any kind of opaque data.
+ * arg1 points to it, arg2 is the size.
+ */
+
+int
+sysctl_handle_opaque SYSCTL_HANDLER_ARGS
+{
+ int error;
+
+ error = SYSCTL_OUT(req, arg1, arg2);
+
+ if (error || !req->newptr)
+ return (error);
+
+ error = SYSCTL_IN(req, arg1, arg2);
+
+ return (error);
+}
+
+/*
+ * Transfer functions to/from kernel space.
+ * XXX: rather untested at this point
+ */
+static int
+sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i = 0;
+
+ if (req->oldptr) {
+ i = l;
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ bcopy(p, (char *)req->oldptr + req->oldidx, i);
+ }
+ req->oldidx += l;
+ if (req->oldptr && i != l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
+{
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ bcopy((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (0);
+}
+
+int
+kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval)
+{
+ int error = 0;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.p = p;
+
+ if (oldlenp) {
+ req.oldlen = *oldlenp;
+ }
+
+ if (old) {
+ req.oldptr = old;
+ }
+
+ if (newlen) {
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_kernel;
+ req.newfunc = sysctl_new_kernel;
+ req.lock = 1;
+
+ /* XXX this should probably be done in a general way */
+ while (memlock.sl_lock) {
+ memlock.sl_want = 1;
+ (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0);
+ memlock.sl_locked++;
+ }
+ memlock.sl_lock = 1;
+
+ error = sysctl_root(0, name, namelen, &req);
+
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen, B_WRITE);
+
+ memlock.sl_lock = 0;
+
+ if (memlock.sl_want) {
+ memlock.sl_want = 0;
+ wakeup((caddr_t)&memlock);
+ }
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
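+
+/*
+ * Sketch (hypothetical caller): reading kern.clockrate from within the
+ * kernel. The name must already be numeric and no user copyin/copyout
+ * takes place; example_clockrate() is an illustrative name.
+ */
+#if 0
+static int
+example_clockrate(struct proc *p, struct clockinfo *ci)
+{
+ int name[2] = { CTL_KERN, KERN_CLOCKRATE };
+ size_t len = sizeof(*ci);
+
+ return (kernel_sysctl(p, name, 2, ci, &len, NULL, 0, NULL));
+}
+#endif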
+
+/*
+ * Transfer function to/from user space.
+ */
+static int
+sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
+{
+ int error = 0;
+ size_t i = 0;
+
+ if (req->lock == 1 && req->oldptr) {
+ vslock(req->oldptr, req->oldlen);
+ req->lock = 2;
+ }
+ if (req->oldptr) {
+ i = l;
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ error = copyout(p, (char *)req->oldptr + req->oldidx,
+ i);
+ }
+ req->oldidx += l;
+ if (error)
+ return (error);
+ if (req->oldptr && i < l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
+{
+ int error;
+
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ error = copyin((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (error);
+}
+
+/*
+ * Traverse our tree, and find the right node, execute whatever it points
+ * at, and return the resulting error code.
+ */
+
+int
+sysctl_root SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int indx, i, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+
+ indx = 0;
+ while (j-- && indx < CTL_MAXNAME) {
+ if (*oidpp && ((*oidpp)->oid_number == name[indx])) {
+ indx++;
+ if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK)
+ req->lock = 0;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if ((*oidpp)->oid_handler)
+ goto found;
+ if (indx == namelen)
+ return ENOENT;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ } else {
+ if (indx != namelen)
+ return EISDIR;
+ goto found;
+ }
+ } else {
+ oidpp++;
+ }
+ }
+ return ENOENT;
+found:
+ /* If writing isn't allowed */
+ if (req->newptr && (!((*oidpp)->oid_kind & CTLFLAG_WR) ||
+ (((*oidpp)->oid_kind & CTLFLAG_SECURE) && securelevel > 0)))
+ return (EPERM);
+
+ /* Most likely only root can write */
+ if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) &&
+ req->newptr && req->p &&
+ (i = suser(req->p->p_ucred, &req->p->p_acflag)))
+ return (i);
+
+ if (!(*oidpp)->oid_handler)
+ return EINVAL;
+
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ i = ((*oidpp)->oid_handler) (*oidpp,
+ name + indx, namelen - indx,
+ req);
+ } else {
+ i = ((*oidpp)->oid_handler) (*oidpp,
+ (*oidpp)->oid_arg1, (*oidpp)->oid_arg2,
+ req);
+ }
+ return (i);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sysctl_args {
+ int *name;
+ u_int namelen;
+ void *old;
+ size_t *oldlenp;
+ void *new;
+ size_t newlen;
+};
+#endif
+
+int
+__sysctl(struct proc *p, struct sysctl_args *uap)
+{
+ int error, i, name[CTL_MAXNAME];
+ size_t j;
+
+ if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
+ return (EINVAL);
+
+ error = copyin(uap->name, &name, uap->namelen * sizeof(int));
+ if (error)
+ return (error);
+
+ error = userland_sysctl(p, name, uap->namelen,
+ uap->old, uap->oldlenp, 0,
+ uap->new, uap->newlen, &j);
+ if (error && error != ENOMEM)
+ return (error);
+ if (uap->oldlenp) {
+ i = copyout(&j, uap->oldlenp, sizeof(j));
+ if (i)
+ return (i);
+ }
+ return (error);
+}
+
+/*
+ * This is used from various compatibility syscalls too. That's why name
+ * must be in kernel space.
+ */
+int
+userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval)
+{
+ int error = 0;
+ struct sysctl_req req, req2;
+
+ bzero(&req, sizeof req);
+
+ req.p = p;
+
+ if (oldlenp) {
+ if (inkernel) {
+ req.oldlen = *oldlenp;
+ } else {
+ error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
+ if (error)
+ return (error);
+ }
+ }
+
+ if (old) {
+ if (!useracc(old, req.oldlen, B_WRITE))
+ return (EFAULT);
+ req.oldptr = old;
+ }
+
+ if (newlen) {
+ if (!useracc(new, newlen, B_READ))
+ return (EFAULT);
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_user;
+ req.newfunc = sysctl_new_user;
+ req.lock = 1;
+
+ /* XXX this should probably be done in a general way */
+ while (memlock.sl_lock) {
+ memlock.sl_want = 1;
+ (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0);
+ memlock.sl_locked++;
+ }
+ memlock.sl_lock = 1;
+
+ do {
+ req2 = req;
+ error = sysctl_root(0, name, namelen, &req2);
+ } while (error == EAGAIN);
+
+ req = req2;
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen, B_WRITE);
+
+ memlock.sl_lock = 0;
+
+ if (memlock.sl_want) {
+ memlock.sl_want = 0;
+ wakeup((caddr_t)&memlock);
+ }
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43
+#include <sys/socket.h>
+#include <vm/vm_param.h>
+
+#define KINFO_PROC (0<<8)
+#define KINFO_RT (1<<8)
+#define KINFO_VNODE (2<<8)
+#define KINFO_FILE (3<<8)
+#define KINFO_METER (4<<8)
+#define KINFO_LOADAVG (5<<8)
+#define KINFO_CLOCKRATE (6<<8)
+
+/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
+#define KINFO_BSDI_SYSINFO (101<<8)
+
+/*
+ * XXX this is bloat, but I hope it's better here than on the potentially
+ * limited kernel stack... -Peter
+ */
+
+static struct {
+ int bsdi_machine; /* "i386" on BSD/386 */
+/* ^^^ this is an offset to the string, relative to the struct start */
+ char *pad0;
+ long pad1;
+ long pad2;
+ long pad3;
+ u_long pad4;
+ u_long pad5;
+ u_long pad6;
+
+ int bsdi_ostype; /* "BSD/386" on BSD/386 */
+ int bsdi_osrelease; /* "1.1" on BSD/386 */
+ long pad7;
+ long pad8;
+ char *pad9;
+
+ long pad10;
+ long pad11;
+ int pad12;
+ long pad13;
+ quad_t pad14;
+ long pad15;
+
+ struct timeval pad16;
+ /* we don't set this, because BSDI's uname used gethostname() instead */
+ int bsdi_hostname; /* hostname on BSD/386 */
+
+ /* the actual string data is appended here */
+
+} bsdi_si;
+/*
+ * this data is appended to the end of the bsdi_si structure during copyout.
+ * The "char *" offsets are relative to the base of the bsdi_si struct.
+ * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
+ * should not exceed the length of the buffer here... (or else!! :-)
+ */
+static char bsdi_strings[80]; /* It had better be less than this! */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getkerninfo_args {
+ int op;
+ char *where;
+ size_t *size;
+ int arg;
+};
+#endif
+
+int
+ogetkerninfo(struct proc *p, struct getkerninfo_args *uap)
+{
+ int error, name[6];
+ size_t size;
+
+ switch (uap->op & 0xff00) {
+
+ case KINFO_RT:
+ name[0] = CTL_NET;
+ name[1] = PF_ROUTE;
+ name[2] = 0;
+ name[3] = (uap->op & 0xff0000) >> 16;
+ name[4] = uap->op & 0xff;
+ name[5] = uap->arg;
+ error = userland_sysctl(p, name, 6, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_VNODE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_VNODE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_PROC:
+ name[0] = CTL_KERN;
+ name[1] = KERN_PROC;
+ name[2] = uap->op & 0xff;
+ name[3] = uap->arg;
+ error = userland_sysctl(p, name, 4, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_FILE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_FILE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_METER:
+ name[0] = CTL_VM;
+ name[1] = VM_METER;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_LOADAVG:
+ name[0] = CTL_VM;
+ name[1] = VM_LOADAVG;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_CLOCKRATE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_CLOCKRATE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
+ break;
+
+ case KINFO_BSDI_SYSINFO: {
+ /*
+ * this is pretty crude, but it's just enough for uname()
+ * from BSDI's 1.x libc to work.
+ *
+ * In particular, it doesn't return the same results when
+ * the supplied buffer is too small. BSDI's version apparently
+ * will return the amount copied, and set the *size to how
+ * much was needed. The emulation framework here isn't capable
+ * of that, so we just set both to the amount copied.
+ * BSDI's 2.x product apparently fails with ENOMEM in this
+ * scenario.
+ */
+
+ u_int needed;
+ u_int left;
+ char *s;
+
+ bzero((char *)&bsdi_si, sizeof(bsdi_si));
+ bzero(bsdi_strings, sizeof(bsdi_strings));
+
+ s = bsdi_strings;
+
+ bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, ostype);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, osrelease);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, machine);
+ s += strlen(s) + 1;
+
+ needed = sizeof(bsdi_si) + (s - bsdi_strings);
+
+ if (uap->where == NULL) {
+ /* process is asking how much buffer to supply.. */
+ size = needed;
+ error = 0;
+ break;
+ }
+
+ if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
+ break;
+
+ /* if too much buffer supplied, trim it down */
+ if (size > needed)
+ size = needed;
+
+ /* how much of the buffer is remaining */
+ left = size;
+
+ if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
+ break;
+
+ /* is there any point in continuing? */
+ if (left > sizeof(bsdi_si)) {
+ left -= sizeof(bsdi_si);
+ error = copyout(&bsdi_strings,
+ uap->where + sizeof(bsdi_si), left);
+ }
+ break;
+ }
+
+ default:
+ return (EOPNOTSUPP);
+ }
+ if (error)
+ return (error);
+ p->p_retval[0] = size;
+ if (uap->size)
+ error = copyout((caddr_t)&size, (caddr_t)uap->size,
+ sizeof(size));
+ return (error);
+}
+#endif /* COMPAT_43 */
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
new file mode 100644
index 0000000..2ea378f
--- /dev/null
+++ b/sys/kern/kern_tc.c
@@ -0,0 +1,870 @@
+/*-
+ * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dkstat.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/timex.h>
+#include <vm/vm.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/limits.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+#if defined(SMP) && defined(BETTER_CLOCK)
+#include <machine/smp.h>
+#endif
+
+/* This is where the NTIMECOUNTER option hangs out */
+#include "opt_ntp.h"
+
+/*
+ * Number of timecounters used to implement stable storage
+ */
+#ifndef NTIMECOUNTER
+#define NTIMECOUNTER 5
+#endif
+
+static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter",
+ "Timecounter stable storage");
+
+static void initclocks __P((void *dummy));
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+static void tco_forward __P((int force));
+static void tco_setscales __P((struct timecounter *tc));
+static __inline unsigned tco_delta __P((struct timecounter *tc));
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+#if defined(SMP) && defined(BETTER_CLOCK)
+long cp_time[CPUSTATES];
+#else
+static long cp_time[CPUSTATES];
+#endif
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
+time_t time_second;
+
+/*
+ * Which update policy to use.
+ * 0 - every tick, bad hardware may fail with "calcru negative..."
+ * 1 - more resistant to the above hardware, but less efficient.
+ */
+static int tco_method;
+
+/*
+ * Implement a dummy timecounter which we can use until we get a real one
+ * in the air. This allows the console and other early stuff to use
+ * time services.
+ */
+
+static unsigned
+dummy_get_timecount(struct timecounter *tc)
+{
+ static unsigned now;
+ return (++now);
+}
+
+static struct timecounter dummy_timecounter = {
+ dummy_get_timecount,
+ 0,
+ ~0u,
+ 1000000,
+ "dummy"
+};
+
+struct timecounter *timecounter = &dummy_timecounter;
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other.
+ *
+ * The main timer, running hz times per second, is used to trigger interval
+ * timers, timeouts and rescheduling as needed.
+ *
+ * The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ *
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
+ */
+
+int stathz;
+int profhz;
+static int profprocs;
+int ticks;
+static int psdiv, pscnt; /* prof => stat divider */
+int psratio; /* ratio: prof / stat */
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ psdiv = pscnt = 1;
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+}
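+
+/*
+ * Worked example (illustrative numbers): with stathz = 128 and a profiling
+ * clock of profhz = 1024, psratio = 1024 / 128 = 8, so while profiling is
+ * active (psdiv = pscnt = 8 in startprofclock() below) statclock() charges
+ * statistics only on every 8th tick.
+ */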
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(frame)
+ register struct clockframe *frame;
+{
+ register struct proc *p;
+
+ p = curproc;
+ if (p) {
+ register struct pstats *pstats;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ if (CLKF_USERMODE(frame) &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ psignal(p, SIGVTALRM);
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ psignal(p, SIGPROF);
+ }
+
+#if defined(SMP) && defined(BETTER_CLOCK)
+ forward_hardclock(pscnt);
+#endif
+
+ /*
+ * If no separate statistics clock is available, run it from here.
+ */
+ if (stathz == 0)
+ statclock(frame);
+
+ tco_forward(0);
+ ticks++;
+
+ /*
+ * Process callouts at a very low cpu priority, so we don't keep the
+ * relatively high clock interrupt priority any longer than necessary.
+ */
+ if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
+ if (CLKF_BASEPRI(frame)) {
+ /*
+ * Save the overhead of a software interrupt;
+ * it will happen as soon as we return, so do it now.
+ */
+ (void)splsoftclock();
+ softclock();
+ } else
+ setsoftclock();
+ } else if (softticks + 1 == ticks)
+ ++softticks;
+}
+
+/*
+ * Compute number of ticks in the specified amount of time.
+ */
+int
+tvtohz(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ sec = tv->tv_sec;
+ usec = tv->tv_usec;
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ if (usec > 0) {
+ sec++;
+ usec -= 1000000;
+ }
+ printf("tvotohz: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return ((int)ticks);
+}
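+
+/*
+ * Worked example (illustrative, hz = 100, i.e. tick = 10000 usec): for
+ * tv = { 1, 500 } the first branch applies and
+ * (1 * 1000000 + 500 + 9999) / 10000 + 1 = 101 + 1 = 102 ticks,
+ * i.e. the value is rounded up and one extra tick is added to cover the
+ * current, already partially expired, tick.
+ */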
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ if (++profprocs == 1 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = psratio;
+ setstatclockrate(profhz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if (p->p_flag & P_PROFIL) {
+ p->p_flag &= ~P_PROFIL;
+ if (--profprocs == 0 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = 1;
+ setstatclockrate(stathz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Statistics clock. Grab profile sample, and if divider reaches 0,
+ * do process and kernel statistics.
+ */
+void
+statclock(frame)
+ register struct clockframe *frame;
+{
+#ifdef GPROF
+ register struct gmonparam *g;
+ int i;
+#endif
+ register struct proc *p;
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+
+ if (curproc != NULL && CLKF_USERMODE(frame)) {
+ p = curproc;
+ if (p->p_flag & P_PROFIL)
+ addupc_intr(p, CLKF_PC(frame), 1);
+#if defined(SMP) && defined(BETTER_CLOCK)
+ if (stathz != 0)
+ forward_statclock(pscnt);
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled, record the tick.
+ */
+ p->p_uticks++;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ } else {
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = CLKF_PC(frame) - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+#if defined(SMP) && defined(BETTER_CLOCK)
+ if (stathz != 0)
+ forward_statclock(pscnt);
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ p = curproc;
+ if (CLKF_INTR(frame)) {
+ if (p != NULL)
+ p->p_iticks++;
+ cp_time[CP_INTR]++;
+ } else if (p != NULL) {
+ p->p_sticks++;
+ cp_time[CP_SYS]++;
+ } else
+ cp_time[CP_IDLE]++;
+ }
+ pscnt = psdiv;
+
+ /*
+ * We maintain statistics shown by user-level statistics
+ * programs: the amount of time in each cpu state.
+ */
+
+ /*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (p_estcpu) is increased here. The formula for computing
+ * priorities (in kern_synch.c) will compute a different value each
+ * time p_estcpu increases by 4. The cpu usage estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+ if (p != NULL) {
+ p->p_cpticks++;
+ if (++p->p_estcpu == 0)
+ p->p_estcpu--;
+ if ((p->p_estcpu & 3) == 0) {
+ resetpriority(p);
+ if (p->p_priority >= PUSER)
+ p->p_priority = p->p_usrpri;
+ }
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+ }
+}
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.tickadj = tickadj;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo","");
+
+static __inline unsigned
+tco_delta(struct timecounter *tc)
+{
+
+ return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) &
+ tc->tc_counter_mask);
+}
+
+/*
+ * We have four functions for looking at the clock, two for microseconds
+ * and two for nanoseconds. For each there is fast but less precise
+ * version "get{nano|micro}time" which will return a time which is up
+ * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time"
+ * will return a timestamp which is as precise as possible.
+ */
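+
+/*
+ * Sketch (hypothetical callers; "struct example_record" is an illustrative
+ * name): bookkeeping that only needs tick resolution should take the cheap
+ * cached copy, while timestamps that must order events pay for the precise
+ * hardware read instead.
+ */
+#if 0
+struct example_record {
+ struct timeval r_logged;
+ struct timeval r_precise;
+};
+
+static void
+example_stamp(struct example_record *r)
+{
+ getmicrotime(&r->r_logged); /* cheap; may lag by up to 1/HZ */
+ microtime(&r->r_precise); /* precise; samples the hardware counter */
+}
+#endif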
+
+void
+getmicrotime(struct timeval *tvp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ *tvp = tc->tc_microtime;
+ } else {
+ microtime(tvp);
+ }
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ *tsp = tc->tc_nanotime;
+ } else {
+ nanotime(tsp);
+ }
+}
+
+void
+microtime(struct timeval *tv)
+{
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ tv->tv_sec = tc->tc_offset_sec;
+ tv->tv_usec = tc->tc_offset_micro;
+ tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
+ tv->tv_usec += boottime.tv_usec;
+ tv->tv_sec += boottime.tv_sec;
+ while (tv->tv_usec >= 1000000) {
+ tv->tv_usec -= 1000000;
+ tv->tv_sec++;
+ }
+}
+
+void
+nanotime(struct timespec *ts)
+{
+ unsigned count;
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count = tco_delta(tc);
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ delta += boottime.tv_usec * 1000;
+ ts->tv_sec += boottime.tv_sec;
+ while (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+void
+timecounter_timespec(unsigned count, struct timespec *ts)
+{
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count -= tc->tc_offset_count;
+ count &= tc->tc_counter_mask;
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ delta += boottime.tv_usec * 1000;
+ ts->tv_sec += boottime.tv_sec;
+ while (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ tvp->tv_sec = tc->tc_offset_sec;
+ tvp->tv_usec = tc->tc_offset_micro;
+ } else {
+ microuptime(tvp);
+ }
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+ struct timecounter *tc;
+
+ if (!tco_method) {
+ tc = timecounter;
+ tsp->tv_sec = tc->tc_offset_sec;
+ tsp->tv_nsec = tc->tc_offset_nano >> 32;
+ } else {
+ nanouptime(tsp);
+ }
+}
+
+void
+microuptime(struct timeval *tv)
+{
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ tv->tv_sec = tc->tc_offset_sec;
+ tv->tv_usec = tc->tc_offset_micro;
+ tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
+ if (tv->tv_usec >= 1000000) {
+ tv->tv_usec -= 1000000;
+ tv->tv_sec++;
+ }
+}
+
+void
+nanouptime(struct timespec *ts)
+{
+ unsigned count;
+ u_int64_t delta;
+ struct timecounter *tc;
+
+ tc = (struct timecounter *)timecounter;
+ ts->tv_sec = tc->tc_offset_sec;
+ count = tco_delta(tc);
+ delta = tc->tc_offset_nano;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_f);
+ delta >>= 32;
+ delta += ((u_int64_t)count * tc->tc_scale_nano_i);
+ if (delta >= 1000000000) {
+ delta -= 1000000000;
+ ts->tv_sec++;
+ }
+ ts->tv_nsec = delta;
+}
+
+static void
+tco_setscales(struct timecounter *tc)
+{
+ u_int64_t scale;
+
+ scale = 1000000000LL << 32;
+ if (tc->tc_adjustment > 0)
+ scale += (tc->tc_adjustment * 1000LL) << 10;
+ else
+ scale -= (-tc->tc_adjustment * 1000LL) << 10;
+ scale /= tc->tc_frequency;
+ tc->tc_scale_micro = scale / 1000;
+ tc->tc_scale_nano_f = scale & 0xffffffff;
+ tc->tc_scale_nano_i = scale >> 32;
+}
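+
+/*
+ * Worked example (illustrative, i8254 running at 1193182 Hz, adjustment 0):
+ * scale = (1000000000 << 32) / 1193182 ~ 838.096 * 2^32, giving
+ * tc_scale_nano_i = 838, tc_scale_nano_f ~ 0.096 * 2^32 and
+ * tc_scale_micro ~ 0.838 * 2^32; multiplying a counter delta by these
+ * 32.32 fixed-point constants converts counts to nano-/microseconds.
+ */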
+
+void
+init_timecounter(struct timecounter *tc)
+{
+ struct timespec ts1;
+ struct timecounter *t1, *t2, *t3;
+ int i;
+
+ tc->tc_adjustment = 0;
+ tco_setscales(tc);
+ tc->tc_offset_count = tc->tc_get_timecount(tc);
+ tc->tc_tweak = tc;
+ MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK);
+ *t1 = *tc;
+ t2 = t1;
+ for (i = 1; i < NTIMECOUNTER; i++) {
+ MALLOC(t3, struct timecounter *, sizeof *t3,
+ M_TIMECOUNTER, M_WAITOK);
+ *t3 = *tc;
+ t3->tc_other = t2;
+ t2 = t3;
+ }
+ t1->tc_other = t3;
+ tc = t1;
+
+ printf("Timecounter \"%s\" frequency %lu Hz\n",
+ tc->tc_name, (u_long)tc->tc_frequency);
+
+ /* XXX: For now always start using the counter. */
+ tc->tc_offset_count = tc->tc_get_timecount(tc);
+ nanouptime(&ts1);
+ tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32;
+ tc->tc_offset_micro = ts1.tv_nsec / 1000;
+ tc->tc_offset_sec = ts1.tv_sec;
+ timecounter = tc;
+}
+
+void
+set_timecounter(struct timespec *ts)
+{
+ struct timespec ts2;
+
+ nanouptime(&ts2);
+ boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
+ boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
+ if (boottime.tv_usec < 0) {
+ boottime.tv_usec += 1000000;
+ boottime.tv_sec--;
+ }
+ /* fiddle all the little crinkly bits around the fiords... */
+ tco_forward(1);
+}
+
+#if 0 /* Currently unused */
+void
+switch_timecounter(struct timecounter *newtc)
+{
+ int s;
+ struct timecounter *tc;
+ struct timespec ts;
+
+ s = splclock();
+ tc = timecounter;
+ if (newtc == tc || newtc == tc->tc_other) {
+ splx(s);
+ return;
+ }
+ nanouptime(&ts);
+ newtc->tc_offset_sec = ts.tv_sec;
+ newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32;
+ newtc->tc_offset_micro = ts.tv_nsec / 1000;
+ newtc->tc_offset_count = newtc->tc_get_timecount(newtc);
+ timecounter = newtc;
+ splx(s);
+}
+#endif
+
+static struct timecounter *
+sync_other_counter(void)
+{
+ struct timecounter *tc, *tcn, *tco;
+ unsigned delta;
+
+ tco = timecounter;
+ tc = tco->tc_other;
+ tcn = tc->tc_other;
+ *tc = *tco;
+ tc->tc_other = tcn;
+ delta = tco_delta(tc);
+ tc->tc_offset_count += delta;
+ tc->tc_offset_count &= tc->tc_counter_mask;
+ tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f;
+ tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32;
+ return (tc);
+}
+
+static void
+tco_forward(int force)
+{
+ struct timecounter *tc, *tco;
+
+ tco = timecounter;
+ tc = sync_other_counter();
+ /*
+ * We may be inducing a tiny error here, the tc_poll_pps() may
+ * process a latched count which happens after the tco_delta()
+ * in sync_other_counter(), which would extend the previous
+ * counters parameters into the domain of this new one.
+ * Since the timewindow is very small for this, the error is
+ * going to be only a few weenieseconds (as Dave Mills would
+ * say), so let's just not talk more about it, OK?
+ */
+ if (tco->tc_poll_pps)
+ tco->tc_poll_pps(tco);
+ if (timedelta != 0) {
+ tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32;
+ timedelta -= tickdelta;
+ force++;
+ }
+
+ while (tc->tc_offset_nano >= 1000000000ULL << 32) {
+ tc->tc_offset_nano -= 1000000000ULL << 32;
+ tc->tc_offset_sec++;
+ tc->tc_frequency = tc->tc_tweak->tc_frequency;
+ tc->tc_adjustment = tc->tc_tweak->tc_adjustment;
+ ntp_update_second(tc); /* XXX only needed if xntpd runs */
+ tco_setscales(tc);
+ force++;
+ }
+
+ if (tco_method && !force)
+ return;
+
+ tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32;
+
+ /* Figure out the wall-clock time */
+ tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec;
+ tc->tc_nanotime.tv_nsec =
+ (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000;
+ tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec;
+ if (tc->tc_nanotime.tv_nsec >= 1000000000) {
+ tc->tc_nanotime.tv_nsec -= 1000000000;
+ tc->tc_microtime.tv_usec -= 1000000;
+ tc->tc_nanotime.tv_sec++;
+ }
+ time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec;
+
+ timecounter = tc;
+}
+
+static int
+sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
+{
+
+ return (sysctl_handle_opaque(oidp,
+ &timecounter->tc_tweak->tc_frequency,
+ sizeof(timecounter->tc_tweak->tc_frequency), req));
+}
+
+static int
+sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
+{
+
+ return (sysctl_handle_opaque(oidp,
+ &timecounter->tc_tweak->tc_adjustment,
+ sizeof(timecounter->tc_tweak->tc_adjustment), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+
+SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0,
+	"This variable determines the method used for updating timecounters. "
+	"If the default algorithm (0) fails with \"calcru negative...\" messages, "
+	"try the alternate algorithm (1), which handles bad hardware better.");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW,
+ 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW,
+ 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", "");
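+
+/*
+ * (Usage note, not in the original: the knobs above surface as
+ * kern.timecounter.method, kern.timecounter.frequency and
+ * kern.timecounter.adjustment, so e.g. "sysctl -w kern.timecounter.method=1"
+ * selects the alternate update algorithm described in the help string.)
+ */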
diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c
new file mode 100644
index 0000000..57e8d96
--- /dev/null
+++ b/sys/kern/kern_threads.c
@@ -0,0 +1,154 @@
+/*
+ *
+ * Portions of this code were derived from the file kern_fork.c and as such
+ * are subject to the copyrights below.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1996 Douglas Santry
+ *
+ * This code is subject to the beer copyright. If I chance to meet you in a
+ * bar and this code helped you in some way, you owe me a beer. Only
+ * in Germany will I accept domestic beer. This code may or may not work
+ * and I certainly make no claims as to its fitness for *any* purpose.
+ *
+ * $Id: kern_threads.c,v 1.9 1998/10/25 17:44:51 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+
+/*
+ * Low-level support for the sleep/wakeup paradigm.
+ * If a timeout is specified:
+ *	returns 0 on wakeup,
+ *	returns EAGAIN if it timed out,
+ *	returns EINVAL on error.
+ *
+ * If a timeout is not specified:
+ *	returns the time spent waiting, in ticks.
+ */
+int
+thr_sleep(struct proc *p, struct thr_sleep_args *uap) {
+ int sleepstart;
+ struct timespec ts;
+ struct timeval atv;
+ int error, timo;
+
+ timo = 0;
+ if (uap->timeout != 0) {
+ /*
+ * Get timespec struct
+ */
+		if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
+ p->p_wakeup = 0;
+ return error;
+ }
+ if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
+ p->p_wakeup = 0;
+ return (EINVAL);
+ }
+ TIMESPEC_TO_TIMEVAL(&atv, &ts);
+ if (itimerfix(&atv)) {
+ p->p_wakeup = 0;
+ return (EINVAL);
+ }
+ timo = tvtohz(&atv);
+ }
+
+ p->p_retval[0] = 0;
+ if (p->p_wakeup == 0) {
+ sleepstart = ticks;
+ p->p_flag |= P_SINTR;
+ error = tsleep(p, PRIBIO, "thrslp", timo);
+ p->p_flag &= ~P_SINTR;
+ if (error == EWOULDBLOCK) {
+ p->p_wakeup = 0;
+ p->p_retval[0] = EAGAIN;
+ return 0;
+ }
+ if (uap->timeout == 0)
+ p->p_retval[0] = ticks - sleepstart;
+ }
+ p->p_wakeup = 0;
+ return (0);
+}
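+
+/*
+ * (Usage sketch, hypothetical and not from this file: a userland
+ * threads package would park a thread for at most 50 ms with
+ *
+ *	struct timespec ts = { 0, 50000000 };
+ *	thr_sleep(&ts);
+ *
+ * which returns 0 on wakeup and EAGAIN, via p_retval, on timeout; a
+ * sibling thread releases it early with thr_wakeup(pid).)
+ */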
+
+int
+thr_wakeup(struct proc *p, struct thr_wakeup_args *uap) {
+ struct proc *pSlave = p->p_leader;
+
+ while(pSlave && (pSlave->p_pid != uap->pid))
+ pSlave = pSlave->p_peers;
+
+ if(pSlave == 0) {
+ p->p_retval[0] = ESRCH;
+ return(0);
+ }
+
+ pSlave->p_wakeup++;
+ if((pSlave->p_stat == SSLEEP) && (pSlave->p_wchan == pSlave)) {
+ wakeup(pSlave);
+ return(0);
+ }
+
+ p->p_retval[0] = EAGAIN;
+ return 0;
+}
+
+/*
+ * General purpose yield system call
+ */
+int
+yield(struct proc *p, struct yield_args *uap) {
+ int s;
+
+ p->p_retval[0] = 0;
+
+ s = splhigh();
+ p->p_priority = MAXPRI;
+ setrunqueue(p);
+ mi_switch();
+ splx(s);
+
+ return(0);
+}
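+
+/*
+ * (Note, not in the original: yield() drops the caller to MAXPRI, the
+ * weakest priority, before setrunqueue()/mi_switch(), so every other
+ * runnable process gets a chance to run before the caller is
+ * rescheduled.)
+ */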
+
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
new file mode 100644
index 0000000..2bd17bb
--- /dev/null
+++ b/sys/kern/kern_time.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_time.c 8.1 (Berkeley) 6/10/93
+ * $Id: kern_time.c,v 1.58 1998/06/09 13:10:53 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/sysproto.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+struct timezone tz;
+
+/*
+ * Time of day and interval timer support.
+ *
+ * These routines provide the kernel entry points to get and set
+ * the time-of-day and per-process interval timers. Subroutines
+ * here provide support for adding and subtracting timeval structures
+ * and decrementing interval timers, optionally reloading the interval
+ * timers when they expire.
+ */
+
+static int nanosleep1 __P((struct proc *p, struct timespec *rqt,
+ struct timespec *rmt));
+static int settime __P((struct timeval *));
+static void timevalfix __P((struct timeval *));
+static void no_lease_updatetime __P((int));
+
+static void
+no_lease_updatetime(deltat)
+ int deltat;
+{
+}
+
+void (*lease_updatetime) __P((int)) = no_lease_updatetime;
+
+static int
+settime(tv)
+ struct timeval *tv;
+{
+ struct timeval delta, tv1;
+ struct timespec ts;
+ int s;
+
+ s = splclock();
+ microtime(&tv1);
+ delta = *tv;
+ timevalsub(&delta, &tv1);
+
+ /*
+ * If the system is secure, we do not allow the time to be
+ * set to an earlier value (it may be slowed using adjtime,
+	 * but not set back). This feature prevents interlopers from
+ * setting arbitrary time stamps on files.
+ */
+ if (delta.tv_sec < 0 && securelevel > 1) {
+ splx(s);
+ return (EPERM);
+ }
+
+ ts.tv_sec = tv->tv_sec;
+ ts.tv_nsec = tv->tv_usec * 1000;
+ set_timecounter(&ts);
+ (void) splsoftclock();
+ lease_updatetime(delta.tv_sec);
+ splx(s);
+ resettodr();
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_gettime_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+
+/* ARGSUSED */
+int
+clock_gettime(p, uap)
+ struct proc *p;
+ struct clock_gettime_args *uap;
+{
+ struct timespec ats;
+
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ nanotime(&ats);
+ return (copyout(&ats, SCARG(uap, tp), sizeof(ats)));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_settime_args {
+ clockid_t clock_id;
+ const struct timespec *tp;
+};
+#endif
+
+/* ARGSUSED */
+int
+clock_settime(p, uap)
+ struct proc *p;
+ struct clock_settime_args *uap;
+{
+ struct timeval atv;
+ struct timespec ats;
+ int error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+ return (error);
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
+ return (error);
+ if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000)
+ return (EINVAL);
+ /* XXX Don't convert nsec->usec and back */
+ TIMESPEC_TO_TIMEVAL(&atv, &ats);
+ if ((error = settime(&atv)))
+ return (error);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getres_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+
+int
+clock_getres(p, uap)
+ struct proc *p;
+ struct clock_getres_args *uap;
+{
+ struct timespec ts;
+ int error;
+
+ if (SCARG(uap, clock_id) != CLOCK_REALTIME)
+ return (EINVAL);
+ error = 0;
+ if (SCARG(uap, tp)) {
+ ts.tv_sec = 0;
+ ts.tv_nsec = 1000000000 / timecounter->tc_frequency;
+ error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
+ }
+ return (error);
+}
+
+static int nanowait;
+
+static int
+nanosleep1(p, rqt, rmt)
+ struct proc *p;
+ struct timespec *rqt, *rmt;
+{
+ struct timespec ts, ts2, ts3;
+ struct timeval tv;
+ int error;
+
+ if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
+ return (EINVAL);
+	if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
+ return (0);
+ getnanouptime(&ts);
+ timespecadd(&ts, rqt);
+ TIMESPEC_TO_TIMEVAL(&tv, rqt);
+ for (;;) {
+ error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
+ tvtohz(&tv));
+ getnanouptime(&ts2);
+ if (error != EWOULDBLOCK) {
+ if (error == ERESTART)
+ error = EINTR;
+ if (rmt != NULL) {
+ timespecsub(&ts, &ts2);
+ if (ts.tv_sec < 0)
+ timespecclear(&ts);
+ *rmt = ts;
+ }
+ return (error);
+ }
+ if (timespeccmp(&ts2, &ts, >=))
+ return (0);
+ ts3 = ts;
+ timespecsub(&ts3, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+ }
+}
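+
+/*
+ * (Explanatory note, not in the original: the deadline ts is computed
+ * once, up front, against the monotonic uptime clock; if tsleep() wakes
+ * early or tvtohz() rounded short, the loop re-arms with the remaining
+ * time, so an uninterrupted nanosleep never returns before the
+ * deadline has actually passed.)
+ */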
+
+#ifndef _SYS_SYSPROTO_H_
+struct nanosleep_args {
+ struct timespec *rqtp;
+ struct timespec *rmtp;
+};
+#endif
+
+/* ARGSUSED */
+int
+nanosleep(p, uap)
+ struct proc *p;
+ struct nanosleep_args *uap;
+{
+ struct timespec rmt, rqt;
+ int error, error2;
+
+ error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt));
+ if (error)
+ return (error);
+ if (SCARG(uap, rmtp))
+ if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt), B_WRITE))
+ return (EFAULT);
+ error = nanosleep1(p, &rqt, &rmt);
+ if (error && SCARG(uap, rmtp)) {
+ error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
+ if (error2) /* XXX shouldn't happen, did useracc() above */
+ return (error2);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct gettimeofday_args {
+ struct timeval *tp;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+gettimeofday(p, uap)
+ struct proc *p;
+ register struct gettimeofday_args *uap;
+{
+ struct timeval atv;
+ int error = 0;
+
+ if (uap->tp) {
+ microtime(&atv);
+ if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
+ sizeof (atv))))
+ return (error);
+ }
+ if (uap->tzp)
+ error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
+ sizeof (tz));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct settimeofday_args {
+ struct timeval *tv;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+settimeofday(p, uap)
+ struct proc *p;
+ struct settimeofday_args *uap;
+{
+ struct timeval atv;
+ struct timezone atz;
+ int error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ /* Verify all parameters before changing time. */
+ if (uap->tv) {
+ if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
+ sizeof(atv))))
+ return (error);
+ if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
+ return (EINVAL);
+ }
+ if (uap->tzp &&
+ (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
+ return (error);
+ if (uap->tv && (error = settime(&atv)))
+ return (error);
+ if (uap->tzp)
+ tz = atz;
+ return (0);
+}
+
+int tickdelta; /* current clock skew, us. per tick */
+long timedelta; /* unapplied time correction, us. */
+static long bigadj = 1000000; /* use 10x skew above bigadj us. */
+
+#ifndef _SYS_SYSPROTO_H_
+struct adjtime_args {
+ struct timeval *delta;
+ struct timeval *olddelta;
+};
+#endif
+/* ARGSUSED */
+int
+adjtime(p, uap)
+ struct proc *p;
+ register struct adjtime_args *uap;
+{
+ struct timeval atv;
+ register long ndelta, ntickdelta, odelta;
+ int s, error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ if ((error =
+ copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval))))
+ return (error);
+
+ /*
+ * Compute the total correction and the rate at which to apply it.
+ * Round the adjustment down to a whole multiple of the per-tick
+ * delta, so that after some number of incremental changes in
+ * hardclock(), tickdelta will become zero, lest the correction
+ * overshoot and start taking us away from the desired final time.
+ */
+ ndelta = atv.tv_sec * 1000000 + atv.tv_usec;
+ if (ndelta > bigadj || ndelta < -bigadj)
+ ntickdelta = 10 * tickadj;
+ else
+ ntickdelta = tickadj;
+ if (ndelta % ntickdelta)
+ ndelta = ndelta / ntickdelta * ntickdelta;
+
+ /*
+ * To make hardclock()'s job easier, make the per-tick delta negative
+ * if we want time to run slower; then hardclock can simply compute
+ * tick + tickdelta, and subtract tickdelta from timedelta.
+ */
+ if (ndelta < 0)
+ ntickdelta = -ntickdelta;
+ s = splclock();
+ odelta = timedelta;
+ timedelta = ndelta;
+ tickdelta = ntickdelta;
+ splx(s);
+
+ if (uap->olddelta) {
+ atv.tv_sec = odelta / 1000000;
+ atv.tv_usec = odelta % 1000000;
+ (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta,
+ sizeof(struct timeval));
+ }
+ return (0);
+}
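+
+/*
+ * (Worked example, assuming tickadj is 5 us: a request for 1003 us is
+ * rounded down to 1000 us = 200 * 5 us, so after 200 hardclock() ticks
+ * timedelta reaches exactly zero rather than overshooting; a request
+ * over bigadj (1 s) is applied at 10 * tickadj per tick instead.)
+ */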
+
+/*
+ * Get value of an interval timer. The process virtual and
+ * profiling virtual time timers are kept in the p_stats area, since
+ * they can be swapped out. These are kept internally in the
+ * way they are specified externally: in time until they expire.
+ *
+ * The real time interval timer is kept in the process table slot
+ * for the process, and its value (it_value) is kept as an
+ * absolute time rather than as a delta, so that it is easy to keep
+ * periodic real-time signals from drifting.
+ *
+ * Virtual time timers are processed in the hardclock() routine of
+ * kern_clock.c. The real time timer is processed by a timeout
+ * routine, called from the softclock() routine. Since a callout
+ * may be delayed in real time due to interrupt processing in the system,
+ * it is possible for the real time timeout routine (realitexpire, given below)
+ * to be delayed in real time past when it is supposed to occur. It
+ * does not suffice, therefore, to reload the real timer's .it_value from the
+ * real time timer's .it_interval. Rather, we compute the next time in
+ * absolute time the timer should go off.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getitimer_args {
+ u_int which;
+ struct itimerval *itv;
+};
+#endif
+/* ARGSUSED */
+int
+getitimer(p, uap)
+ struct proc *p;
+ register struct getitimer_args *uap;
+{
+ struct timeval ctv;
+ struct itimerval aitv;
+ int s;
+
+ if (uap->which > ITIMER_PROF)
+ return (EINVAL);
+ s = splclock(); /* XXX still needed ? */
+ if (uap->which == ITIMER_REAL) {
+ /*
+ * Convert from absolute to relative time in .it_value
+ * part of real time timer. If time for real time timer
+ * has passed return 0, else return difference between
+ * current time and time for the timer to go off.
+ */
+ aitv = p->p_realtimer;
+ if (timevalisset(&aitv.it_value)) {
+ getmicrouptime(&ctv);
+ if (timevalcmp(&aitv.it_value, &ctv, <))
+ timevalclear(&aitv.it_value);
+ else
+ timevalsub(&aitv.it_value, &ctv);
+ }
+ } else
+ aitv = p->p_stats->p_timer[uap->which];
+ splx(s);
+ return (copyout((caddr_t)&aitv, (caddr_t)uap->itv,
+ sizeof (struct itimerval)));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setitimer_args {
+ u_int which;
+ struct itimerval *itv, *oitv;
+};
+#endif
+/* ARGSUSED */
+int
+setitimer(p, uap)
+ struct proc *p;
+ register struct setitimer_args *uap;
+{
+ struct itimerval aitv;
+ struct timeval ctv;
+ register struct itimerval *itvp;
+ int s, error;
+
+ if (uap->which > ITIMER_PROF)
+ return (EINVAL);
+ itvp = uap->itv;
+ if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
+ sizeof(struct itimerval))))
+ return (error);
+ if ((uap->itv = uap->oitv) &&
+ (error = getitimer(p, (struct getitimer_args *)uap)))
+ return (error);
+ if (itvp == 0)
+ return (0);
+ if (itimerfix(&aitv.it_value))
+ return (EINVAL);
+ if (!timevalisset(&aitv.it_value))
+ timevalclear(&aitv.it_interval);
+ else if (itimerfix(&aitv.it_interval))
+ return (EINVAL);
+ s = splclock(); /* XXX: still needed ? */
+ if (uap->which == ITIMER_REAL) {
+ if (timevalisset(&p->p_realtimer.it_value))
+ untimeout(realitexpire, (caddr_t)p, p->p_ithandle);
+ if (timevalisset(&aitv.it_value))
+ p->p_ithandle = timeout(realitexpire, (caddr_t)p,
+ tvtohz(&aitv.it_value));
+ getmicrouptime(&ctv);
+ timevaladd(&aitv.it_value, &ctv);
+ p->p_realtimer = aitv;
+ } else
+ p->p_stats->p_timer[uap->which] = aitv;
+ splx(s);
+ return (0);
+}
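+
+/*
+ * (Example, not in the original: arming a 2 s ITIMER_REAL one-shot at
+ * uptime 100 s stores it_value = 102 s absolute in p_realtimer;
+ * getitimer() above converts back to the relative "seconds remaining"
+ * form that userland, e.g. libc's alarm(), expects.)
+ */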
+
+/*
+ * Real interval timer expired:
+ * send process whose timer expired an alarm signal.
+ * If time is not set up to reload, then just return.
+ * Else compute next time timer should go off which is > current time.
+ * This is where delay in processing this timeout causes multiple
+ * SIGALRM calls to be compressed into one.
+ * tvtohz() always adds 1 to allow for the time until the next clock
+ * interrupt being strictly less than 1 clock tick, but we don't want
+ * that here since we want to appear to be in sync with the clock
+ * interrupt even when we're delayed.
+ */
+void
+realitexpire(arg)
+ void *arg;
+{
+ register struct proc *p;
+ struct timeval ctv, ntv;
+ int s;
+
+ p = (struct proc *)arg;
+ psignal(p, SIGALRM);
+ if (!timevalisset(&p->p_realtimer.it_interval)) {
+ timevalclear(&p->p_realtimer.it_value);
+ return;
+ }
+ for (;;) {
+		s = splclock(); /* XXX: still needed? */
+ timevaladd(&p->p_realtimer.it_value,
+ &p->p_realtimer.it_interval);
+ getmicrouptime(&ctv);
+ if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
+ ntv = p->p_realtimer.it_value;
+ timevalsub(&ntv, &ctv);
+ p->p_ithandle = timeout(realitexpire, (caddr_t)p,
+ tvtohz(&ntv) - 1);
+ splx(s);
+ return;
+ }
+ splx(s);
+ }
+}
+
+/*
+ * Check that a proposed value to load into the .it_value or
+ * .it_interval part of an interval timer is acceptable, and
+ * fix it to have at least minimal value (i.e., if it is less
+ * than the resolution of the clock, round it up).
+ */
+int
+itimerfix(tv)
+ struct timeval *tv;
+{
+
+ if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
+ tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
+ tv->tv_usec = tick;
+ return (0);
+}
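+
+/*
+ * (Example, assuming hz = 100 so tick = 10000 us: a requested 1 us
+ * one-shot is rounded up by itimerfix() to one full tick, the finest
+ * granularity the clock interrupt can deliver.)
+ */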
+
+/*
+ * Decrement an interval timer by a specified number
+ * of microseconds, which must be less than a second,
+ * i.e. < 1000000. If the timer expires, then reload
+ * it. In this case, carry over (usec - old value) to
+ * reduce the value reloaded into the timer so that
+ * the timer does not drift. This routine assumes
+ * that it is called in a context where the timers
+ * on which it is operating cannot change in value.
+ */
+int
+itimerdecr(itp, usec)
+ register struct itimerval *itp;
+ int usec;
+{
+
+ if (itp->it_value.tv_usec < usec) {
+ if (itp->it_value.tv_sec == 0) {
+ /* expired, and already in next interval */
+ usec -= itp->it_value.tv_usec;
+ goto expire;
+ }
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ itp->it_value.tv_usec -= usec;
+ usec = 0;
+ if (timevalisset(&itp->it_value))
+ return (1);
+ /* expired, exactly at end of interval */
+expire:
+ if (timevalisset(&itp->it_interval)) {
+ itp->it_value = itp->it_interval;
+ itp->it_value.tv_usec -= usec;
+ if (itp->it_value.tv_usec < 0) {
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ } else
+ itp->it_value.tv_usec = 0; /* sec is already 0 */
+ return (0);
+}
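+
+/*
+ * (Worked example, not in the original: with it_value = 300 us,
+ * it_interval = 1 s and usec = 1000, the timer overshoots by 700 us;
+ * the reload becomes 1 s - 700 us = 999300 us, so the period does not
+ * drift by the overshoot.)
+ */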
+
+/*
+ * Add and subtract routines for timevals.
+ * N.B.: subtract routine doesn't deal with
+ * results which are before the beginning,
+ * it just gets very confused in this case.
+ * Caveat emptor.
+ */
+void
+timevaladd(t1, t2)
+ struct timeval *t1, *t2;
+{
+
+ t1->tv_sec += t2->tv_sec;
+ t1->tv_usec += t2->tv_usec;
+ timevalfix(t1);
+}
+
+void
+timevalsub(t1, t2)
+ struct timeval *t1, *t2;
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+timevalfix(t1)
+ struct timeval *t1;
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
new file mode 100644
index 0000000..278fcce
--- /dev/null
+++ b/sys/kern/kern_timeout.c
@@ -0,0 +1,286 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_timeout.c,v 1.54 1998/02/25 06:13:32 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+
+/* Exported to machdep.c and/or kern_clock.c. */
+struct callout *callout;
+struct callout_list callfree;
+int callwheelsize, callwheelbits, callwheelmask;
+struct callout_tailq *callwheel;
+int softticks; /* Like ticks, but for softclock(). */
+
+static struct callout *nextsoftcheck; /* Next callout to be checked. */
+
+/*
+ * The callout mechanism is based on the work of Adam M. Costello and
+ * George Varghese, published in a technical report entitled "Redesigning
+ * the BSD Callout and Timer Facilities" and modified slightly for inclusion
+ * in FreeBSD by Justin T. Gibbs. The original work on the data structures
+ * used in this implementation was published by G.Varghese and A. Lauck in
+ * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
+ * the Efficient Implementation of a Timer Facility" in the Proceedings of
+ * the 11th ACM Annual Symposium on Operating Systems Principles,
+ * Austin, Texas Nov 1987.
+ */
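+
+/*
+ * (Sketch, not in the original text: the wheel is an array of
+ * callwheelsize = 2^callwheelbits tailq buckets, so insertion is O(1):
+ *
+ *	bucket = &callwheel[(ticks + to_ticks) & callwheelmask];
+ *
+ * softclock() scans only one bucket per tick and skips entries whose
+ * c_time belongs to a later revolution of the wheel.)
+ */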
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+void
+softclock()
+{
+ register struct callout *c;
+ register struct callout_tailq *bucket;
+ register int s;
+ register int curticks;
+ register int steps; /* #steps since we last allowed interrupts */
+
+#ifndef MAX_SOFTCLOCK_STEPS
+#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */
+#endif /* MAX_SOFTCLOCK_STEPS */
+
+ steps = 0;
+ s = splhigh();
+ while (softticks != ticks) {
+ softticks++;
+ /*
+ * softticks may be modified by hard clock, so cache
+ * it while we work on a given bucket.
+ */
+ curticks = softticks;
+ bucket = &callwheel[curticks & callwheelmask];
+ c = TAILQ_FIRST(bucket);
+ while (c) {
+ if (c->c_time != curticks) {
+ c = TAILQ_NEXT(c, c_links.tqe);
+ ++steps;
+ if (steps >= MAX_SOFTCLOCK_STEPS) {
+ nextsoftcheck = c;
+ /* Give interrupts a chance. */
+ splx(s);
+ s = splhigh();
+ c = nextsoftcheck;
+ steps = 0;
+ }
+ } else {
+ void (*c_func)(void *);
+ void *c_arg;
+
+ nextsoftcheck = TAILQ_NEXT(c, c_links.tqe);
+ TAILQ_REMOVE(bucket, c, c_links.tqe);
+ c_func = c->c_func;
+ c_arg = c->c_arg;
+ c->c_func = NULL;
+ SLIST_INSERT_HEAD(&callfree, c, c_links.sle);
+ splx(s);
+ c_func(c_arg);
+ s = splhigh();
+ steps = 0;
+ c = nextsoftcheck;
+ }
+ }
+ }
+ nextsoftcheck = NULL;
+ splx(s);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * callout_handle_init --
+ * Initialize a handle so that using it with untimeout is benign.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that although an
+ * identification value is returned from timeout, the original
+ * arguments to timeout as well as the identifier are used to
+ * identify entries for untimeout.
+ */
+struct callout_handle
+timeout(ftn, arg, to_ticks)
+ timeout_t *ftn;
+ void *arg;
+ register int to_ticks;
+{
+ int s;
+ struct callout *new;
+ struct callout_handle handle;
+
+ if (to_ticks <= 0)
+ to_ticks = 1;
+
+ /* Lock out the clock. */
+ s = splhigh();
+
+ /* Fill in the next free callout structure. */
+ new = SLIST_FIRST(&callfree);
+ if (new == NULL)
+ /* XXX Attempt to malloc first */
+ panic("timeout table full");
+
+ SLIST_REMOVE_HEAD(&callfree, c_links.sle);
+ new->c_arg = arg;
+ new->c_func = ftn;
+ new->c_time = ticks + to_ticks;
+ TAILQ_INSERT_TAIL(&callwheel[new->c_time & callwheelmask],
+ new, c_links.tqe);
+
+ splx(s);
+ handle.callout = new;
+ return (handle);
+}
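+
+/*
+ * (Usage sketch, with a hypothetical driver handler not from this file:
+ *
+ *	struct callout_handle h;
+ *	h = timeout(mydrv_tick, sc, hz);	(one-shot in ~1 second)
+ *	...
+ *	untimeout(mydrv_tick, sc, h);		(cancel if still pending)
+ *
+ * as documented above, both the handle and the (ftn, arg) pair must
+ * match for untimeout() to cancel the entry.)
+ */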
+
+void
+untimeout(ftn, arg, handle)
+ timeout_t *ftn;
+ void *arg;
+ struct callout_handle handle;
+{
+ register int s;
+
+ /*
+ * Check for a handle that was initialized
+ * by callout_handle_init, but never used
+ * for a real timeout.
+ */
+ if (handle.callout == NULL)
+ return;
+
+ s = splhigh();
+ if ((handle.callout->c_func == ftn)
+ && (handle.callout->c_arg == arg)) {
+ if (nextsoftcheck == handle.callout) {
+ nextsoftcheck = TAILQ_NEXT(handle.callout, c_links.tqe);
+ }
+ TAILQ_REMOVE(&callwheel[handle.callout->c_time & callwheelmask],
+ handle.callout, c_links.tqe);
+ handle.callout->c_func = NULL;
+ SLIST_INSERT_HEAD(&callfree, handle.callout, c_links.sle);
+ }
+ splx(s);
+}
+
+void
+callout_handle_init(struct callout_handle *handle)
+{
+ handle->callout = NULL;
+}
+
+#ifdef APM_FIXUP_CALLTODO
+/*
+ * Adjust the kernel calltodo timeout list. This routine is used after
+ * an APM resume to recalculate the calltodo timer list values with the
+ * number of hz's we have been sleeping. The next hardclock() will detect
+ * that there are fired timers and run softclock() to execute them.
+ *
+ * Please note, I have not done an exhaustive analysis of what code this
+ * might break. I am motivated to have my select()'s and alarm()'s that
+ * have expired during suspend firing upon resume so that the applications
+ * which set the timer can do the maintenance the timer was for as close
+ * as possible to the originally intended time. Testing this code for a
+ * week showed that resuming from a suspend resulted in 22 to 25 timers
+ * firing, which seemed independent of whether the suspend was 2 hours or
+ * 2 days. Your mileage may vary. - Ken Key <key@cs.utk.edu>
+ */
+void
+adjust_timeout_calltodo(time_change)
+ struct timeval *time_change;
+{
+ register struct callout *p;
+ unsigned long delta_ticks;
+ int s;
+
+ /*
+ * How many ticks were we asleep?
+ * (stolen from tvtohz()).
+ */
+
+ /* Don't do anything */
+ if (time_change->tv_sec < 0)
+ return;
+ else if (time_change->tv_sec <= LONG_MAX / 1000000)
+ delta_ticks = (time_change->tv_sec * 1000000 +
+ time_change->tv_usec + (tick - 1)) / tick + 1;
+ else if (time_change->tv_sec <= LONG_MAX / hz)
+ delta_ticks = time_change->tv_sec * hz +
+ (time_change->tv_usec + (tick - 1)) / tick + 1;
+ else
+ delta_ticks = LONG_MAX;
+
+ if (delta_ticks > INT_MAX)
+ delta_ticks = INT_MAX;
+
+ /*
+ * Now rip through the timer calltodo list looking for timers
+ * to expire.
+ */
+
+ /* don't collide with softclock() */
+ s = splhigh();
+ for (p = calltodo.c_next; p != NULL; p = p->c_next) {
+ p->c_time -= delta_ticks;
+
+ /* Break if the timer had more time on it than delta_ticks */
+ if (p->c_time > 0)
+ break;
+
+ /* take back the ticks the timer didn't use (p->c_time <= 0) */
+ delta_ticks = -p->c_time;
+ }
+ splx(s);
+
+ return;
+}
+#endif /* APM_FIXUP_CALLTODO */
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
new file mode 100644
index 0000000..b7cb83b
--- /dev/null
+++ b/sys/kern/kern_xxx.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
+ * $Id: kern_xxx.c,v 1.27 1997/12/16 17:40:21 eivind Exp $
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/utsname.h>
+
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+
+#ifndef _SYS_SYSPROTO_H_
+struct gethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostname(p, uap)
+ struct proc *p;
+ struct gethostname_args *uap;
+{
+ int name[2];
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(p, name, 2, uap->hostname, &len,
+ 1, 0, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+osethostname(p, uap)
+ struct proc *p;
+ register struct sethostname_args *uap;
+{
+ int name[2];
+ int error;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ return (userland_sysctl(p, name, 2, 0, 0, 0,
+ uap->hostname, uap->len, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogethostid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostid(p, uap)
+ struct proc *p;
+ struct ogethostid_args *uap;
+{
+
+ *(long *)(p->p_retval) = hostid;
+ return (0);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+#ifdef COMPAT_43
+#ifndef _SYS_SYSPROTO_H_
+struct osethostid_args {
+ long hostid;
+};
+#endif
+/* ARGSUSED */
+int
+osethostid(p, uap)
+ struct proc *p;
+ struct osethostid_args *uap;
+{
+ int error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ hostid = uap->hostid;
+ return (0);
+}
+
+int
+oquota(p, uap)
+ struct proc *p;
+ struct oquota_args *uap;
+{
+
+ return (ENOSYS);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct uname_args {
+ struct utsname *name;
+};
+#endif
+
+/* ARGSUSED */
+int
+uname(p, uap)
+ struct proc *p;
+ struct uname_args *uap;
+{
+ int name[2], rtval;
+ size_t len;
+ char *s, *us;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_OSTYPE;
+ len = sizeof uap->name->sysname;
+ rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len,
+ 1, 0, 0, 0);
+	if (rtval)
+		return rtval;
+	subyte(uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
+
+ name[1] = KERN_HOSTNAME;
+ len = sizeof uap->name->nodename;
+ rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len,
+ 1, 0, 0, 0);
+	if (rtval)
+		return rtval;
+	subyte(uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
+
+ name[1] = KERN_OSRELEASE;
+ len = sizeof uap->name->release;
+ rtval = userland_sysctl(p, name, 2, uap->name->release, &len,
+ 1, 0, 0, 0);
+	if (rtval)
+		return rtval;
+	subyte(uap->name->release + sizeof(uap->name->release) - 1, 0);
+
+/*
+	name[1] = KERN_VERSION;
+ len = sizeof uap->name->version;
+ rtval = userland_sysctl(p, name, 2, uap->name->version, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
+*/
+
+/*
+ * This stupid hackery makes the version field look like FreeBSD 1.1.
+ */
+	for (s = version; *s && *s != '#'; s++)
+		;
+
+	for (us = uap->name->version; *s && *s != ':'; s++) {
+		rtval = subyte(us++, *s);
+		if (rtval)
+			return rtval;
+	}
+	rtval = subyte(us++, 0);
+	if (rtval)
+		return rtval;
+
+ name[0] = CTL_HW;
+ name[1] = HW_MACHINE;
+ len = sizeof uap->name->machine;
+ rtval = userland_sysctl(p, name, 2, uap->name->machine, &len,
+ 1, 0, 0, 0);
+	if (rtval)
+		return rtval;
+	subyte(uap->name->machine + sizeof(uap->name->machine) - 1, 0);
+
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/* ARGSUSED */
+int
+getdomainname(p, uap)
+ struct proc *p;
+ struct getdomainname_args *uap;
+{
+	int domainnamelen = strlen(domainname) + 1;
+
+	if ((u_int)uap->len > domainnamelen)
+		uap->len = domainnamelen;
+ return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/* ARGSUSED */
+int
+setdomainname(p, uap)
+ struct proc *p;
+ struct setdomainname_args *uap;
+{
+ int error, domainnamelen;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ if ((u_int)uap->len > sizeof (domainname) - 1)
+ return EINVAL;
+ domainnamelen = uap->len;
+ error = copyin((caddr_t)uap->domainname, domainname, uap->len);
+ domainname[domainnamelen] = 0;
+ return (error);
+}
+
diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c
new file mode 100644
index 0000000..3718e253
--- /dev/null
+++ b/sys/kern/ksched.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 1996, 1997
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/resource.h>
+#include <machine/cpu.h> /* For need_resched */
+
+#include <posix4/posix4.h>
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+ struct timespec rr_interval;
+};
+
+int ksched_attach(struct ksched **p)
+{
+	struct ksched *ksched = p31b_malloc(sizeof(*ksched));
+
+ ksched->rr_interval.tv_sec = 0;
+ ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval();
+
+ *p = ksched;
+ return 0;
+}
+
+int ksched_detach(struct ksched *p)
+{
+ p31b_free(p);
+
+ return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ * POSIX 1003.1b requires that numerically higher priorities be of
+ * higher priority. It also permits sched_setparam to be
+ * implementation defined for SCHED_OTHER. I don't like
+ * the notion of inverted priorities for normal processes when
+ * you can use "setpriority" for that.
+ *
+ * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL.
+ */
+
+/* Macros to convert between the unix (numerically lower is higher priority)
+ * and POSIX 1003.1b (numerically higher is higher priority) conventions.
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
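+
+/* (Worked example, assuming RTP_PRIO_MIN == 0 and RTP_PRIO_MAX == 31:
+ * POSIX priority 0 maps to rtprio 31, the weakest real-time priority,
+ * and POSIX 31 maps to rtprio 0, the strongest; the conversion is its
+ * own inverse, and P1B_PRIO_MIN == 0, P1B_PRIO_MAX == 31.)
+ */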
+
+static __inline int
+getscheduler(int *ret, struct ksched *ksched, struct proc *p)
+{
+ int e = 0;
+
+ switch (p->p_rtprio.type)
+ {
+ case RTP_PRIO_FIFO:
+ *ret = SCHED_FIFO;
+ break;
+
+ case RTP_PRIO_REALTIME:
+ *ret = SCHED_RR;
+ break;
+
+ default:
+ *ret = SCHED_OTHER;
+ break;
+ }
+
+ return e;
+}
+
+int ksched_setparam(int *ret, struct ksched *ksched,
+ struct proc *p, const struct sched_param *param)
+{
+ int e, policy;
+
+ e = getscheduler(&policy, ksched, p);
+
+ if (e == 0)
+ {
+ if (policy == SCHED_OTHER)
+ e = EINVAL;
+ else
+ e = ksched_setscheduler(ret, ksched, p, policy, param);
+ }
+
+ return e;
+}
+
+int ksched_getparam(int *ret, struct ksched *ksched,
+ struct proc *p, struct sched_param *param)
+{
+ if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type))
+ param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio);
+
+ return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ * be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int ksched_setscheduler(int *ret, struct ksched *ksched,
+ struct proc *p, int policy, const struct sched_param *param)
+{
+ int e = 0;
+ struct rtprio rtp;
+
+ switch(policy)
+ {
+ case SCHED_RR:
+ case SCHED_FIFO:
+
+ if (param->sched_priority >= P1B_PRIO_MIN &&
+ param->sched_priority <= P1B_PRIO_MAX)
+ {
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp.type = (policy == SCHED_FIFO)
+ ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+ p->p_rtprio = rtp;
+ need_resched();
+ }
+ else
+ e = EPERM;
+
+
+ break;
+
+ case SCHED_OTHER:
+ {
+ rtp.type = RTP_PRIO_NORMAL;
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ p->p_rtprio = rtp;
+
+ /* XXX Simply revert to whatever we had for last
+ * normal scheduler priorities.
+ * This puts a requirement
+ * on the scheduling code: You must leave the
+ * scheduling info alone.
+ */
+ need_resched();
+ }
+ break;
+ }
+
+ return e;
+}
+
+int ksched_getscheduler(int *ret, struct ksched *ksched, struct proc *p)
+{
+ return getscheduler(ret, ksched, p);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int ksched_yield(int *ret, struct ksched *ksched)
+{
+ need_resched();
+ return 0;
+}
+
+int ksched_get_priority_max(int *ret, struct ksched *ksched, int policy)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *ret = RTP_PRIO_MAX;
+ break;
+
+ case SCHED_OTHER:
+ *ret = PRIO_MAX;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int ksched_get_priority_min(int *ret, struct ksched *ksched, int policy)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *ret = P1B_PRIO_MIN;
+ break;
+
+ case SCHED_OTHER:
+ *ret = PRIO_MIN;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int ksched_rr_get_interval(int *ret, struct ksched *ksched,
+ struct proc *p, struct timespec *timespec)
+{
+ *timespec = ksched->rr_interval;
+
+ return 0;
+}
diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c
new file mode 100644
index 0000000..29b5884
--- /dev/null
+++ b/sys/kern/link_aout.c
@@ -0,0 +1,585 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: link_aout.c,v 1.16 1998/11/03 14:25:21 peter Exp $
+ */
+
+#ifndef __alpha__
+
+#define FREEBSD_AOUT 1
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+#include <a.out.h>
+#include <link.h>
+
+static int link_aout_load_module(const char*, linker_file_t*);
+
+static int link_aout_load_file(const char*, linker_file_t*);
+
+static int link_aout_lookup_symbol(linker_file_t, const char*,
+ linker_sym_t*);
+static int link_aout_symbol_values(linker_file_t file, linker_sym_t sym,
+ linker_symval_t* symval);
+static int link_aout_search_symbol(linker_file_t lf, caddr_t value,
+ linker_sym_t* sym, long* diffp);
+static void link_aout_unload_file(linker_file_t);
+static void link_aout_unload_module(linker_file_t);
+
+static struct linker_class_ops link_aout_class_ops = {
+ link_aout_load_module,
+};
+
+static struct linker_file_ops link_aout_file_ops = {
+ link_aout_lookup_symbol,
+ link_aout_symbol_values,
+ link_aout_search_symbol,
+ link_aout_unload_file,
+};
+static struct linker_file_ops link_aout_module_ops = {
+ link_aout_lookup_symbol,
+ link_aout_symbol_values,
+ link_aout_search_symbol,
+ link_aout_unload_module,
+};
+
+typedef struct aout_file {
+ char* address; /* Load address */
+ struct _dynamic* dynamic; /* Symbol table etc. */
+} *aout_file_t;
+
+static int load_dependancies(linker_file_t lf);
+static int relocate_file(linker_file_t lf);
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_aout_init(void* arg)
+{
+#ifndef __ELF__
+ struct _dynamic* dp = &_DYNAMIC;
+#endif
+
+ linker_add_class("a.out", NULL, &link_aout_class_ops);
+
+#ifndef __ELF__
+ if (dp) {
+ aout_file_t af;
+
+ af = malloc(sizeof(struct aout_file), M_LINKER, M_NOWAIT);
+ if (af == NULL)
+ panic("link_aout_init: Can't create linker structures for kernel");
+ bzero(af, sizeof(*af));
+
+ af->address = 0;
+ af->dynamic = dp;
+ linker_kernel_file =
+ linker_make_file(kernelname, af, &link_aout_file_ops);
+ if (linker_kernel_file == NULL)
+ panic("link_aout_init: Can't create linker structures for kernel");
+ /*
+ * XXX there must be a better way of getting these constants.
+ */
+ linker_kernel_file->address = (caddr_t) 0xf0100000;
+ linker_kernel_file->size = -0xf0100000;
+ linker_current_file = linker_kernel_file;
+ }
+#endif
+}
+
+SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0);
+
+static int
+link_aout_load_module(const char* filename, linker_file_t* result)
+{
+ caddr_t modptr, baseptr;
+ char *type;
+ struct exec *ehdr;
+ aout_file_t af;
+ linker_file_t lf;
+ int error;
+
+ /* Look to see if we have the module preloaded. */
+ if ((modptr = preload_search_by_name(filename)) == NULL)
+ return(link_aout_load_file(filename, result));
+
+ /* It's preloaded, check we can handle it and collect information. */
+ if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) ||
+ strcmp(type, "a.out module") ||
+ ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) ||
+ ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL))
+ return(0); /* we can't handle this */
+
+ /* Looks like we can handle this one */
+ af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK);
+ bzero(af, sizeof(*af));
+ af->address = baseptr;
+
+ /* Assume _DYNAMIC is the first data item. */
+ af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text);
+ if (af->dynamic->d_version != LD_VERSION_BSD) {
+ free(af, M_LINKER);
+ return(0); /* we can't handle this */
+ }
+ af->dynamic->d_un.d_sdt = (struct section_dispatch_table *)
+ ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address);
+
+ /* Register with kld */
+ lf = linker_make_file(filename, af, &link_aout_module_ops);
+ if (lf == NULL) {
+ free(af, M_LINKER);
+ return(ENOMEM);
+ }
+ lf->address = af->address;
+ lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss;
+
+    /* Try to load dependencies */
+ if (((error = load_dependancies(lf)) != 0) ||
+ ((error = relocate_file(lf)) != 0)) {
+ linker_file_unload(lf);
+ return(error);
+ }
+ *result = lf;
+ return(0);
+}
+
+static int
+link_aout_load_file(const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct proc* p = curproc; /* XXX */
+ int error = 0;
+ int resid;
+ struct exec header;
+ aout_file_t af;
+ linker_file_t lf;
+ char *pathname;
+
+ pathname = linker_search_path(filename);
+ if (pathname == NULL)
+ return ENOENT;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p);
+ error = vn_open(&nd, FREAD, 0);
+ free(pathname, M_LINKER);
+ if (error)
+ return error;
+
+ /*
+ * Read the a.out header from the file.
+ */
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+
+ if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC))
+ goto out;
+
+ /*
+ * We have an a.out file, so make some space to read it in.
+ */
+ af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK);
+ bzero(af, sizeof(*af));
+ af->address = malloc(header.a_text + header.a_data + header.a_bss,
+ M_LINKER, M_WAITOK);
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address,
+ header.a_text + header.a_data, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+ bzero(af->address + header.a_text + header.a_data, header.a_bss);
+
+ /*
+ * Assume _DYNAMIC is the first data item.
+ */
+ af->dynamic = (struct _dynamic*) (af->address + header.a_text);
+ if (af->dynamic->d_version != LD_VERSION_BSD) {
+ free(af->address, M_LINKER);
+ free(af, M_LINKER);
+ goto out;
+ }
+ af->dynamic->d_un.d_sdt = (struct section_dispatch_table *)
+ ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address);
+
+ lf = linker_make_file(filename, af, &link_aout_file_ops);
+ if (lf == NULL) {
+ free(af->address, M_LINKER);
+ free(af, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ lf->address = af->address;
+ lf->size = header.a_text + header.a_data + header.a_bss;
+
+ if ((error = load_dependancies(lf)) != 0
+ || (error = relocate_file(lf)) != 0) {
+ linker_file_unload(lf);
+ goto out;
+ }
+
+ *result = lf;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ vn_close(nd.ni_vp, FREAD, p->p_ucred, p);
+
+ return error;
+}
+
+static void
+link_aout_unload_file(linker_file_t file)
+{
+ aout_file_t af = file->priv;
+
+ if (af) {
+ if (af->address)
+ free(af->address, M_LINKER);
+ free(af, M_LINKER);
+ }
+}
+
+static void
+link_aout_unload_module(linker_file_t file)
+{
+ aout_file_t af = file->priv;
+
+ if (af)
+ free(af, M_LINKER);
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
+#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off))
+
+static int
+load_dependancies(linker_file_t lf)
+{
+ aout_file_t af = lf->priv;
+ linker_file_t lfdep;
+ long off;
+ struct sod* sodp;
+ char* name;
+ char* filename = 0;
+ int error = 0;
+
+ /*
+     * All files are dependent on /kernel.
+ */
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ linker_file_add_dependancy(lf, linker_kernel_file);
+ }
+
+ off = LD_NEED(af->dynamic);
+
+ /*
+     * Load the dependencies.
+ */
+ while (off != 0) {
+ sodp = AOUT_RELOC(af, struct sod, off);
+ name = AOUT_RELOC(af, char, sodp->sod_name);
+
+ error = linker_load_file(name, &lfdep);
+ if (error)
+ goto out;
+ error = linker_file_add_dependancy(lf, lfdep);
+ if (error)
+ goto out;
+ off = sodp->sod_next;
+ }
+
+out:
+ if (filename)
+ free(filename, M_TEMP);
+ return error;
+}
+
+/*
+ * XXX i386 dependent.
+ */
+static long
+read_relocation(struct relocation_info* r, char* addr)
+{
+ int length = r->r_length;
+ if (length == 0)
+ return *(u_char*) addr;
+ else if (length == 1)
+ return *(u_short*) addr;
+ else if (length == 2)
+ return *(u_int*) addr;
+ else
+ printf("link_aout: unsupported relocation size %d\n", r->r_length);
+ return 0;
+}
+
+static void
+write_relocation(struct relocation_info* r, char* addr, long value)
+{
+ int length = r->r_length;
+ if (length == 0)
+ *(u_char*) addr = value;
+ else if (length == 1)
+ *(u_short*) addr = value;
+ else if (length == 2)
+ *(u_int*) addr = value;
+ else
+ printf("link_aout: unsupported relocation size %d\n", r->r_length);
+}
+
+static int
+relocate_file(linker_file_t lf)
+{
+ aout_file_t af = lf->priv;
+ struct relocation_info* rel;
+ struct relocation_info* erel;
+ struct relocation_info* r;
+ struct nzlist* symbolbase;
+ char* stringbase;
+ struct nzlist* np;
+ char* sym;
+ long relocation;
+
+ rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic));
+ erel = AOUT_RELOC(af, struct relocation_info,
+ LD_REL(af->dynamic) + LD_RELSZ(af->dynamic));
+ symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+ for (r = rel; r < erel; r++) {
+ char* addr;
+
+ if (r->r_address == 0)
+ break;
+
+ addr = AOUT_RELOC(af, char, r->r_address);
+ if (r->r_extern) {
+ np = &symbolbase[r->r_symbolnum];
+ sym = &stringbase[np->nz_strx];
+
+ if (sym[0] != '_') {
+ printf("link_aout: bad symbol name %s\n", sym);
+ relocation = 0;
+ } else
+ relocation = (intptr_t)
+ linker_file_lookup_symbol(lf, sym + 1,
+ np->nz_type != (N_SETV+N_EXT));
+ if (!relocation) {
+ printf("link_aout: symbol %s not found\n", sym);
+ return ENOENT;
+ }
+
+ relocation += read_relocation(r, addr);
+
+ if (r->r_jmptable) {
+ printf("link_aout: can't cope with jump table relocations\n");
+ continue;
+ }
+
+ if (r->r_pcrel)
+ relocation -= (intptr_t) af->address;
+
+ if (r->r_copy) {
+ printf("link_aout: can't cope with copy relocations\n");
+ continue;
+ }
+
+ write_relocation(r, addr, relocation);
+ } else {
+ write_relocation(r, addr,
+ (intptr_t)(read_relocation(r, addr) + af->address));
+ }
+
+ }
+
+ return 0;
+}
+
+static long
+symbol_hash_value(aout_file_t af, const char* name)
+{
+ long hashval;
+ const char* p;
+
+ hashval = '_'; /* fake a starting '_' for C symbols */
+ for (p = name; *p; p++)
+ hashval = (hashval << 1) + *p;
+
+ return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic);
+}
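+
+/*
+ * (Note, not in the original: this mirrors ld.so's RRS hash: the
+ * accumulator is seeded with '_' so callers may pass C names without
+ * the leading underscore, each character is folded in with a
+ * shift-and-add, and the result is reduced modulo LD_BUCKETS.)
+ */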
+
+int
+link_aout_lookup_symbol(linker_file_t file, const char* name,
+ linker_sym_t* sym)
+{
+ aout_file_t af = file->priv;
+ long hashval;
+ struct rrs_hash* hashbase;
+ struct nzlist* symbolbase;
+ char* stringbase;
+ struct rrs_hash* hp;
+ struct nzlist* np;
+ char* cp;
+
+ if (LD_BUCKETS(af->dynamic) == 0)
+	return ENOENT;
+
+ hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic));
+ symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+restart:
+ hashval = symbol_hash_value(af, name);
+ hp = &hashbase[hashval];
+ if (hp->rh_symbolnum == -1)
+ return ENOENT;
+
+ while (hp) {
+ np = (struct nzlist *) &symbolbase[hp->rh_symbolnum];
+ cp = stringbase + np->nz_strx;
+ /*
+ * Note: we fake the leading '_' for C symbols.
+ */
+ if (cp[0] == '_' && !strcmp(cp + 1, name))
+ break;
+
+ if (hp->rh_next == 0)
+ hp = NULL;
+ else
+ hp = &hashbase[hp->rh_next];
+ }
+
+ if (hp == NULL)
+ /*
+ * Not found.
+ */
+ return ENOENT;
+
+	/*
+	 * Check for an aliased (indirect) symbol: an N_INDR entry is
+	 * followed by an entry whose string names the symbol it aliases.
+	 */
+ if (np->nz_type == N_INDR+N_EXT) {
+ name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */
+ goto restart;
+ }
+
+ /*
+ * Check this is an actual definition of the symbol.
+ */
+ if (np->nz_value == 0)
+ return ENOENT;
+
+ if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) {
+ if (np->nz_other == AUX_FUNC)
+ /* weak function */
+ return ENOENT;
+ }
+
+ *sym = (linker_sym_t) np;
+
+ return 0;
+}
+
+
+static int
+link_aout_symbol_values(linker_file_t file, linker_sym_t sym,
+ linker_symval_t* symval)
+{
+ aout_file_t af = file->priv;
+ struct nzlist* np = (struct nzlist*) sym;
+ char* stringbase;
+ long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist);
+ struct nzlist *symbase;
+
+ /* Is it one of ours? It could be another module... */
+ symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic));
+ if (np < symbase)
+ return ENOENT;
+	if ((np - symbase) >= numsym)
+ return ENOENT;
+
+ stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic));
+
+ symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */
+ if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) {
+ symval->value = 0;
+ symval->size = np->nz_value;
+ } else {
+ symval->value = AOUT_RELOC(af, char, np->nz_value);
+ symval->size = np->nz_size;
+ }
+ return 0;
+}
+
+static int
+link_aout_search_symbol(linker_file_t lf, caddr_t value,
+ linker_sym_t* sym, long* diffp)
+{
+ aout_file_t af = lf->priv;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ struct nzlist* sp;
+ struct nzlist* ep;
+ struct nzlist* best = 0;
+
+ for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)),
+ ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic));
+ sp < ep; sp++) {
+ if (sp->nz_name == 0)
+ continue;
+ if (off >= sp->nz_value) {
+ if (off - sp->nz_value < diff) {
+ diff = off - sp->nz_value;
+ best = sp;
+ if (diff == 0)
+ break;
+ } else if (off - sp->nz_value == diff) {
+ best = sp;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (linker_sym_t) best;
+
+ return 0;
+}
+
+#endif /* !__alpha__ */
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
new file mode 100644
index 0000000..c5e84da
--- /dev/null
+++ b/sys/kern/link_elf.c
@@ -0,0 +1,981 @@
+/*-
+ * Copyright (c) 1998 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+#include <machine/elf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static int link_elf_load_module(const char*, linker_file_t*);
+static int link_elf_load_file(const char*, linker_file_t*);
+static int link_elf_lookup_symbol(linker_file_t, const char*,
+ linker_sym_t*);
+static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ linker_sym_t* sym, long* diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_module(linker_file_t);
+
+static struct linker_class_ops link_elf_class_ops = {
+ link_elf_load_module,
+};
+
+static struct linker_file_ops link_elf_file_ops = {
+ link_elf_lookup_symbol,
+ link_elf_symbol_values,
+ link_elf_search_symbol,
+ link_elf_unload_file,
+};
+
+static struct linker_file_ops link_elf_module_ops = {
+ link_elf_lookup_symbol,
+ link_elf_symbol_values,
+ link_elf_search_symbol,
+ link_elf_unload_module,
+};
+typedef struct elf_file {
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ const Elf_Dyn* dynamic; /* Symbol table etc. */
+ Elf_Off nbuckets; /* DT_HASH info */
+ Elf_Off nchains;
+ const Elf_Off* buckets;
+ const Elf_Off* chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym* symtab; /* DT_SYMTAB */
+ Elf_Addr* got; /* DT_PLTGOT */
+ const Elf_Rel* pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela* pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel* rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela* rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym* ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+	caddr_t		symbase;	/* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+} *elf_file_t;
+
+static int parse_dynamic(linker_file_t lf);
+static int load_dependancies(linker_file_t lf);
+static int relocate_file(linker_file_t lf);
+static int parse_module_symbols(linker_file_t lf);
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_init(void* arg)
+{
+#ifdef __ELF__
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+#endif
+
+#if ELF_TARG_CLASS == ELFCLASS32
+ linker_add_class("elf32", NULL, &link_elf_class_ops);
+#else
+ linker_add_class("elf64", NULL, &link_elf_class_ops);
+#endif
+
+#ifdef __ELF__
+ dp = (Elf_Dyn*) &_DYNAMIC;
+ if (dp) {
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT);
+ if (ef == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+ bzero(ef, sizeof(*ef));
+
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+ modname = NULL;
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops);
+ if (linker_kernel_file == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+ parse_dynamic(linker_kernel_file);
+ /* Sigh, magic constants. */
+#ifdef __alpha__
+ linker_kernel_file->address = (caddr_t) 0xfffffc0000300000;
+#else
+ linker_kernel_file->address = (caddr_t) 0xf0100000;
+#endif
+ linker_kernel_file->size = -(long)linker_kernel_file->address;
+
+ if (modptr) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)parse_module_symbols(linker_kernel_file);
+ linker_current_file = linker_kernel_file;
+ }
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
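+/*
+ * The loader hands us the kernel's symbol table as a blob bracketed by
+ * the MODINFOMD_SSYM and MODINFOMD_ESYM pointers: a byte count, the
+ * Elf_Sym array, another byte count, then the string table, each
+ * padded to a long boundary.
+ */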
+static int
+parse_module_symbols(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym* symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return 0;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return 0;
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return 0;
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return 0;
+}
+
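+/*
+ * Record the DT_* tables needed for symbol lookup and relocation.
+ * If DT_PLTREL says the PLT uses Elf_Rela records, the pltrel fields
+ * collected under DT_JMPREL/DT_PLTRELSZ are moved over to pltrela.
+ */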
+static int
+parse_dynamic(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ const Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
+ const Elf_Off *hashtab = (const Elf_Off *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return ENOEXEC;
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return ENOEXEC;
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return ENOEXEC;
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return ENOEXEC;
+ break;
+ }
+ }
+
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *) ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return 0;
+}
+
+static void
+link_elf_error(const char *s)
+{
+ printf("kldload: %s\n", s);
+}
+
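+/*
+ * Link a preloaded module using the metadata the boot loader left
+ * behind; modules that were not preloaded are read from the
+ * filesystem by link_elf_load_file() instead.
+ */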
+static int
+link_elf_load_module(const char *filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the module preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return (link_elf_load_file(filename, result));
+
+ /* It's preloaded, check we can handle it and collect information */
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC);
+ if (type == NULL || strcmp(type, "elf module") != 0)
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK);
+ if (ef == NULL)
+ return (ENOMEM);
+ bzero(ef, sizeof(*ef));
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf = linker_make_file(filename, ef, &link_elf_module_ops);
+ if (lf == NULL) {
+ free(ef, M_LINKER);
+ return ENOMEM;
+ }
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ error = load_dependancies(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ error = relocate_file(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ (void)parse_module_symbols(lf);
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_load_file(const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct proc* p = curproc; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[2];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ int resid;
+ elf_file_t ef;
+ linker_file_t lf;
+ char *pathname;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+
+ shdr = NULL;
+ lf = NULL;
+
+ pathname = linker_search_path(filename);
+ if (pathname == NULL)
+ return ENOENT;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p);
+ error = vn_open(&nd, FREAD, 0);
+ free(pathname, M_LINKER);
+ if (error)
+ return error;
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ if (firstpage == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ nbytes = PAGE_SIZE - resid;
+ if (error)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error("Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error("Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error("Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page. This is
+ * not strictly required by the ABI specification, but it seems to
+	 * always be true in practice.  And it simplifies things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error("Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+
+ case PT_LOAD:
+ if (nsegs == 2) {
+ link_elf_error("Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error("Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake out our
+ * contiguous region, and to establish the base address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK);
+ bzero(ef, sizeof(*ef));
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ vm_object_reference(ef->object);
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address,
+ mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ free(ef, M_LINKER);
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < 2; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error) {
+#ifdef SPARSE_MAPPING
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+#else
+ free(ef->address, M_LINKER);
+#endif
+ free(ef, M_LINKER);
+ goto out;
+ }
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ vm_map_pageable(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ FALSE);
+#endif
+ }
+
+ ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf = linker_make_file(filename, ef, &link_elf_file_ops);
+ if (lf == NULL) {
+#ifdef SPARSE_MAPPING
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+#else
+ free(ef->address, M_LINKER);
+#endif
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(lf);
+ if (error)
+ goto out;
+ error = load_dependancies(lf);
+ if (error)
+ goto out;
+ error = relocate_file(lf);
+ if (error)
+ goto out;
+
+	/* Try to load the symbol table if it's present.  (You can strip it!) */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK);
+ if (shdr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ bzero(shdr, nbytes);
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ if (ef->symbase == NULL || ef->strbase == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+nosyms:
+
+ *result = lf;
+
+out:
+ if (error && lf)
+ linker_file_unload(lf);
+ if (shdr)
+ free(shdr, M_LINKER);
+ if (firstpage)
+ free(firstpage, M_LINKER);
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ vn_close(nd.ni_vp, FREAD, p->p_ucred, p);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = file->priv;
+
+ if (ef) {
+#ifdef SPARSE_MAPPING
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+ }
+#else
+ if (ef->address)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase)
+ free(ef->strbase, M_LINKER);
+ free(ef, M_LINKER);
+ }
+}
+
+static void
+link_elf_unload_module(linker_file_t file)
+{
+ elf_file_t ef = file->priv;
+
+ if (ef)
+ free(ef, M_LINKER);
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
+static int
+load_dependancies(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ linker_file_t lfdep;
+ char* name;
+ const Elf_Dyn *dp;
+ int error = 0;
+
+	/*
+	 * All files are dependent on /kernel.
+	 */
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ linker_file_add_dependancy(lf, linker_kernel_file);
+ }
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag == DT_NEEDED) {
+ name = ef->strtab + dp->d_un.d_val;
+
+ error = linker_load_file(name, &lfdep);
+ if (error)
+ goto out;
+ error = linker_file_add_dependancy(lf, lfdep);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Word r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return ef->strtab + ref->st_name;
+ } else
+ return NULL;
+}
+
+static int
+relocate_file(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel) {
+ rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize);
+ while (rel < rellim) {
+ symname = symbol_name(ef, rel->r_info);
+ if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela) {
+ relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize);
+ while (rela < relalim) {
+ symname = symbol_name(ef, rela->r_info);
+ if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel) {
+ rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ symname = symbol_name(ef, rel->r_info);
+ if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+	/* Perform PLT relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela) {
+ relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ symname = symbol_name(ef, rela->r_info);
+ if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
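+/*
+ * Hashed lookup: buckets[hash % nbuckets] yields the first symbol
+ * index and chains[] links the rest; STN_UNDEF (0) ends a chain.
+ */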
+int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym)
+{
+ elf_file_t ef = lf->priv;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+ }
+
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval)
+{
+ elf_file_t ef = lf->priv;
+ Elf_Sym* es = (Elf_Sym*) sym;
+
+ if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+ if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ linker_sym_t* sym, long* diffp)
+{
+ elf_file_t ef = lf->priv;
+ u_long off = (u_long) value;
+ u_long diff = off;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ if (off >= es->st_value) {
+ if (off - es->st_value < diff) {
+ diff = off - es->st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - es->st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (linker_sym_t) best;
+
+ return 0;
+}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
new file mode 100644
index 0000000..c5e84da
--- /dev/null
+++ b/sys/kern/link_elf_obj.c
@@ -0,0 +1,981 @@
+/*-
+ * Copyright (c) 1998 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+#include <machine/elf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static int link_elf_load_module(const char*, linker_file_t*);
+static int link_elf_load_file(const char*, linker_file_t*);
+static int link_elf_lookup_symbol(linker_file_t, const char*,
+ linker_sym_t*);
+static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ linker_sym_t* sym, long* diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_module(linker_file_t);
+
+static struct linker_class_ops link_elf_class_ops = {
+ link_elf_load_module,
+};
+
+static struct linker_file_ops link_elf_file_ops = {
+ link_elf_lookup_symbol,
+ link_elf_symbol_values,
+ link_elf_search_symbol,
+ link_elf_unload_file,
+};
+
+static struct linker_file_ops link_elf_module_ops = {
+ link_elf_lookup_symbol,
+ link_elf_symbol_values,
+ link_elf_search_symbol,
+ link_elf_unload_module,
+};
+typedef struct elf_file {
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ const Elf_Dyn* dynamic; /* Symbol table etc. */
+ Elf_Off nbuckets; /* DT_HASH info */
+ Elf_Off nchains;
+ const Elf_Off* buckets;
+ const Elf_Off* chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym* symtab; /* DT_SYMTAB */
+ Elf_Addr* got; /* DT_PLTGOT */
+ const Elf_Rel* pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela* pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel* rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela* rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym* ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+	caddr_t		symbase;	/* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+} *elf_file_t;
+
+static int parse_dynamic(linker_file_t lf);
+static int load_dependancies(linker_file_t lf);
+static int relocate_file(linker_file_t lf);
+static int parse_module_symbols(linker_file_t lf);
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_init(void* arg)
+{
+#ifdef __ELF__
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+#endif
+
+#if ELF_TARG_CLASS == ELFCLASS32
+ linker_add_class("elf32", NULL, &link_elf_class_ops);
+#else
+ linker_add_class("elf64", NULL, &link_elf_class_ops);
+#endif
+
+#ifdef __ELF__
+ dp = (Elf_Dyn*) &_DYNAMIC;
+ if (dp) {
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT);
+ if (ef == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+ bzero(ef, sizeof(*ef));
+
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+ modname = NULL;
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops);
+ if (linker_kernel_file == NULL)
+ panic("link_elf_init: Can't create linker structures for kernel");
+ parse_dynamic(linker_kernel_file);
+ /* Sigh, magic constants. */
+#ifdef __alpha__
+ linker_kernel_file->address = (caddr_t) 0xfffffc0000300000;
+#else
+ linker_kernel_file->address = (caddr_t) 0xf0100000;
+#endif
+ linker_kernel_file->size = -(long)linker_kernel_file->address;
+
+ if (modptr) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)parse_module_symbols(linker_kernel_file);
+ linker_current_file = linker_kernel_file;
+ }
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
+static int
+parse_module_symbols(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym* symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return 0;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return 0;
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return 0;
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return EINVAL;
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return 0;
+}
+
+static int
+parse_dynamic(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ const Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
+ const Elf_Off *hashtab = (const Elf_Off *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return ENOEXEC;
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return ENOEXEC;
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return ENOEXEC;
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return ENOEXEC;
+ break;
+ }
+ }
+
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *) ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return 0;
+}
+
+static void
+link_elf_error(const char *s)
+{
+ printf("kldload: %s\n", s);
+}
+
+static int
+link_elf_load_module(const char *filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the module preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return (link_elf_load_file(filename, result));
+
+ /* It's preloaded, check we can handle it and collect information */
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC);
+ if (type == NULL || strcmp(type, "elf module") != 0)
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK);
+ if (ef == NULL)
+ return (ENOMEM);
+ bzero(ef, sizeof(*ef));
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf = linker_make_file(filename, ef, &link_elf_module_ops);
+ if (lf == NULL) {
+ free(ef, M_LINKER);
+ return ENOMEM;
+ }
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ error = load_dependancies(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ error = relocate_file(lf);
+ if (error) {
+ linker_file_unload(lf);
+ return error;
+ }
+ (void)parse_module_symbols(lf);
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_load_file(const char* filename, linker_file_t* result)
+{
+ struct nameidata nd;
+ struct proc* p = curproc; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[2];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ int resid;
+ elf_file_t ef;
+ linker_file_t lf;
+ char *pathname;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+
+ shdr = NULL;
+ lf = NULL;
+
+ pathname = linker_search_path(filename);
+ if (pathname == NULL)
+ return ENOENT;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p);
+ error = vn_open(&nd, FREAD, 0);
+ free(pathname, M_LINKER);
+ if (error)
+ return error;
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ if (firstpage == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ nbytes = PAGE_SIZE - resid;
+ if (error)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error("Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error("Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ link_elf_error("Unsupported file type");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error("Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page. This is
+ * not strictly required by the ABI specification, but it seems to
+	 * always be true in practice.  And it simplifies things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error("Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+
+ case PT_LOAD:
+ if (nsegs == 2) {
+ link_elf_error("Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error("Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake out our
+ * contiguous region, and to establish the base address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK);
+ bzero(ef, sizeof(*ef));
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ vm_object_reference(ef->object);
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address,
+ mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ free(ef, M_LINKER);
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < 2; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error) {
+#ifdef SPARSE_MAPPING
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+#else
+ free(ef->address, M_LINKER);
+#endif
+ free(ef, M_LINKER);
+ goto out;
+ }
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ vm_map_pageable(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ FALSE);
+#endif
+ }
+
+ ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf = linker_make_file(filename, ef, &link_elf_file_ops);
+ if (lf == NULL) {
+#ifdef SPARSE_MAPPING
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+#else
+ free(ef->address, M_LINKER);
+#endif
+ free(ef, M_LINKER);
+ error = ENOMEM;
+ goto out;
+ }
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(lf);
+ if (error)
+ goto out;
+ error = load_dependancies(lf);
+ if (error)
+ goto out;
+ error = relocate_file(lf);
+ if (error)
+ goto out;
+
+	/* Try to load the symbol table if it's present.  (You can strip it!) */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK);
+ if (shdr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ bzero(shdr, nbytes);
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ if (ef->symbase == NULL || ef->strbase == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
+ if (error)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+nosyms:
+
+ *result = lf;
+
+out:
+ if (error && lf)
+ linker_file_unload(lf);
+ if (shdr)
+ free(shdr, M_LINKER);
+ if (firstpage)
+ free(firstpage, M_LINKER);
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ vn_close(nd.ni_vp, FREAD, p->p_ucred, p);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = file->priv;
+
+ if (ef) {
+#ifdef SPARSE_MAPPING
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ vm_object_deallocate(ef->object);
+ }
+#else
+ if (ef->address)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase)
+ free(ef->strbase, M_LINKER);
+ free(ef, M_LINKER);
+ }
+}
+
+static void
+link_elf_unload_module(linker_file_t file)
+{
+ elf_file_t ef = file->priv;
+
+ if (ef)
+ free(ef, M_LINKER);
+ if (file->filename)
+ preload_delete_name(file->filename);
+}
+
+static int
+load_dependancies(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ linker_file_t lfdep;
+ char* name;
+ const Elf_Dyn *dp;
+ int error = 0;
+
+	/*
+	 * All files are dependent on /kernel.
+	 */
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ linker_file_add_dependancy(lf, linker_kernel_file);
+ }
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ if (dp->d_tag == DT_NEEDED) {
+ name = ef->strtab + dp->d_un.d_val;
+
+ error = linker_load_file(name, &lfdep);
+ if (error)
+ goto out;
+ error = linker_file_add_dependancy(lf, lfdep);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Word r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return ef->strtab + ref->st_name;
+ } else
+ return NULL;
+}
+
+static int
+relocate_file(linker_file_t lf)
+{
+ elf_file_t ef = lf->priv;
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel) {
+ rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize);
+ while (rel < rellim) {
+ symname = symbol_name(ef, rel->r_info);
+ if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela) {
+ relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize);
+ while (rela < relalim) {
+ symname = symbol_name(ef, rela->r_info);
+ if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel) {
+ rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ symname = symbol_name(ef, rel->r_info);
+ if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rel++;
+ }
+ }
+
+	/* Perform PLT relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela) {
+ relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ symname = symbol_name(ef, rela->r_info);
+ if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) {
+ printf("link_elf: symbol %s undefined\n", symname);
+ return ENOENT;
+ }
+ rela++;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
+int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym)
+{
+ elf_file_t ef = lf->priv;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("link_elf_lookup_symbol: corrupt symbol table\n");
+ return ENOENT;
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (linker_sym_t) symp;
+ return 0;
+ } else
+ return ENOENT;
+ }
+ }
+
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval)
+{
+ elf_file_t ef = lf->priv;
+ Elf_Sym* es = (Elf_Sym*) sym;
+
+ if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return ENOENT;
+ if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ linker_sym_t* sym, long* diffp)
+{
+ elf_file_t ef = lf->priv;
+ u_long off = (u_long) value;
+ u_long diff = off;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ if (off >= es->st_value) {
+ if (off - es->st_value < diff) {
+ diff = off - es->st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - es->st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (linker_sym_t) best;
+
+ return 0;
+}
diff --git a/sys/kern/makedevops.pl b/sys/kern/makedevops.pl
new file mode 100644
index 0000000..24e0b14
--- /dev/null
+++ b/sys/kern/makedevops.pl
@@ -0,0 +1,394 @@
+#!/usr/bin/perl
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# From @(#)makedevops.sh 1.1 1998/06/14 13:53:12 dfr Exp $
+# From @(#)makedevops.sh ?.? 1998/10/05
+
+#
+# Script to produce device front-end sugar.
+#
+
+$debug = 0;
+$cfile = 0; # by default do not produce any file type
+$hfile = 0;
+
+$keepcurrentdir = 1;
+
+$line_width = 80;
+
+use File::Basename;
+
+# Process the command line
+#
+while ( $arg = shift @ARGV ) {
+ if ( $arg eq '-c' ) {
+ warn "Producing .c output files"
+ if $debug;
+ $cfile = 1;
+ } elsif ( $arg eq '-h' ) {
+ warn "Producing .h output files"
+ if $debug;
+ $hfile = 1;
+ } elsif ( $arg eq '-ch' || $arg eq '-hc' ) {
+ warn "Producing .c and .h output files"
+ if $debug;
+ $cfile = 1;
+ $hfile = 1;
+ } elsif ( $arg eq '-d' ) {
+ $debug = 1;
+ } elsif ( $arg eq '-p' ) {
+ warn "Will produce files in original not in current directory"
+ if $debug;
+ $keepcurrentdir = 0;
+ } elsif ( $arg eq '-l' ) {
+ if ( $line_width = shift @ARGV and $line_width > 0 ) {
+ warn "Line width set to $line_width"
+ if $debug;
+ } else {
+ die "Please specify a valid line width after -l";
+ }
+ } elsif ( $arg =~ m/\.m$/ ) {
+ warn "Filename: $arg"
+ if $debug;
+ push @filenames, $arg;
+ } else {
+ warn "$arg ignored"
+ if $debug;
+ }
+}
+
+
+# Validate the command line parameters
+#
+die "usage: $0 [-d] [-p] [-c|-h] srcfile
+where -c produce only .c files
+ -h produce only .h files
+ -p use the path component in the source file for destination dir
+ -l set line width for output files [80]
+ -d switch on debugging
+"
+ unless ($cfile or $hfile)
+ and $#filenames != -1;
+
+# FIXME should be able to do this more easily
+#
+$tmpdir = $ENV{'TMPDIR'}; # environment variables
+$tmpdir = $ENV{'TMP'}
+ if !$tmpdir;
+$tmpdir = $ENV{'TEMP'}
+ if !$tmpdir;
+$tmpdir = '/tmp' # look for a physical directory
+ if !$tmpdir and -d '/tmp';
+$tmpdir = '/usr/tmp'
+ if !$tmpdir and -d '/usr/tmp';
+$tmpdir = '/var/tmp'
+ if !$tmpdir and -d '/var/tmp';
+$tmpdir = '.' # give up and use current dir
+ if !$tmpdir;
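+# NOTE: File::Spec->tmpdir would probably collapse the search above
+# into a single call, if depending on File::Spec is acceptable here.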
+
+foreach $src ( @filenames ) {
+ # Names of the created files
+ $ctmpname = "$tmpdir/ctmp.$$";
+ $htmpname = "$tmpdir/htmp.$$";
+
+ ($name, $path, $suffix) = &fileparse($src, '.m');
+ $path = '.'
+ if $keepcurrentdir;
+ $cfilename="$path/$name.c";
+ $hfilename="$path/$name.h";
+
+	warn "Processing from $src to $cfilename / $hfilename via $ctmpname / $htmpname"
+ if $debug;
+
+ die "Could not open $src, $!"
+ if !open SRC, "$src";
+ die "Could not open $ctmpname, $!"
+ if $cfile and !open CFILE, ">$ctmpname";
+ die "Could not open $htmpname, $!"
+ if $hfile and !open HFILE, ">$htmpname";
+
+ if ( $cfile ) {
+ # Produce the header of the C file
+ #
+ print CFILE "/*\n";
+ print CFILE " * This file is produced automatically.\n";
+ print CFILE " * Do not modify anything in here by hand.\n";
+ print CFILE " *\n";
+ print CFILE " * Created from\n";
+ print CFILE " * $src\n";
+ print CFILE " * with\n";
+ print CFILE " * $0\n";
+ print CFILE " */\n";
+ print CFILE "\n";
+ print CFILE "#include <sys/param.h>\n";
+ print CFILE "#include <sys/queue.h>\n";
+ print CFILE "#include <sys/bus_private.h>\n";
+ }
+
+ if ( $hfile ) {
+ # Produce the header of the H file
+ #
+ print HFILE "/*\n";
+ print HFILE " * This file is produced automatically.\n";
+ print HFILE " * Do not modify anything in here by hand.\n";
+ print HFILE " *\n";
+ print HFILE " * Created from\n";
+ print HFILE " * $src\n";
+ print HFILE " * with\n";
+ print HFILE " * $0\n";
+ print HFILE " */\n";
+ print HFILE "\n";
+ }
+
+ %methods = (); # clear list of methods
+ $lineno = 0;
+	$error = 0;	# set on a parse error: triggers cleanup and sets $gerror
+
+ LINE: while ( $line = <SRC> ) {
+ $lineno++;
+
+ # take special notice of include directives.
+ #
+ if ( $line =~ m/^#\s*include\s+(["<])([^">]+)([">]).*/i ) {
+ warn "Included file: $1$2" . ($1 eq '<'? '>':'"')
+ if $debug;
+ print CFILE "#include $1$2" . ($1 eq '<'? '>':'"') . "\n"
+ if $cfile;
+ }
+
+ $line =~ s/#.*//; # remove comments
+ $line =~ s/^\s+//; # remove leading ...
+ $line =~ s/\s+$//; # remove trailing whitespace
+
+ if ( $line =~ m/^$/ ) { # skip empty lines
+ # nop
+
+ } elsif ( $line =~ m/^INTERFACE\s*([^\s;]*)(\s*;?)/i ) {
+ $intname = $1;
+ $semicolon = $2;
+ unless ( $intname =~ m/^[a-z_][a-z0-9_]*$/ ) {
+ warn $line
+ if $debug;
+ warn "$src:$lineno: Invalid interface name '$intname', use [a-z_][a-z0-9_]*";
+ $error = 1;
+ last LINE;
+ }
+
+ warn "$src:$lineno: semicolon missing at end of line, no problem"
+ if $semicolon !~ s/;$//;
+
+ warn "Interface $intname"
+ if $debug;
+
+ print HFILE '#ifndef _'.$intname."_if_h_\n"
+ if $hfile;
+ print HFILE '#define _'.$intname."_if_h_\n\n"
+ if $hfile;
+ print CFILE '#include "'.$intname.'_if.h"'."\n\n"
+ if $cfile;
+
+ } elsif ( $line =~ m/^METHOD/i ) {
+			# Get the return type and function name, and delete them
+			# from the line. What is left is possibly the first function
+			# argument, if it is on the same line.
+			#
+			# FIXME For compatibility's sake both METHOD and METHODE
+			# are accepted.
+ #
+ if ( !$intname ) {
+ warn "$src:$lineno: No interface name defined";
+ $error = 1;
+ last LINE;
+ }
+ $line =~ s/^METHODE?\s+([^{]+?)\s*{\s*//i;
+ @ret = split m/\s+/, $1;
+ $name = pop @ret; # last element is name of method
+ $ret = join(" ", @ret); # return type
+
+ warn "Method: name=$name return type=$ret"
+ if $debug;
+
+ if ( !$name or !$ret ) {
+ warn $line
+ if $debug;
+ warn "$src:$lineno: Invalid method specification";
+ $error = 1;
+ last LINE;
+ }
+
+ unless ( $name =~ m/^[a-z_][a-z_0-9]*$/ ) {
+ warn $line
+ if $debug;
+ warn "$src:$lineno: Invalid method name '$name', use [a-z_][a-z0-9_]*";
+ $error = 1;
+ last LINE;
+ }
+
+ if ( defined($methods{$name}) ) {
+ warn "$src:$lineno: Duplicate method name";
+ $error = 1;
+ last LINE;
+ }
+
+ $methods{$name} = 'VIS';
+
+			while ( $line !~ m/}/ and defined($next = <SRC>) ) { $line .= $next }
+
+ if ( $line !~ s/};?(.*)// ) { # remove first '}' and trailing garbage
+ # The '}' was not there (the rest is optional), so complain
+ warn "$src:$lineno: Premature end of file";
+ $error = 1;
+ last LINE;
+ }
+ warn "$src:$lineno: Ignored '$1'" # warn about garbage at end of line
+ if $debug and $1;
+
+ # Create a list of variables without the types prepended
+ #
+ $line =~ s/^\s+//; # remove leading ...
+ $line =~ s/\s+$//; # ... and trailing whitespace
+			$line =~ s/\s+/ /g;	# collapse whitespace runs
+
+ @arguments = split m/\s*;\s*/, $line;
+ @varnames = (); # list of varnames
+ foreach $argument (@arguments) {
+ next # skip argument if argument is empty
+ if !$argument;
+
+ @ar = split m/[*\s]+/, $argument;
+ if ( $#ar == 0 ) { # only 1 word in argument?
+ warn "$src:$lineno: no type for '$argument'";
+ $error = 1;
+ last LINE;
+ }
+
+ push @varnames, $ar[-1]; # last element is name of variable
+ };
+
+ warn 'Arguments: ' . join(', ', @arguments) . "\n"
+ . 'Varnames: ' . join(', ', @varnames)
+ if $debug;
+
+ $mname = $intname.'_'.$name; # method name
+ $umname = uc($mname); # uppercase method name
+
+ $arguments = join(", ", @arguments);
+ $varnames = join(", ", @varnames);
+
+ if ( $hfile ) {
+ # the method description
+ print HFILE "extern struct device_op_desc $mname\_desc;\n";
+ # the method typedef
+ print HFILE &format_line("typedef $ret $mname\_t($arguments);",
+ $line_width, ', ',
+ ',',' ' x length("typedef $ret $mname\_t("))
+ . "\n";
+ # the method declaration
+ print HFILE "$mname\_t $umname;\n\n";
+ }
+
+ if ( $cfile ) {
+ # Print out the method desc
+ print CFILE "struct device_op_desc $mname\_desc = {\n";
+ print CFILE "\t0, \"$mname\"\n";
+ print CFILE "};\n\n";
+
+ # Print out the method itself
+ if ( 0 ) { # haven't chosen the format yet
+ print CFILE "$ret $umname($varnames)\n";
+ print CFILE "\t".join(";\n\t", @arguments).";\n";
+ } else {
+ print CFILE &format_line("$ret $umname($arguments)",
+ $line_width, ', ',
+ ',', ' ' x length("$ret $umname(")) . "\n";
+ }
+ print CFILE "{\n";
+ print CFILE &format_line("\t$mname\_t *m = ($mname\_t *) DEVOPMETH(dev, $mname);",
+ $line_width-8, ' = ', ' =', "\t\t")
+ . "\n";
+ print CFILE "\t".($ret eq 'void'? '':'return ') . "m($varnames);\n";
+ print CFILE "}\n\n";
+ }
+ } else {
+ warn $line
+ if $debug;
+ warn "$src:$lineno: Invalid line encountered";
+ $error = 1;
+ last LINE;
+ }
+ } # end LINE
+
+ # print the final '#endif' in the header file
+ #
+ print HFILE "#endif /* _".$intname."_if_h_ */\n"
+ if $hfile;
+
+ close SRC;
+ close CFILE
+ if $cfile;
+ close HFILE
+ if $hfile;
+
+ if ( !$error ) {
+ if ( $cfile ) {
+ ($rc = system("mv $ctmpname $cfilename"))
+ and warn "mv $ctmpname $cfilename failed, $rc";
+ }
+
+ if ( $hfile ) {
+ ($rc = system("mv $htmpname $hfilename"))
+ and warn "mv $htmpname $hfilename failed, $rc";
+ }
+ } else {
+		warn 'File' . (($hfile and $cfile) ? 's' : '') . ' skipped';
+ ($rc = system("rm -f $htmpname $ctmpname"))
+ and warn "rm -f $htmpname $ctmpname failed, $rc";
+ $gerror = 1;
+ }
+}
+
+exit $gerror;
+
+
+sub format_line {
+ my ($line, $maxlength, $break, $new_end, $new_start) = @_;
+ my $rline = "";
+
+ while ( length($line) > $maxlength
+ and ($i = rindex $line, $break, $maxlength-length($new_end)) != -1 ) {
+ $rline .= substr($line, 0, $i) . $new_end . "\n";
+ $line = $new_start . substr($line, $i+length($break));
+ }
+
+ return $rline . $line;
+}
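+
+# Usage sketch for format_line, with made-up values: a call like
+#
+#	&format_line("typedef int foo_t(int a, int b, int c);",
+#	    30, ', ', ',', "    ")
+#
+# breaks the line at the last ', ' that still fits within 30 columns,
+# terminates the kept part with ',', and prefixes each continuation
+# line with the given indent string.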
diff --git a/sys/kern/makedevops.sh b/sys/kern/makedevops.sh
new file mode 100644
index 0000000..a5e9ebd
--- /dev/null
+++ b/sys/kern/makedevops.sh
@@ -0,0 +1,232 @@
+#!/bin/sh -
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# $Id: makedevops.sh,v 1.1 1998/06/14 13:53:12 dfr Exp $
+#
+
+# Script to produce device front-end sugar.
+#
+# usage: makedevops.sh srcfile
+#
+# These awk scripts are not particularly well written: they don't use
+# arrays well and they recompute the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk. Note
+# that they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 2 ] ; then
+ echo 'usage: makedevops.sh [-c|-h] srcfile'
+ exit 1
+fi
+
+makec=0
+makeh=0
+
+if [ "$1" = "-c" ]; then
+ makec=1
+fi
+
+if [ "$1" = "-h" ]; then
+ makeh=1
+fi
+
+# Name of the source file.
+SRC=$2
+
+# Names of the created files.
+CTMP=ctmp$$
+HTMP=htmp$$
+
+CFILE=`basename $SRC .m`.c
+HFILE=`basename $SRC .m`.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Awk script to take file.m and turn it into file.h and file.c
+$AWK "
+ BEGIN {
+ src = \"$SRC\";
+ header = \"$HTMP\";
+ cfile = \"$CTMP\";
+ hfile = \"$HFILE\";
+ "'
+
+ printf("/*\n") > header;
+ printf(" * This file is produced automatically.\n") > header;
+ printf(" * Do not modify anything in here by hand.\n") > header;
+ printf(" *\n") > header;
+ printf(" * Created from %s with makedevops.sh\n", src) > header;
+ printf(" */\n\n") > header;
+
+ printf("/*\n") > cfile;
+ printf(" * This file is produced automatically.\n") > cfile;
+ printf(" * Do not modify anything in here by hand.\n") > cfile;
+ printf(" *\n") > cfile;
+ printf(" * Created from %s with makedevops.sh\n", src) > cfile;
+ printf(" */\n\n") > cfile;
+ printf("#include <sys/param.h>\n") > cfile;
+ printf("#include <sys/queue.h>\n") > cfile;
+ printf("#include <sys/bus_private.h>\n") > cfile;
+
+ methodcount = 0
+ }
+ NF == 0 {
+ next;
+ }
+ /^#include/ {
+ print $0 > cfile;
+ }
+ /^#/ {
+ next;
+ }
+ /^INTERFACE/ {
+ intname = $2;
+ printf("#ifndef _%s_if_h_\n", intname) > header;
+ printf("#define _%s_if_h_\n\n", intname) > header;
+ printf("#include \"%s\"\n\n", hfile) > cfile;
+ }
+ /^METHOD/ {
+ # Get the function name and return type.
+ ret = "";
+ sep = "";
+ for (i = 2; i < NF - 1; i++) {
+			ret = ret sep $i;
+ sep = " ";
+ }
+ name = $i;
+
+ # Get the function arguments.
+ for (c1 = 0;; ++c1) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ a[c1] = $0;
+ }
+
+ methods[methodcount++] = name;
+
+ mname = intname "_" name;
+ umname = toupper(mname);
+
+ # Print out the method declaration
+ printf("extern struct device_op_desc %s_desc;\n", mname) > header;
+ printf("%s %s(", ret, umname) > header;
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = " );\n";
+ c3 = split(a[c2], t);
+ for (c4 = 0; c4 < c3; ++c4)
+ printf("%s ", t[c4]) > header;
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep) > header;
+ }
+
+ # Print the method desc
+ printf("struct device_op_desc %s_desc = {\n", mname) > cfile;
+ printf("\t0,\n") > cfile;
+ printf("\t\"%s\"\n", mname) > cfile;
+ printf("};\n\n") > cfile;
+
+ # Print out the method typedef
+ printf("typedef %s %s_t(\n", ret, mname) > cfile;
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ");\n";
+ c3 = split(a[c2], t);
+ printf("\t") > cfile;
+ for (c4 = 0; c4 < c3; ++c4)
+ printf("%s ", t[c4]) > cfile;
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep) > cfile;
+ }
+
+ # Print out the method itself
+ printf("%s %s(\n", ret, umname) > cfile;
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ")\n";
+ c3 = split(a[c2], t);
+ printf("\t") > cfile;
+ for (c4 = 0; c4 < c3; ++c4)
+ printf("%s ", t[c4]) > cfile;
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep) > cfile;
+ }
+ printf("{\n") > cfile;
+ printf("\t%s_t *m = (%s_t *) DEVOPMETH(dev, %s);\n",
+ mname, mname, mname) > cfile;
+ if (ret != "void")
+ printf("\treturn m(") > cfile;
+ else
+ printf("\tm(") > cfile;
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ");\n";
+ c3 = split(a[c2], t);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s", substr(t[c3], beg, end - beg), sep) > cfile;
+ }
+ printf("}\n\n") > cfile;
+ }
+ END {
+ printf("\n#endif /* _%s_if_h_ */\n", intname) > header;
+ }' < $SRC
+
+if [ $makec = 1 ]; then
+ mv $CTMP $CFILE
+else
+ rm $CTMP
+fi
+
+if [ $makeh = 1 ]; then
+ mv $HTMP $HFILE
+else
+ rm $HTMP
+fi
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
new file mode 100644
index 0000000..0cbd247
--- /dev/null
+++ b/sys/kern/makesyscalls.sh
@@ -0,0 +1,394 @@
+#! /bin/sh -
+# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
+# $Id: makesyscalls.sh,v 1.34 1998/06/09 03:32:05 bde Exp $
+
+set -e
+
+# name of compat option:
+compat=COMPAT_43
+
+# output files:
+sysnames="syscalls.c"
+sysproto="../sys/sysproto.h"
+sysproto_h=_SYS_SYSPROTO_H_
+syshdr="../sys/syscall.h"
+sysmk="../sys/syscall.mk"
+syssw="init_sysent.c"
+syshide="../sys/syscall-hide.h"
+syscallprefix="SYS_"
+switchname="sysent"
+namesname="syscallnames"
+
+# tmp files:
+sysdcl="sysent.dcl.$$"
+syscompat="sysent.compat.$$"
+syscompatdcl="sysent.compatdcl.$$"
+sysent="sysent.switch.$$"
+sysinc="sysinc.switch.$$"
+sysarg="sysarg.switch.$$"
+
+trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0
+
+touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg
+
+case $# in
+	0) echo "Usage: $0 input-file [config-file]" 1>&2
+ exit 1
+ ;;
+esac
+
+if [ -n "$2" -a -f "$2" ]; then
+ . $2
+fi
+
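+# The sed script below normalizes the master file before awk sees it:
+# it strips the RCS '$' markers, joins backslash-continued lines, and,
+# from line 2 onward (outside '#' lines), pads the delimiters { } ( ) * ,
+# with spaces so that awk parses them as separate fields.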
+sed -e '
+s/\$//g
+:join
+ /\\$/{a\
+
+ N
+ s/\\\n//
+ b join
+ }
+2,${
+ /^#/!s/\([{}()*,]\)/ \1 /g
+}
+' < $1 | awk "
+ BEGIN {
+ sysdcl = \"$sysdcl\"
+ sysproto = \"$sysproto\"
+ sysproto_h = \"$sysproto_h\"
+ syscompat = \"$syscompat\"
+ syscompatdcl = \"$syscompatdcl\"
+ sysent = \"$sysent\"
+ syssw = \"$syssw\"
+ sysinc = \"$sysinc\"
+ sysarg = \"$sysarg\"
+ sysnames = \"$sysnames\"
+ syshdr = \"$syshdr\"
+ sysmk = \"$sysmk\"
+ compat = \"$compat\"
+ syshide = \"$syshide\"
+ syscallprefix = \"$syscallprefix\"
+ switchname = \"$switchname\"
+ namesname = \"$namesname\"
+ infile = \"$1\"
+ "'
+
+ printf "/*\n * System call switch table.\n *\n" > syssw
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
+
+ printf "/*\n * System call prototypes.\n *\n" > sysarg
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
+
+ printf "\n#ifdef %s\n\n", compat > syscompat
+
+ printf "/*\n * System call names.\n *\n" > sysnames
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+
+ printf "/*\n * System call numbers.\n *\n" > syshdr
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
+ printf "# FreeBSD system call names.\n" > sysmk
+ printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
+ printf "/*\n * System call hiders.\n *\n" > syshide
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide
+ }
+ NR == 1 {
+ gsub("[$]Id: ", "", $0)
+ gsub(" [$]", "", $0)
+
+ printf " * created from%s\n */\n\n", $0 > syssw
+
+ printf "\n/* The casts are bogus but will do for now. */\n" > sysent
+ printf "struct sysent %s[] = {\n",switchname > sysent
+
+ printf " * created from%s\n */\n\n", $0 > sysarg
+ printf "#ifndef %s\n", sysproto_h > sysarg
+ printf "#define\t%s\n\n", sysproto_h > sysarg
+ printf "#include <sys/signal.h>\n\n" > sysarg
+ printf "struct proc;\n\n" > sysarg
+ printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
+ printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
+
+ printf " * created from%s\n */\n\n", $0 > sysnames
+ printf "char *%s[] = {\n", namesname > sysnames
+
+ printf " * created from%s\n */\n\n", $0 > syshdr
+
+ printf "# created from%s\nMIASM = ", $0 > sysmk
+
+ printf " * created from%s\n */\n\n", $0 > syshide
+ next
+ }
+ NF == 0 || $1 ~ /^;/ {
+ next
+ }
+ $1 ~ /^#[ ]*include/ {
+ print > sysinc
+ next
+ }
+ $1 ~ /^#[ ]*if/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ savesyscall = syscall
+ next
+ }
+ $1 ~ /^#[ ]*else/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ syscall = savesyscall
+ next
+ }
+ $1 ~ /^#/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ next
+ }
+ syscall != $1 {
+ printf "%s: line %d: syscall number out of sync at %d\n",
+ infile, NR, syscall
+ printf "line is:\n"
+ print
+ exit 1
+ }
+ function parserr(was, wanted) {
+ printf "%s: line %d: unexpected %s (expected %s)\n",
+ infile, NR, was, wanted
+ exit 1
+ }
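+	# parseline() below expects entries shaped like this hypothetical
+	# example (illustrative, not quoted from syscalls.master):
+	#
+	#	3	STD	POSIX	{ ssize_t read(int fd, void *buf, size_t nbyte); }
+	#
+	# i.e. syscall number, type keyword, hide class, then the
+	# bracketed prototype.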
+ function parseline() {
+ f=4 # toss number and type
+ argc= 0;
+ bigargc = 0;
+ if ($NF != "}") {
+ funcalias=$(NF-2)
+ argalias=$(NF-1)
+ rettype=$NF
+ end=NF-3
+ } else {
+ funcalias=""
+ argalias=""
+ rettype="int"
+ end=NF
+ }
+ if ($2 == "NODEF") {
+ funcname=$4
+ return
+ }
+ if ($f != "{")
+ parserr($f, "{")
+ f++
+ if ($end != "}")
+ parserr($end, "}")
+ end--
+ if ($end != ";")
+ parserr($end, ";")
+ end--
+ if ($end != ")")
+ parserr($end, ")")
+ end--
+
+ f++ #function return type
+
+ funcname=$f
+ if (funcalias == "")
+ funcalias = funcname
+ if (argalias == "") {
+ argalias = funcname "_args"
+ if ($2 == "COMPAT")
+ argalias = "o" argalias
+ }
+ f++
+
+ if ($f != "(")
+			parserr($f, "(")
+ f++
+
+ if (f == end) {
+ if ($f != "void")
+ parserr($f, "argument definition")
+ return
+ }
+
+ while (f <= end) {
+ argc++
+ argtype[argc]=""
+ oldf=""
+ while (f < end && $(f+1) != ",") {
+ if (argtype[argc] != "" && oldf != "*")
+ argtype[argc] = argtype[argc]" ";
+ argtype[argc] = argtype[argc]$f;
+ oldf = $f;
+ f++
+ }
+ if (argtype[argc] == "")
+ parserr($f, "argument definition")
+ if (argtype[argc] == "off_t")
+ bigargc++
+ argname[argc]=$f;
+ f += 2; # skip name, and any comma
+ }
+ }
+ { comment = $4
+ if (NF < 7)
+ for (i = 5; i <= NF; i++)
+ comment = comment " " $i
+ }
+ $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" \
+ || $2 == "NOIMPL" {
+ parseline()
+ if ((!nosys || funcname != "nosys") && \
+ (funcname != "lkmnosys")) {
+ if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") {
+ printf("struct\t%s {\n", argalias) > sysarg
+ for (i = 1; i <= argc; i++)
+ printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n",
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > sysarg
+ printf("};\n") > sysarg
+ }
+ else if($2 != "NOARGS" && $2 != "NOPROTO")
+ printf("struct\t%s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ }
+ if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \
+ (!lkmnosys || funcname != "lkmnosys")) {
+ printf("%s\t%s __P((struct proc *, struct %s *))",
+ rettype, funcname, argalias) > sysdcl
+ if (funcname == "exit")
+ printf(" __dead2") > sysdcl
+ printf(";\n") > sysdcl
+ }
+ if (funcname == "nosys")
+ nosys = 1
+ if (funcname == "lkmnosys")
+ lkmnosys = 1
+ if ($2 != "NOIMPL") {
+ printf("\t{ %d, (sy_call_t *)%s },\t\t",
+ argc+bigargc, funcname) > sysent
+ if(length(funcname) < 11)
+ printf("\t") > sysent
+ printf("/* %d = %s */\n", syscall, funcalias) > sysent
+ } else {
+ printf("\t{ %d, (sy_call_t *)%s },\t\t",
+ argc+bigargc, "nosys") > sysent
+ if(length("nosys") < 11)
+ printf("\t") > sysent
+ printf("/* %d = %s */\n", syscall, funcalias) > sysent
+ }
+ printf("\t\"%s\",\t\t\t/* %d = %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ if ($2 != "NODEF") {
+ printf("#define\t%s%s\t%d\n", syscallprefix,
+ funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ }
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
+ }
+ $2 == "COMPAT" || $2 == "CPT_NOA" {
+ ncompat++
+ parseline()
+ if (argc != 0 && $2 != "CPT_NOA") {
+ printf("struct\t%s {\n", argalias) > syscompat
+ for (i = 1; i <= argc; i++)
+ printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n",
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > syscompat
+ printf("};\n") > syscompat
+ }
+ else if($2 != "CPT_NOA")
+ printf("struct\t%s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ printf("%s\to%s __P((struct proc *, struct %s *));\n",
+ rettype, funcname, argalias) > syscompatdcl
+ printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n",
+ argc+bigargc, funcname, syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ printf("\t\t\t\t/* %d is old %s */\n",
+ syscall, funcalias) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
+ }
+ $2 == "LIBCOMPAT" {
+ ncompat++
+ parseline()
+ printf("%s\to%s();\n", rettype, funcname) > syscompatdcl
+ printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n",
+ argc+bigargc, funcname, syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n",
+ syscallprefix, funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
+ }
+ $2 == "OBSOL" {
+ printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n",
+ syscall, comment) > sysent
+ printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
+ $4, syscall, comment) > sysnames
+ printf("\t\t\t\t/* %d is obsolete %s */\n",
+ syscall, comment) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, $4) > syshide
+ syscall++
+ next
+ }
+ $2 == "UNIMPL" {
+ printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n",
+ syscall, comment) > sysent
+ printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
+ syscall, syscall, comment) > sysnames
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, $4) > syshide
+ syscall++
+ next
+ }
+ {
+ printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2
+ exit 1
+ }
+ END {
+ if (ncompat != 0) {
+ printf "#include \"opt_compat.h\"\n\n" > syssw
+ printf "\n#ifdef %s\n", compat > sysinc
+ printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+ printf("#undef PAD_\n") > syscompatdcl
+ printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl
+
+ printf("\n") > sysmk
+ printf("};\n") > sysent
+ printf("};\n") > sysnames
+ printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
+ > syshdr
+ } '
+
+cat $sysinc $sysent >> $syssw
+cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
new file mode 100644
index 0000000..d6175ee
--- /dev/null
+++ b/sys/kern/md5c.c
@@ -0,0 +1,342 @@
+/*
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * $Id: md5c.c,v 1.14 1998/05/01 16:40:19 bde Exp $
+ *
+ * This code is the same as the code published by RSA Inc. It has been
+ * edited for clarity and style only.
+ */
+
+#include <sys/types.h>
+
+#ifdef KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <sys/md5.h>
+
+
+#ifdef KERNEL
+#define memset(x,y,z) bzero(x,z);
+#define memcpy(x,y,z) bcopy(y, x, z)
+#endif
+
+#if defined(__i386__) || defined(__alpha__)
+#define Encode memcpy
+#define Decode memcpy
+#else /* !(__i386__ || __alpha__) */
+
+/*
+ * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
+ * a multiple of 4.
+ */
+
+/* XXX not prototyped, and not compatible with memcpy(). */
+static void
+Encode (output, input, len)
+ unsigned char *output;
+ u_int32_t *input;
+ unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+}
+
+/*
+ * Decodes input (unsigned char) into output (u_int32_t). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Decode (output, input, len)
+ u_int32_t *output;
+ const unsigned char *input;
+ unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) |
+ (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24);
+}
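+
+/*
+ * Worked example (illustrative): Decode turns the byte sequence
+ * 0x04 0x03 0x02 0x01 into the u_int32_t 0x01020304, and Encode turns
+ * 0x01020304 back into the bytes 04 03 02 01.  Both are little-endian,
+ * which is why a plain memcpy suffices on the little-endian i386 and
+ * alpha above.
+ */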
+#endif /* !(__i386__ || __alpha__) */
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits. */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context. */
+
+void
+MD5Init (context)
+ MD5_CTX *context;
+{
+
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants. */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/*
+ * MD5 block update operation. Continues an MD5 message-digest
+ * operation, processing another message block, and updating the
+ * context.
+ */
+
+void
+MD5Update (context, input, inputLen)
+ MD5_CTX *context;
+ const unsigned char *input;
+ unsigned int inputLen;
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits */
+ if ((context->count[0] += ((u_int32_t)inputLen << 3))
+ < ((u_int32_t)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((u_int32_t)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ /* Transform as many times as possible. */
+ if (inputLen >= partLen) {
+ memcpy((void *)&context->buffer[index], (const void *)input,
+ partLen);
+ MD5Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy ((void *)&context->buffer[index], (const void *)&input[i],
+ inputLen-i);
+}
+
+/*
+ * MD5 padding. Adds padding followed by original length.
+ */
+
+void
+MD5Pad (context)
+ MD5_CTX *context;
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64. */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD5Update (context, bits, 8);
+}
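+
+/*
+ * Worked example (illustrative): after a 3-byte message, index = 3, so
+ * padLen = 56 - 3 = 53 and 3 + 53 + 8 length bytes fill one 64-byte
+ * block.  After a 60-byte message, index = 60, so padLen = 120 - 60 =
+ * 60 and 60 + 60 + 8 = 128 bytes span two blocks, which is what the
+ * 120 - index branch is for.
+ */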
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation, writing the
+ * message digest and zeroizing the context.
+ */
+
+void
+MD5Final (digest, context)
+ unsigned char digest[16];
+ MD5_CTX *context;
+{
+ /* Do padding. */
+ MD5Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information. */
+ memset ((void *)context, 0, sizeof (*context));
+}
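+
+/*
+ * Usage sketch (illustrative only): declare an MD5_CTX and a 16-byte
+ * digest buffer, call MD5Init(&ctx) once, MD5Update(&ctx, data, len)
+ * as many times as needed, and finally MD5Final(digest, &ctx), which
+ * pads the stream, stores the digest, and zeroizes the context.
+ */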
+
+/* MD5 basic transformation. Transforms state based on block. */
+
+void
+MD5Transform (state, block)
+ u_int32_t state[4];
+ const unsigned char block[64];
+{
+ u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information. */
+ memset ((void *)x, 0, sizeof (x));
+}
diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c
new file mode 100644
index 0000000..9a70d5c
--- /dev/null
+++ b/sys/kern/p1003_1b.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 1996, 1997, 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/syslog.h>
+#include <sys/module.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+
+#include <posix4/posix4.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* p31b_proc: Return a proc struct corresponding to a pid to operate on.
+ *
+ * Enforce permission policy.
+ *
+ * The policy is the same as for sending signals except there
+ * is no notion of process groups.
+ *
+ * pid == 0 means my process.
+ *
+ * This is disabled until I've got a permission gate in again:
+ * only root can do this.
+ */
+
+#if 0
+/*
+ * This is stolen from CANSIGNAL in kern_sig:
+ *
+ * Can process p, with pcred pc, do "write flavor" operations to process q?
+ */
+#define CAN_AFFECT(p, pc, q) \
+ ((pc)->pc_ucred->cr_uid == 0 || \
+ (pc)->p_ruid == (q)->p_cred->p_ruid || \
+ (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \
+ (pc)->p_ruid == (q)->p_ucred->cr_uid || \
+ (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid)
+#else
+#define CAN_AFFECT(p, pc, q) ((pc)->pc_ucred->cr_uid == 0)
+#endif
+
+/*
+ * p31b_proc: Look up a proc from a PID. If pid is 0 it is
+ * my own proc.
+ */
+int p31b_proc(struct proc *p, pid_t pid, struct proc **pp)
+{
+ int ret = 0;
+ struct proc *other_proc = 0;
+
+ if (pid == 0)
+ other_proc = p;
+ else
+ other_proc = pfind(pid);
+
+ if (other_proc)
+ {
+ /* Enforce permission policy.
+ */
+ if (CAN_AFFECT(p, p->p_cred, other_proc))
+ *pp = other_proc;
+ else
+ ret = EPERM;
+ }
+ else
+ ret = ESRCH;
+
+ return ret;
+}
+
+/* These system calls return ENOSYS if an entry that is not supported
+ * at run time is called. I am also logging, since some programs have
+ * started to use this when they shouldn't. The logging will be removed
+ * if it becomes annoying.
+ */
+int
+syscall_not_present(struct proc *p, const char *s, struct nosys_args *uap)
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ p->p_comm, p->p_pid, s);
+
+ /* a " return nosys(p, uap); " here causes a core dump.
+ */
+
+ return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via an LKM:
+ */
+
+static int sched_attach(void)
+{
+ return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int sched_attach(void)
+{
+ int ret = ksched_attach(&ksched);
+
+ if (ret == 0)
+ p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1);
+
+ return ret;
+}
+
+int sched_setparam(struct proc *p,
+ struct sched_setparam_args *uap)
+{
+ int e;
+
+ struct sched_param sched_param;
+ copyin(uap->param, &sched_param, sizeof(sched_param));
+
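+	/*
+	 * The (0 || ...) chains used in this file short-circuit: each
+	 * call runs only while e is still 0, so e ends up holding the
+	 * first error, if any.
+	 */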
+ (void) (0
+ || (e = p31b_proc(p, uap->pid, &p))
+ || (e = ksched_setparam(&p->p_retval[0], ksched, p,
+ (const struct sched_param *)&sched_param))
+ );
+
+ return e;
+}
+
+int sched_getparam(struct proc *p,
+ struct sched_getparam_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+
+ (void) (0
+ || (e = p31b_proc(p, uap->pid, &p))
+ || (e = ksched_getparam(&p->p_retval[0], ksched, p, &sched_param))
+ );
+
+ if (!e)
+ copyout(&sched_param, uap->param, sizeof(sched_param));
+
+ return e;
+}
+
+int sched_setscheduler(struct proc *p,
+ struct sched_setscheduler_args *uap)
+{
+ int e;
+
+ struct sched_param sched_param;
+ copyin(uap->param, &sched_param, sizeof(sched_param));
+
+ (void) (0
+ || (e = p31b_proc(p, uap->pid, &p))
+ || (e = ksched_setscheduler(&p->p_retval[0],
+ ksched, p, uap->policy,
+ (const struct sched_param *)&sched_param))
+ );
+
+ return e;
+}
+
+int sched_getscheduler(struct proc *p,
+ struct sched_getscheduler_args *uap)
+{
+ int e;
+ (void) (0
+ || (e = p31b_proc(p, uap->pid, &p))
+ || (e = ksched_getscheduler(&p->p_retval[0], ksched, p))
+ );
+
+ return e;
+}
+
+int sched_yield(struct proc *p,
+ struct sched_yield_args *uap)
+{
+ return ksched_yield(&p->p_retval[0], ksched);
+}
+
+int sched_get_priority_max(struct proc *p,
+ struct sched_get_priority_max_args *uap)
+{
+ return ksched_get_priority_max(&p->p_retval[0],
+ ksched, uap->policy);
+}
+
+int sched_get_priority_min(struct proc *p,
+ struct sched_get_priority_min_args *uap)
+{
+ return ksched_get_priority_min(&p->p_retval[0],
+ ksched, uap->policy);
+}
+
+int sched_rr_get_interval(struct proc *p,
+ struct sched_rr_get_interval_args *uap)
+{
+ int e;
+
+ (void) (0
+ || (e = p31b_proc(p, uap->pid, &p))
+ || (e = ksched_rr_get_interval(&p->p_retval[0], ksched,
+ p, uap->interval))
+ );
+
+ return e;
+}
+
+#endif
+
+static void p31binit(void *notused)
+{
+ (void) sched_attach();
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c
new file mode 100644
index 0000000..523f76b
--- /dev/null
+++ b/sys/kern/posix4_mib.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <posix4/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+
+/* OID_AUTO isn't working with sysconf(3). I guess I'd have to
+ * modify it to do a lookup by name from the index.
+ * For now I've left it as a top-level sysctl.
+ */
+
+#if 1
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_p1003_1b, num, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+
+#else
+
+#define P1B_SYSCTL(num, name) \
+SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \
+ name, CTLFLAG_RD, facility + num - 1, 0, "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+
+P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io);
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+/* p31b_setcfg: Set the configuration
+ */
+void p31b_setcfg(int num, int value)
+{
+ if (num >= 1 && num < CTL_P1003_1B_MAXID)
+ facility[num - 1] = value;
+}
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
new file mode 100644
index 0000000..9234732
--- /dev/null
+++ b/sys/kern/subr_autoconf.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Lawrence Berkeley Laboratories.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
+ *
+ * $Id: subr_autoconf.c,v 1.7 1998/12/04 22:54:51 archie Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+#ifdef UNUSED
+#include <sys/malloc.h>
+#endif
+
+/*
+ * Autoconfiguration subroutines.
+ */
+
+#ifdef UNUSED
+/*
+ * ioconf.c exports exactly two names: cfdata and cfroots. All system
+ * devices and drivers are found via these tables.
+ */
+extern struct cfdata cfdata[];
+extern short cfroots[];
+
+#define ROOT ((struct device *)NULL)
+
+struct matchinfo {
+ cfmatch_t fn;
+ struct device *parent;
+ void *aux;
+ struct cfdata *match;
+ int pri;
+};
+
+/*
+ * Apply the matching function and choose the best. This is used
+ * a few times and we want to keep the code small.
+ */
+static void
+mapply(m, cf)
+ register struct matchinfo *m;
+ register struct cfdata *cf;
+{
+ register int pri;
+
+ if (m->fn != NULL)
+ pri = (*m->fn)(m->parent, cf, m->aux);
+ else
+ pri = (*cf->cf_driver->cd_match)(m->parent, cf, m->aux);
+ if (pri > m->pri) {
+ m->match = cf;
+ m->pri = pri;
+ }
+}
+
+/*
+ * Iterate over all potential children of some device, calling the given
+ * function (default being the child's match function) for each one.
+ * Nonzero returns are matches; the highest value returned is considered
+ * the best match. Return the `found child' if we got a match, or NULL
+ * otherwise. The `aux' pointer is simply passed on through.
+ *
+ * Note that this function is designed so that it can be used to apply
+ * an arbitrary function to all potential children (its return value
+ * can be ignored).
+ */
+struct cfdata *
+config_search(fn, parent, aux)
+ cfmatch_t fn;
+ register struct device *parent;
+ void *aux;
+{
+ register struct cfdata *cf;
+ register short *p;
+ struct matchinfo m;
+
+ m.fn = fn;
+ m.parent = parent;
+ m.aux = aux;
+ m.match = NULL;
+ m.pri = 0;
+ for (cf = cfdata; cf->cf_driver; cf++) {
+ /*
+ * Skip cf if no longer eligible, otherwise scan through
+ * parents for one matching `parent', and try match function.
+ */
+ if (cf->cf_fstate == FSTATE_FOUND)
+ continue;
+ for (p = cf->cf_parents; *p >= 0; p++)
+ if (parent->dv_cfdata == &cfdata[*p])
+ mapply(&m, cf);
+ }
+ return (m.match);
+}
+
+/*
+ * Find the given root device.
+ * This is much like config_search, but there is no parent.
+ */
+struct cfdata *
+config_rootsearch(fn, rootname, aux)
+ register cfmatch_t fn;
+ register char *rootname;
+ register void *aux;
+{
+ register struct cfdata *cf;
+ register short *p;
+ struct matchinfo m;
+
+ m.fn = fn;
+ m.parent = ROOT;
+ m.aux = aux;
+ m.match = NULL;
+ m.pri = 0;
+ /*
+ * Look at root entries for matching name. We do not bother
+ * with found-state here since only one root should ever be
+ * searched (and it must be done first).
+ */
+ for (p = cfroots; *p >= 0; p++) {
+ cf = &cfdata[*p];
+ if (strcmp(cf->cf_driver->cd_name, rootname) == 0)
+ mapply(&m, cf);
+ }
+ return (m.match);
+}
+
+static char *msgs[3] = { "", " not configured\n", " unsupported\n" };
+
+/*
+ * The given `aux' argument describes a device that has been found
+ * on the given parent, but not necessarily configured. Locate the
+ * configuration data for that device (using the cd_match configuration
+ * driver function) and attach it, and return true. If the device was
+ * not configured, call the given `print' function and return 0.
+ */
+int
+config_found(parent, aux, print)
+ struct device *parent;
+ void *aux;
+ cfprint_t print;
+{
+ struct cfdata *cf;
+
+ if ((cf = config_search((cfmatch_t)NULL, parent, aux)) != NULL) {
+ config_attach(parent, cf, aux, print);
+ return (1);
+ }
+ printf(msgs[(*print)(aux, parent->dv_xname)]);
+ return (0);
+}
+
+/*
+ * As above, but for root devices.
+ */
+int
+config_rootfound(rootname, aux)
+ char *rootname;
+ void *aux;
+{
+ struct cfdata *cf;
+
+ if ((cf = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL) {
+ config_attach(ROOT, cf, aux, (cfprint_t)NULL);
+ return (1);
+ }
+ printf("root device %s not configured\n", rootname);
+ return (0);
+}
+
+/* just like sprintf(buf, "%d") except that it works from the end */
+static char *
+number(ep, n)
+ register char *ep;
+ register int n;
+{
+
+ *--ep = 0;
+ while (n >= 10) {
+ *--ep = (n % 10) + '0';
+ n /= 10;
+ }
+ *--ep = n + '0';
+ return (ep);
+}
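+
+/*
+ * For illustration: given char buf[10], number(&buf[sizeof buf], 42)
+ * writes "42\0" into the tail of buf and returns a pointer to the '4'.
+ */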
+
+/*
+ * Attach a found device. Allocates memory for device variables.
+ */
+void
+config_attach(parent, cf, aux, print)
+ register struct device *parent;
+ register struct cfdata *cf;
+ register void *aux;
+ cfprint_t print;
+{
+ register struct device *dev;
+ register struct cfdriver *cd;
+ register size_t lname, lunit;
+ register char *xunit;
+ int myunit;
+ char num[10];
+ static struct device **nextp = &alldevs;
+
+ cd = cf->cf_driver;
+ if (cd->cd_devsize < sizeof(struct device))
+ panic("config_attach");
+ myunit = cf->cf_unit;
+ if (cf->cf_fstate == FSTATE_NOTFOUND)
+ cf->cf_fstate = FSTATE_FOUND;
+ else
+ cf->cf_unit++;
+
+ /* compute length of name and decimal expansion of unit number */
+ lname = strlen(cd->cd_name);
+ xunit = number(&num[sizeof num], myunit);
+ lunit = &num[sizeof num] - xunit;
+ if (lname + lunit >= sizeof(dev->dv_xname))
+ panic("config_attach: device name too long");
+
+ /* get memory for all device vars */
+ dev = (struct device *)malloc(cd->cd_devsize, M_DEVBUF, M_WAITOK);
+ /* XXX cannot wait! */
+ bzero(dev, cd->cd_devsize);
+ *nextp = dev; /* link up */
+ nextp = &dev->dv_next;
+ dev->dv_class = cd->cd_class;
+ dev->dv_cfdata = cf;
+ dev->dv_unit = myunit;
+ bcopy(cd->cd_name, dev->dv_xname, lname);
+ bcopy(xunit, dev->dv_xname + lname, lunit);
+ dev->dv_parent = parent;
+ if (parent == ROOT)
+ printf("%s (root)", dev->dv_xname);
+ else {
+ printf("%s at %s", dev->dv_xname, parent->dv_xname);
+ (void) (*print)(aux, (char *)0);
+ }
+
+ /* put this device in the devices array */
+ if (dev->dv_unit >= cd->cd_ndevs) {
+ /*
+ * Need to expand the array.
+ */
+ int old = cd->cd_ndevs, oldbytes, new, newbytes;
+ void **nsp;
+
+ if (old == 0) {
+ nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/
+ bzero(nsp, MINALLOCSIZE);
+ cd->cd_ndevs = MINALLOCSIZE / sizeof(void *);
+ } else {
+ new = cd->cd_ndevs;
+ do {
+ new *= 2;
+ } while (new <= dev->dv_unit);
+ cd->cd_ndevs = new;
+ oldbytes = old * sizeof(void *);
+ newbytes = new * sizeof(void *);
+ nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/
+ bcopy(cd->cd_devs, nsp, oldbytes);
+ bzero(&nsp[old], newbytes - oldbytes);
+ free(cd->cd_devs, M_DEVBUF);
+ }
+ cd->cd_devs = nsp;
+ }
+ if (cd->cd_devs[dev->dv_unit])
+ panic("config_attach: duplicate %s", dev->dv_xname);
+ cd->cd_devs[dev->dv_unit] = dev;
+
+ /*
+ * Before attaching, clobber any unfound devices that are
+ * otherwise identical.
+ */
+ for (cf = cfdata; cf->cf_driver; cf++)
+ if (cf->cf_driver == cd && cf->cf_unit == dev->dv_unit &&
+ cf->cf_fstate == FSTATE_NOTFOUND)
+ cf->cf_fstate = FSTATE_FOUND;
+ (*cd->cd_attach)(parent, dev, aux);
+}
+
+/*
+ * Attach an event. These must come from initially-zero space (see
+ * commented-out assignments below), but that occurs naturally for
+ * device instance variables.
+ */
+void
+evcnt_attach(dev, name, ev)
+ struct device *dev;
+ const char *name;
+ struct evcnt *ev;
+{
+ static struct evcnt **nextp = &allevents;
+
+ KASSERT(strlen(name) < sizeof(ev->ev_name), ("evcnt_attach"));
+
+ /* ev->ev_next = NULL; */
+ ev->ev_dev = dev;
+ /* ev->ev_count = 0; */
+ snprintf(ev->ev_name, sizeof(ev->ev_name), "%s", name);
+ *nextp = ev;
+ nextp = &ev->ev_next;
+}
+
+#endif
+
+/*
+ * "Interrupt driven config" functions.
+ */
+static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
+ TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
+
+
+/* ARGSUSED */
+static void run_interrupt_driven_config_hooks __P((void *dummy));
+static void
+run_interrupt_driven_config_hooks(dummy)
+ void *dummy;
+{
+ struct intr_config_hook *hook;
+
+ for (hook = intr_config_hook_list.tqh_first; hook != NULL;
+ hook = hook->ich_links.tqe_next) {
+ (*hook->ich_func)(hook->ich_arg);
+ }
+
+ while (intr_config_hook_list.tqh_first != NULL) {
+ tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0);
+ }
+}
+SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
+ run_interrupt_driven_config_hooks, NULL)
+
+/*
+ * Register a hook that will be called after "cold"
+ * autoconfiguration is complete and interrupts can
+ * be used to complete initialization.
+ */
+int
+config_intrhook_establish(hook)
+ struct intr_config_hook *hook;
+{
+ struct intr_config_hook *hook_entry;
+
+ for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL;
+ hook_entry = hook_entry->ich_links.tqe_next)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry != NULL) {
+ printf("config_intrhook_establish: establishing an "
+ "already established hook.\n");
+ return (1);
+ }
+ TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+ if (cold == 0)
+ /* XXX Sufficient for LKMs loaded after initial config??? */
+ run_interrupt_driven_config_hooks(NULL);
+ return (0);
+}
+
+void
+config_intrhook_disestablish(hook)
+ struct intr_config_hook *hook;
+{
+ struct intr_config_hook *hook_entry;
+
+ for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL;
+ hook_entry = hook_entry->ich_links.tqe_next)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry == NULL)
+ panic("config_intrhook_disestablish: disestablishing an "
+ "unestablished hook");
+
+ TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+ /* Wakeup anyone watching the list */
+ wakeup(&intr_config_hook_list);
+}
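+
+/*
+ * Example usage (a sketch; the foo_* names are hypothetical): a driver
+ * that needs working interrupts to finish initialization registers a
+ * hook at attach time and removes it from within the hook function
+ * itself, which also wakes anyone sleeping in
+ * run_interrupt_driven_config_hooks():
+ *
+ *	static struct intr_config_hook foo_hook;
+ *
+ *	static void
+ *	foo_finish_init(void *arg)
+ *	{
+ *		... interrupt-driven setup ...
+ *		config_intrhook_disestablish(&foo_hook);
+ *	}
+ *
+ *	foo_hook.ich_func = foo_finish_init;
+ *	foo_hook.ich_arg = sc;
+ *	if (config_intrhook_establish(&foo_hook) != 0)
+ *		return (ENOMEM);
+ */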
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
new file mode 100644
index 0000000..10af2ea
--- /dev/null
+++ b/sys/kern/subr_blist.c
@@ -0,0 +1,928 @@
+
+/*
+ * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
+ *
+ * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution
+ * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT.
+ *
+ * This module implements a general bitmap allocator/deallocator. The
+ * allocator eats around 2 bits per 'block'. The module does not
+ * try to interpret the meaning of a 'block' other than to return
+ * SWAPBLK_NONE on an allocation failure.
+ *
+ * A radix tree is used to maintain the bitmap. Two radix constants are
+ * involved: One for the bitmaps contained in the leaf nodes (typically
+ * 32), and one for the meta nodes (typically 16). Both meta and leaf
+ * nodes have a hint field. This field gives us a hint as to the largest
+ * free contiguous range of blocks under the node. It may contain a
+ * value that is too high, but will never contain a value that is too
+ * low. When the radix tree is searched, allocation failures in subtrees
+ * update the hint.
+ *
+ * The radix tree also implements two collapsed states for meta nodes:
+ * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
+ * in either of these two states, all information contained underneath
+ * the node is considered stale. These states are used to optimize
+ * allocation and freeing operations.
+ *
+ * The hinting greatly increases code efficiency for allocations while
+ * the general radix structure optimizes both allocations and frees. The
+ * radix tree should be able to operate well no matter how much
+ * fragmentation there is and no matter how large a bitmap is used.
+ *
+ * Unlike the rlist code, the blist code wires all necessary memory at
+ * creation time. Neither allocations nor frees require interaction with
+ * the memory subsystem. In contrast, the rlist code may allocate memory
+ * on an rlist_free() call. The non-blocking features of the blist code
+ * are used to great advantage in the swap code (vm/nswap_pager.c). The
+ * rlist code uses a little less overall memory than the blist code (but
+ * due to swap interleaving not all that much less), but the blist code
+ * scales much, much better.
+ *
+ * LAYOUT: The radix tree is laid out recursively using a
+ * linear array. Each meta node is immediately followed (laid out
+ * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
+ * is a recursive structure but one that can be easily scanned through
+ * a very simple 'skip' calculation. In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation. We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan. The memory allocation is only large enough
+ * to cover the number of blocks requested at creation time even if it
+ * must be encompassed in a larger root-node radix.
+ *
+ * NOTE: the allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
+ * large' if you try. This is an area that could use improvement. The
+ * radix is large enough that this restriction does not affect the swap
+ * system, though. Currently only the allocation code is affected by
+ * this algorithmic unfeature. The freeing code can handle arbitrary
+ * ranges.
+ *
+ * This code can be compiled stand-alone for debugging.
+ */
+
+#ifdef KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/blist.h>
+#include <sys/malloc.h>
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+
+#else
+
+#ifndef BLIST_NO_DEBUG
+#define BLIST_DEBUG
+#endif
+
+#define SWAPBLK_NONE ((daddr_t)-1)
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#define malloc(a,b,c) malloc(a)
+#define free(a,b) free(a)
+
+typedef unsigned int u_daddr_t;
+
+#include <sys/blist.h>
+
+void panic(const char *ctl, ...);
+
+#endif
+
+/*
+ * static support functions
+ */
+
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
+ daddr_t count, daddr_t radix, int skip);
+static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, blist_t dest, daddr_t count);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
+ int skip, daddr_t count);
+#ifndef KERNEL
+static void blst_radix_print(blmeta_t *scan, daddr_t blk,
+ daddr_t radix, int skip, int tab);
+#endif
+
+#ifdef KERNEL
+static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
+#endif
+
+/*
+ * blist_create() - create a blist capable of handling up to the specified
+ * number of blocks
+ *
+ * blocks must be greater than 0
+ *
+ * The smallest blist consists of a single leaf node capable of
+ * managing BLIST_BMAP_RADIX blocks.
+ */
+
+blist_t
+blist_create(daddr_t blocks)
+{
+ blist_t bl;
+ int radix;
+ int skip = 0;
+
+ /*
+ * Calculate radix and skip field used for scanning.
+ */
+ radix = BLIST_BMAP_RADIX;
+
+ while (radix < blocks) {
+ radix <<= BLIST_META_RADIX_SHIFT;
+ skip = (skip + 1) << BLIST_META_RADIX_SHIFT;
+ }
+
+ bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK);
+
+ bzero(bl, sizeof(*bl));
+
+ bl->bl_blocks = blocks;
+ bl->bl_radix = radix;
+ bl->bl_skip = skip;
+ bl->bl_rootblks = 1 +
+ blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
+ bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK);
+
+#if defined(BLIST_DEBUG)
+ printf(
+ "BLIST representing %d blocks (%d MB of swap)"
+ ", requiring %dK of ram\n",
+ bl->bl_blocks,
+ bl->bl_blocks * 4 / 1024,
+ (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ );
+ printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
+#endif
+ blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
+
+ return(bl);
+}
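+
+/*
+ * Worked example of the radix/skip calculation above, assuming
+ * BLIST_BMAP_RADIX == 32 and BLIST_META_RADIX_SHIFT == 4 (i.e. a meta
+ * radix of 16, the typical values named in the header comment):
+ *
+ *	blist_create(4096):
+ *		start:	radix = 32,   skip = 0
+ *		pass 1:	radix = 512,  skip = 16
+ *		pass 2:	radix = 8192, skip = 272
+ *
+ * The loop stops once radix >= blocks, so a 4096-block blist gets a
+ * root radix of 8192 and a root skip of 272: the root has 16 subtrees
+ * of 17 array entries each (1 meta node + 16 leaves), and 16 * 17 = 272
+ * is exactly how far apart sibling subtrees sit in the linear array.
+ */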
+
+void
+blist_destroy(blist_t bl)
+{
+ free(bl->bl_root, M_SWAP);
+ free(bl, M_SWAP);
+}
+
+/*
+ * blist_alloc() - reserve space in the block bitmap. Return the base
+ * of a contiguous region or SWAPBLK_NONE if space could
+ * not be allocated.
+ */
+
+daddr_t
+blist_alloc(blist_t bl, daddr_t count)
+{
+ daddr_t blk = SWAPBLK_NONE;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blk = blst_leaf_alloc(bl->bl_root, 0, count);
+ else
+ blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
+ if (blk != SWAPBLK_NONE)
+ bl->bl_free -= count;
+ }
+ return(blk);
+}
+
+/*
+ * blist_free() - free up space in the block bitmap. Panic if an
+ * inconsistency is found.
+ */
+
+void
+blist_free(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blst_leaf_free(bl->bl_root, blkno, count);
+ else
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free += count;
+ }
+}
+
+/*
+ * blist_resize() - resize an existing radix tree to handle the
+ * specified number of blocks. This will reallocate
+ * the tree and transfer the previous bitmap to the new
+ * one. When extending the tree you can specify whether
+ * the new blocks are to be left allocated or freed.
+ */
+
+void
+blist_resize(blist_t *pbl, daddr_t count, int freenew)
+{
+ blist_t newbl = blist_create(count);
+ blist_t save = *pbl;
+
+ *pbl = newbl;
+ if (count > save->bl_blocks)
+ count = save->bl_blocks;
+ blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+
+ /*
+ * If resizing upwards, should we free the new space or not?
+ */
+ if (freenew && count < newbl->bl_blocks) {
+ blist_free(newbl, count, newbl->bl_blocks - count);
+ }
+ blist_destroy(save);
+}
+
+#ifdef BLIST_DEBUG
+
+/*
+ * blist_print() - dump radix tree
+ */
+
+void
+blist_print(blist_t bl)
+{
+ printf("BLIST {\n");
+ blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+ printf("}\n");
+}
+
+#endif
+
+/************************************************************************
+ * ALLOCATION SUPPORT FUNCTIONS *
+ ************************************************************************
+ *
+ * These support functions do all the actual work. They may seem
+ * rather longish, but that's because I've commented them up. The
+ * actual code is straightforward.
+ *
+ */
+
+/*
+ * blst_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
+ *
+ * This is the core of the allocator and is optimized for the 1 block
+ * and the BLIST_BMAP_RADIX block allocation cases. Other cases are
+ * somewhat slower. The 1 block allocation case is log2 and extremely
+ * quick.
+ */
+
+static daddr_t
+blst_leaf_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ u_daddr_t orig = scan->u.bmu_bitmap;
+
+ if (orig == 0) {
+ /*
+ * Optimize bitmap all-allocated case. Also, count = 1
+ * case assumes at least 1 bit is free in the bitmap, so
+ * we have to take care of this case here.
+ */
+ scan->bm_bighint = 0;
+ return(SWAPBLK_NONE);
+ }
+ if (count == 1) {
+ /*
+ * Optimized code to allocate one bit out of the bitmap
+ */
+ u_daddr_t mask;
+ int j = BLIST_BMAP_RADIX/2;
+ int r = 0;
+
+ mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
+
+ while (j) {
+ if ((orig & mask) == 0) {
+ r += j;
+ orig >>= j;
+ }
+ j >>= 1;
+ mask >>= j;
+ }
+		scan->u.bmu_bitmap &= ~((u_daddr_t)1 << r);
+ return(blk + r);
+ }
+ if (count <= BLIST_BMAP_RADIX) {
+ /*
+ * non-optimized code to allocate N bits out of the bitmap.
+ * The more bits, the faster the code runs. It will run
+ * the slowest allocating 2 bits, but since there aren't any
+ * memory ops in the core loop (or shouldn't be, anyway),
+ * you probably won't notice the difference.
+ */
+ int j;
+ int n = BLIST_BMAP_RADIX - count;
+ u_daddr_t mask;
+
+ mask = (u_daddr_t)-1 >> n;
+
+ for (j = 0; j <= n; ++j) {
+ if ((orig & mask) == mask) {
+ scan->u.bmu_bitmap &= ~mask;
+ return(blk + j);
+ }
+ mask = (mask << 1);
+ }
+ }
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
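+
+/*
+ * Worked trace of the count == 1 binary search above, assuming
+ * BLIST_BMAP_RADIX == 32 and orig == 0x00010000 (only bit 16 free):
+ *
+ *	j = 16, mask = 0x0000ffff: (orig & mask) == 0, so r = 16 and
+ *	    orig >>= 16 (now 1); then j = 8, mask = 0x000000ff
+ *	j = 8, 4, 2, 1: (orig & mask) != 0 each time, r stays 16
+ *
+ * The scan narrows in on a free bit in log2(BLIST_BMAP_RADIX) steps
+ * and returns blk + 16 after clearing that bit.
+ */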
+
+/*
+ * blst_meta_alloc() - allocate at a meta node in the radix tree.
+ *
+ * Attempt to allocate at a meta node. If we can't, we update
+ * bighint and return a failure. Updating bighint optimizes future
+ * calls that hit this node. We have to check for our collapse cases
+ * and we have a few optimizations strewn in as well.
+ */
+
+static daddr_t
+blst_meta_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t count,
+ daddr_t radix,
+ int skip
+) {
+ int i;
+ int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ scan->bm_bighint = count;
+ return(SWAPBLK_NONE);
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix >>= BLIST_META_RADIX_SHIFT;
+
+ /*
+		 * ALL-FREE special case, initialize the uninitialized
+ * sublevel.
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix >>= BLIST_META_RADIX_SHIFT;
+ }
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count <= scan[i].bm_bighint) {
+ /*
+ * count fits in object
+ */
+ daddr_t r;
+ if (next_skip == 1) {
+ r = blst_leaf_alloc(&scan[i], blk, count);
+ } else {
+ r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
+ }
+ if (r != SWAPBLK_NONE) {
+ scan->u.bmu_avail -= count;
+ if (scan->bm_bighint > scan->u.bmu_avail)
+ scan->bm_bighint = scan->u.bmu_avail;
+ return(r);
+ }
+ } else if (scan[i].bm_bighint == (daddr_t)-1) {
+ /*
+ * Terminator
+ */
+ break;
+ } else if (count > radix) {
+ /*
+ * count does not fit in object even if it were
+			 * completely free.
+ */
+ panic("blist_meta_alloc: allocation too large");
+ }
+ blk += radix;
+ }
+
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ if (scan->bm_bighint >= count)
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * BLST_LEAF_FREE() - free allocated block from leaf bitmap
+ *
+ */
+
+static void
+blst_leaf_free(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ /*
+ * free some data in this bitmap
+ *
+ * e.g.
+ * 0000111111111110000
+ * \_________/\__/
+ * v n
+ */
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ u_daddr_t mask;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ if (scan->u.bmu_bitmap & mask)
+ panic("blst_radix_free: freeing free block");
+ scan->u.bmu_bitmap |= mask;
+
+ /*
+ * We could probably do a better job here. We are required to make
+ * bighint at least as large as the biggest contiguous block of
+ * data. If we just shoehorn it, a little extra overhead will
+	 * be incurred on the next allocation (but only that one typically).
+ */
+ scan->bm_bighint = BLIST_BMAP_RADIX;
+}
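+
+/*
+ * Worked example of the mask construction above, assuming
+ * BLIST_BMAP_RADIX == 32, blk == 5 (within this leaf) and count == 9:
+ *
+ *	n    = 5
+ *	mask = ((u_daddr_t)-1 << 5) & ((u_daddr_t)-1 >> (32 - 9 - 5))
+ *	     = 0xffffffe0 & 0x00003fff = 0x00003fe0
+ *
+ * i.e. exactly bits 5..13 set -- the nine blocks being freed.
+ */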
+
+/*
+ * BLST_META_FREE() - free allocated blocks from radix tree meta info
+ *
+ * This support routine frees a range of blocks from the bitmap.
+ * The range must be entirely enclosed by this radix node. If a
+ * meta node, we break the range down recursively to free blocks
+ * in subnodes (which means that this code can free an arbitrary
+ * range whereas the allocation code cannot allocate an arbitrary
+ * range).
+ */
+
+static void
+blst_meta_free(
+ blmeta_t *scan,
+ daddr_t freeBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+#if 0
+ printf("FREE (%x,%d) FROM (%x,%d)\n",
+ freeBlk, count,
+ blk, radix
+ );
+#endif
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case, with possible
+ * shortcut to ALL-FREE special case.
+ */
+ scan->u.bmu_avail = count;
+ scan->bm_bighint = count;
+
+ if (count != radix) {
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ scan[i].bm_bighint = 0;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = 0;
+ } else {
+ scan[i].u.bmu_avail = 0;
+ }
+ }
+ /* fall through */
+ }
+ } else {
+ scan->u.bmu_avail += count;
+ /* scan->bm_bighint = radix; */
+ }
+
+ /*
+ * ALL-FREE special case.
+ */
+
+ if (scan->u.bmu_avail == radix)
+ return;
+#if !defined(MAX_PERF)
+ if (scan->u.bmu_avail > radix)
+ panic("blst_meta_free: freeing already free blocks (%d) %d/%d", count, scan->u.bmu_avail, radix);
+#endif
+
+ /*
+ * Break the free down into its components
+ */
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+
+ i = (freeBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < freeBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - freeBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_free: freeing unexpected range");
+
+ if (next_skip == 1) {
+ blst_leaf_free(&scan[i], freeBlk, v);
+ } else {
+ blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
+ }
+ if (scan->bm_bighint < scan[i].bm_bighint)
+ scan->bm_bighint = scan[i].bm_bighint;
+ count -= v;
+ freeBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+}
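+
+/*
+ * Worked example of the breakdown arithmetic above, assuming a node
+ * with radix 8192 and skip 272 (so next_skip == 17, child radix 512)
+ * and a free starting at freeBlk == 1000 with blk == 0:
+ *
+ *	i   = (1000 - 0) / 512 = 1	(the second child holds block 1000)
+ *	blk = 1 * 512 = 512
+ *	i   = 1 * 17 + 1 = 18		(array index of that child)
+ *
+ * The loop then walks sibling subtrees, clipping each piece of the
+ * range with v = blk + radix - freeBlk.
+ */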
+
+/*
+ * BLST_COPY() - copy one radix tree to another
+ *
+ * Locates free space in the source tree and frees it in the destination
+ * tree. The space must not already be free in the destination.
+ */
+
+static void blst_copy(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t radix,
+ daddr_t skip,
+ blist_t dest,
+ daddr_t count
+) {
+ int next_skip;
+ int i;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ u_daddr_t v = scan->u.bmu_bitmap;
+
+ if (v == (u_daddr_t)-1) {
+ blist_free(dest, blk, count);
+ } else if (v != 0) {
+ int i;
+
+ for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
+ if (v & (1 << i))
+ blist_free(dest, blk + i, 1);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Meta node
+ */
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * Source all allocated, leave dest allocated
+ */
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ /*
+ * Source all free, free entire dest
+ */
+ if (count < radix)
+ blist_free(dest, blk, count);
+ else
+ blist_free(dest, blk, radix);
+ return;
+ }
+
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ for (i = 1; count && i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+
+ if (count >= radix) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ radix
+ );
+ count -= radix;
+ } else {
+ if (count) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ count
+ );
+ }
+ count = 0;
+ }
+ blk += radix;
+ }
+}
+
+/*
+ * BLST_RADIX_INIT() - initialize radix tree
+ *
+ * Initialize our meta structures and bitmaps and calculate the exact
+ * amount of space required to manage 'count' blocks - this space may
+ * be considerably less than the calculated radix due to the large
+ * RADIX values we use.
+ */
+
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+{
+ int i;
+ int next_skip;
+ daddr_t memindex = 0;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_bitmap = 0;
+ }
+ return(memindex);
+ }
+
+ /*
+ * Meta node. If allocating the entire object we can special
+ * case it. However, we need to figure out how much memory
+ * is required to manage 'count' blocks, so we continue on anyway.
+ */
+
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_avail = 0;
+ }
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count >= radix) {
+ /*
+ * Allocate the entire object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ radix
+ );
+ count -= radix;
+ } else if (count > 0) {
+ /*
+ * Allocate a partial object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ count
+ );
+ count = 0;
+ } else {
+ /*
+ * Add terminator and break out
+ */
+ if (scan)
+ scan[i].bm_bighint = (daddr_t)-1;
+ break;
+ }
+ }
+ if (memindex < i)
+ memindex = i;
+ return(memindex);
+}
+
+#ifdef BLIST_DEBUG
+
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+{
+ int i;
+ int next_skip;
+ int lastState = 0;
+
+ if (radix == BLIST_BMAP_RADIX) {
+ printf(
+ "%*.*s(%04x,%d): bitmap %08x big=%d\n",
+ tab, tab, "",
+ blk, radix,
+ scan->u.bmu_bitmap,
+ scan->bm_bighint
+ );
+ return;
+ }
+
+ if (scan->u.bmu_avail == 0) {
+ printf(
+ "%*.*s(%04x,%d) ALL ALLOCATED\n",
+ tab, tab, "",
+ blk,
+ radix
+ );
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ printf(
+ "%*.*s(%04x,%d) ALL FREE\n",
+ tab, tab, "",
+ blk,
+ radix
+ );
+ return;
+ }
+
+ printf(
+ "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+ tab, tab, "",
+ blk, radix,
+ scan->u.bmu_avail,
+ radix,
+ scan->bm_bighint
+ );
+
+ radix >>= BLIST_META_RADIX_SHIFT;
+ next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+ tab += 4;
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1) {
+ printf(
+ "%*.*s(%04x,%d): Terminator\n",
+ tab, tab, "",
+ blk, radix
+ );
+ lastState = 0;
+ break;
+ }
+ blst_radix_print(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ tab
+ );
+ blk += radix;
+ }
+ tab -= 4;
+
+ printf(
+ "%*.*s}\n",
+ tab, tab, ""
+ );
+}
+
+#endif
+
+#ifdef BLIST_DEBUG
+
+int
+main(int ac, char **av)
+{
+ int size = 1024;
+ int i;
+ blist_t bl;
+
+ for (i = 1; i < ac; ++i) {
+ const char *ptr = av[i];
+ if (*ptr != '-') {
+ size = strtol(ptr, NULL, 0);
+ continue;
+ }
+ ptr += 2;
+ fprintf(stderr, "Bad option: %s\n", ptr - 2);
+ exit(1);
+ }
+ bl = blist_create(size);
+ blist_free(bl, 0, size);
+
+ for (;;) {
+ char buf[1024];
+ daddr_t da = 0;
+ daddr_t count = 0;
+
+
+ printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+ fflush(stdout);
+ if (fgets(buf, sizeof(buf), stdin) == NULL)
+ break;
+ switch(buf[0]) {
+ case 'r':
+ if (sscanf(buf + 1, "%d", &count) == 1) {
+ blist_resize(&bl, count, 1);
+ } else {
+ printf("?\n");
+			}
+			/* FALLTHROUGH -- print the tree after resizing */
+ case 'p':
+ blist_print(bl);
+ break;
+ case 'a':
+ if (sscanf(buf + 1, "%d", &count) == 1) {
+ daddr_t blk = blist_alloc(bl, count);
+ printf(" R=%04x\n", blk);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'f':
+ if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
+ blist_free(bl, da, count);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case '?':
+ case 'h':
+ puts(
+ "p -print\n"
+ "a %d -allocate\n"
+ "f %x %d -free\n"
+ "r %d -resize\n"
+ "h/? -help"
+ );
+ break;
+ default:
+ printf("?\n");
+ break;
+ }
+ }
+ return(0);
+}
+
+void
+panic(const char *ctl, ...)
+{
+ va_list va;
+
+ va_start(va, ctl);
+ vfprintf(stderr, ctl, va);
+ fprintf(stderr, "\n");
+ va_end(va);
+ exit(1);
+}
+
+#endif
+
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
new file mode 100644
index 0000000..dc4c88a
--- /dev/null
+++ b/sys/kern/subr_bus.c
@@ -0,0 +1,1572 @@
+/*-
+ * Copyright (c) 1997,1998 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: subr_bus.c,v 1.13 1999/01/10 22:04:05 n_hibma Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus_private.h>
+#include <sys/systm.h>
+#include <machine/stdarg.h> /* for device_printf() */
+
+#include "opt_bus.h"
+
+#ifdef BUS_DEBUG
+#define PDEBUG(a) (printf(__FUNCTION__ ":%d: ", __LINE__), printf a, printf("\n"))
+#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
+#define DRIVERNAME(d) ((d)? d->name : "no driver")
+#define DEVCLANAME(d) ((d)? d->name : "no devclass")
+
+/* Produce the indenting, indent*2 spaces plus a '.' ahead of that to
+ * prevent syslog from deleting initial spaces
+ */
+#define indentprintf(p)	do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf("  "); printf p ; } while(0)
+
+static void print_method_list(device_method_t *m, int indent);
+static void print_device_ops(device_ops_t ops, int indent);
+static void print_device_short(device_t dev, int indent);
+static void print_device(device_t dev, int indent);
+void print_device_tree_short(device_t dev, int indent);
+void print_device_tree(device_t dev, int indent);
+static void print_driver_short(driver_t *driver, int indent);
+static void print_driver(driver_t *driver, int indent);
+static void print_driver_list(driver_list_t drivers, int indent);
+static void print_devclass_short(devclass_t dc, int indent);
+static void print_devclass(devclass_t dc, int indent);
+void print_devclass_list_short(void);
+void print_devclass_list(void);
+
+#else
+/* Make the compiler ignore the function calls */
+#define PDEBUG(a) /* nop */
+#define DEVICENAME(d) /* nop */
+#define DRIVERNAME(d) /* nop */
+#define DEVCLANAME(d) /* nop */
+
+#define print_method_list(m,i) /* nop */
+#define print_device_ops(o,i) /* nop */
+#define print_device_short(d,i) /* nop */
+#define print_device(d,i) /* nop */
+#define print_device_tree_short(d,i) /* nop */
+#define print_device_tree(d,i) /* nop */
+#define print_driver_short(d,i) /* nop */
+#define print_driver(d,i) /* nop */
+#define print_driver_list(d,i) /* nop */
+#define print_devclass_short(d,i) /* nop */
+#define print_devclass(d,i) /* nop */
+#define print_devclass_list_short() /* nop */
+#define print_devclass_list() /* nop */
+#endif
+
+
+/*
+ * Method table handling
+ */
+static int next_method_offset = 1;
+static int methods_count = 0;
+static int methods_size = 0;
+
+struct method {
+ int offset;
+ char* name;
+};
+
+static struct method *methods = 0;
+
+static void
+register_method(struct device_op_desc *desc)
+{
+ int i;
+ struct method* m;
+
+ for (i = 0; i < methods_count; i++)
+ if (!strcmp(methods[i].name, desc->name)) {
+ desc->offset = methods[i].offset;
+ PDEBUG(("methods[%d] has the same name, %s, with offset %d",
+ i, desc->name, desc->offset));
+ return;
+ }
+
+ if (methods_count == methods_size) {
+ struct method* p;
+
+ methods_size += 10;
+ p = (struct method*) malloc(methods_size * sizeof(struct method),
+ M_DEVBUF, M_NOWAIT);
+ if (!p)
+ panic("register_method: out of memory");
+ if (methods) {
+ bcopy(methods, p, methods_count * sizeof(struct method));
+ free(methods, M_DEVBUF);
+ }
+ methods = p;
+ }
+ m = &methods[methods_count++];
+ m->name = malloc(strlen(desc->name) + 1, M_DEVBUF, M_NOWAIT);
+ if (!m->name)
+ panic("register_method: out of memory");
+ strcpy(m->name, desc->name);
+ desc->offset = m->offset = next_method_offset++;
+}
+
+static int error_method(void)
+{
+ return ENXIO;
+}
+
+static struct device_ops null_ops = {
+ 1,
+ { error_method }
+};
+
+static void
+compile_methods(driver_t *driver)
+{
+ device_ops_t ops;
+ struct device_method *m;
+ int i;
+
+ /*
+ * First register any methods which need it.
+ */
+ for (i = 0, m = driver->methods; m->desc; i++, m++)
+ if (!m->desc->offset)
+ register_method(m->desc);
+ else
+ PDEBUG(("offset not equal to zero, method desc %d left as is", i));
+
+ /*
+ * Then allocate the compiled op table.
+ */
+ ops = malloc(sizeof(struct device_ops) + (next_method_offset-1) * sizeof(devop_t),
+ M_DEVBUF, M_NOWAIT);
+ if (!ops)
+ panic("compile_methods: out of memory");
+
+ ops->maxoffset = next_method_offset;
+ for (i = 0; i < next_method_offset; i++)
+ ops->methods[i] = error_method;
+ for (i = 0, m = driver->methods; m->desc; i++, m++)
+ ops->methods[m->desc->offset] = m->func;
+ PDEBUG(("%s has %d method%s, wasting %d bytes",
+ DRIVERNAME(driver), i, (i==1?"":"s"),
+ (next_method_offset-i)*sizeof(devop_t)));
+
+ driver->ops = ops;
+}
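+
+/*
+ * A sketch of what compile_methods buys us: once a driver's table has
+ * been compiled, a method invocation is just an indexed jump through
+ * the ops array using the globally-registered offset, roughly
+ *
+ *	devop_t func = dev->ops->methods[device_probe_desc.offset];
+ *
+ * (the exact cast and call are generated by the interface compiler;
+ * device_probe_desc here stands for whatever method descriptor is
+ * being invoked). Missing methods land on error_method and return
+ * ENXIO instead of crashing.
+ */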
+
+/*
+ * Devclass implementation
+ */
+
+static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
+
+static devclass_t
+devclass_find_internal(const char *classname, int create)
+{
+ devclass_t dc;
+
+ PDEBUG(("looking for %s", classname));
+ if (!classname)
+ return NULL;
+
+ for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link))
+ if (!strcmp(dc->name, classname))
+ return dc;
+
+ PDEBUG(("%s not found%s", classname, (create? ", creating": "")));
+ if (create) {
+ dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
+ M_DEVBUF, M_NOWAIT);
+ if (!dc)
+ return NULL;
+ dc->name = (char*) (dc + 1);
+ strcpy(dc->name, classname);
+ dc->devices = NULL;
+ dc->maxunit = 0;
+ dc->nextunit = 0;
+ TAILQ_INIT(&dc->drivers);
+ TAILQ_INSERT_TAIL(&devclasses, dc, link);
+ }
+
+ return dc;
+}
+
+devclass_t
+devclass_find(const char *classname)
+{
+ return devclass_find_internal(classname, FALSE);
+}
+
+int
+devclass_add_driver(devclass_t dc, driver_t *driver)
+{
+ PDEBUG(("%s", DRIVERNAME(driver)));
+ /*
+	 * Compile the driver's methods.
+ */
+ compile_methods(driver);
+
+ /*
+ * Make sure the devclass which the driver is implementing exists.
+ */
+ devclass_find_internal(driver->name, TRUE);
+
+ TAILQ_INSERT_TAIL(&dc->drivers, driver, link);
+
+ return 0;
+}
+
+int
+devclass_delete_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ device_t dev;
+ int i;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return 0;
+
+ /*
+ * Disassociate from any devices. We iterate through all the
+ * devices in the devclass of the driver and detach any which are
+ * using the driver.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver) {
+ if (error = device_detach(dev))
+ return error;
+ device_set_driver(dev, NULL);
+ }
+ }
+ }
+
+ TAILQ_REMOVE(&busclass->drivers, driver, link);
+ return 0;
+}
+
+driver_t *
+devclass_find_driver(devclass_t dc, const char *classname)
+{
+ driver_t *driver;
+
+ PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
+
+ for (driver = TAILQ_FIRST(&dc->drivers); driver;
+ driver = TAILQ_NEXT(driver, link)) {
+ if (!strcmp(driver->name, classname))
+ return driver;
+ }
+
+ PDEBUG(("not found"));
+ return NULL;
+}
+
+const char *
+devclass_get_name(devclass_t dc)
+{
+ return dc->name;
+}
+
+device_t
+devclass_get_device(devclass_t dc, int unit)
+{
+ if (unit < 0 || unit >= dc->maxunit)
+ return NULL;
+ return dc->devices[unit];
+}
+
+void *
+devclass_get_softc(devclass_t dc, int unit)
+{
+ device_t dev;
+
+ if (unit < 0 || unit >= dc->maxunit)
+ return NULL;
+ dev = dc->devices[unit];
+ if (!dev || dev->state < DS_ATTACHED)
+ return NULL;
+ return dev->softc;
+}
+
+int
+devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
+{
+ int i;
+ int count;
+ device_t *list;
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ count++;
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT);
+ if (!list)
+ return ENOMEM;
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i]) {
+ list[count] = dc->devices[i];
+ count++;
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return 0;
+}
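+
+/*
+ * Example (a sketch): the returned array is owned by the caller and
+ * must be freed with the M_TEMP type used above:
+ *
+ *	device_t *devs;
+ *	int ndevs, i;
+ *
+ *	if (devclass_get_devices(dc, &devs, &ndevs) == 0) {
+ *		for (i = 0; i < ndevs; i++)
+ *			... inspect devs[i] ...
+ *		free(devs, M_TEMP);
+ *	}
+ */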
+
+int
+devclass_get_maxunit(devclass_t dc)
+{
+ return dc->maxunit;
+}
+
+static int
+devclass_alloc_unit(devclass_t dc, int *unitp)
+{
+ int unit = *unitp;
+
+ PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ /*
+ * If we have been given a wired unit number, check for existing
+ * device.
+ */
+ if (unit != -1) {
+ device_t dev;
+ dev = devclass_get_device(dc, unit);
+ if (dev) {
+ printf("devclass_alloc_unit: %s%d already exists, using next available unit number\n", dc->name, unit);
+ unit = -1;
+ }
+ }
+
+ if (unit == -1) {
+ unit = dc->nextunit;
+ dc->nextunit++;
+ } else if (dc->nextunit <= unit)
+ dc->nextunit = unit + 1;
+
+ if (unit >= dc->maxunit) {
+ device_t *newlist;
+ int newsize;
+
+ newsize = (dc->maxunit ? 2 * dc->maxunit
+ : MINALLOCSIZE / sizeof(device_t));
+ newlist = malloc(sizeof(device_t) * newsize, M_DEVBUF, M_NOWAIT);
+ if (!newlist)
+ return ENOMEM;
+ bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit);
+ bzero(newlist + dc->maxunit,
+ sizeof(device_t) * (newsize - dc->maxunit));
+ if (dc->devices)
+ free(dc->devices, M_DEVBUF);
+ dc->devices = newlist;
+ dc->maxunit = newsize;
+ }
+ PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ *unitp = unit;
+ return 0;
+}
+
+static int
+devclass_add_device(devclass_t dc, device_t dev)
+{
+ int error;
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ if (error = devclass_alloc_unit(dc, &dev->unit))
+ return error;
+ dc->devices[dev->unit] = dev;
+ dev->devclass = dc;
+ return 0;
+}
+
+static int
+devclass_delete_device(devclass_t dc, device_t dev)
+{
+ if (!dc || !dev)
+ return 0;
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ if (dev->devclass != dc
+ || dc->devices[dev->unit] != dev)
+ panic("devclass_delete_device: inconsistent device class");
+ dc->devices[dev->unit] = NULL;
+ if (dev->flags & DF_WILDCARD)
+ dev->unit = -1;
+ dev->devclass = NULL;
+ while (dc->nextunit > 0 && dc->devices[dc->nextunit - 1] == NULL)
+ dc->nextunit--;
+ return 0;
+}
+
+static device_t
+make_device(device_t parent, const char *name,
+ int unit, void *ivars)
+{
+ device_t dev;
+ devclass_t dc;
+ int error;
+
+ PDEBUG(("%s at %s as unit %d with%s ivars",
+ name, DEVICENAME(parent), unit, (ivars? "":"out")));
+
+ if (name) {
+ dc = devclass_find_internal(name, TRUE);
+ if (!dc) {
+ printf("make_device: can't find device class %s\n", name);
+ return NULL;
+ }
+
+ if (error = devclass_alloc_unit(dc, &unit))
+ return NULL;
+ } else
+ dc = NULL;
+
+ dev = malloc(sizeof(struct device), M_DEVBUF, M_NOWAIT);
+ if (!dev)
+ return 0;
+
+ dev->parent = parent;
+ TAILQ_INIT(&dev->children);
+ dev->ops = &null_ops;
+ dev->driver = NULL;
+ dev->devclass = dc;
+ dev->unit = unit;
+ dev->desc = NULL;
+ dev->busy = 0;
+ dev->flags = DF_ENABLED;
+ if (unit == -1)
+ dev->flags |= DF_WILDCARD;
+ if (name)
+ dev->flags |= DF_FIXEDCLASS;
+ dev->ivars = ivars;
+ dev->softc = NULL;
+
+ if (dc)
+ dc->devices[unit] = dev;
+
+ dev->state = DS_NOTPRESENT;
+
+ return dev;
+}
+
+static void
+device_print_child(device_t dev, device_t child)
+{
+ printf("%s%d", device_get_name(child), device_get_unit(child));
+ if (device_is_alive(child)) {
+ if (device_get_desc(child))
+ printf(": <%s>", device_get_desc(child));
+ BUS_PRINT_CHILD(dev, child);
+ } else
+ printf(" not found");
+ printf("\n");
+}
+
+device_t
+device_add_child(device_t dev, const char *name, int unit, void *ivars)
+{
+ device_t child;
+
+ PDEBUG(("%s at %s as unit %d with%s ivars",
+ name, DEVICENAME(dev), unit, (ivars? "":"out")));
+
+ child = make_device(dev, name, unit, ivars);
+
+ if (child)
+ TAILQ_INSERT_TAIL(&dev->children, child, link);
+ else
+ PDEBUG(("%s failed", name));
+
+ return child;
+}
+
+device_t
+device_add_child_after(device_t dev, device_t place, const char *name,
+ int unit, void *ivars)
+{
+ device_t child;
+
+ PDEBUG(("%s at %s after %s as unit %d with%s ivars",
+ name, DEVICENAME(dev), DEVICENAME(place), unit, (ivars? "":"out")));
+
+	child = make_device(dev, name, unit, ivars);
+	if (!child)
+		return NULL;
+
+	if (place) {
+		TAILQ_INSERT_AFTER(&dev->children, place, child, link);
+	} else {
+		TAILQ_INSERT_HEAD(&dev->children, child, link);
+	}
+
+ return child;
+}
+
+int
+device_delete_child(device_t dev, device_t child)
+{
+ int error;
+ device_t grandchild;
+
+ PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
+
+ /* remove children first */
+ while ( (grandchild = TAILQ_FIRST(&child->children)) ) {
+ error = device_delete_child(child, grandchild);
+ if (error)
+ return error;
+ }
+
+ if (error = device_detach(child))
+ return error;
+ if (child->devclass)
+ devclass_delete_device(child->devclass, child);
+ TAILQ_REMOVE(&dev->children, child, link);
+ free(child, M_DEVBUF);
+
+ return 0;
+}
+
+/*
+ * Find only devices attached to this bus.
+ */
+device_t
+device_find_child(device_t dev, const char *classname, int unit)
+{
+ devclass_t dc;
+ device_t child;
+
+ dc = devclass_find(classname);
+ if (!dc)
+ return NULL;
+
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return child;
+ return NULL;
+}
+
+static driver_t *
+first_matching_driver(devclass_t dc, device_t dev)
+{
+ if (dev->devclass)
+ return devclass_find_driver(dc, dev->devclass->name);
+ else
+ return TAILQ_FIRST(&dc->drivers);
+}
+
+static driver_t *
+next_matching_driver(devclass_t dc, device_t dev, driver_t *last)
+{
+ if (dev->devclass) {
+ driver_t *driver;
+ for (driver = TAILQ_NEXT(last, link); driver;
+ driver = TAILQ_NEXT(driver, link))
+ if (!strcmp(dev->devclass->name, driver->name))
+ return driver;
+ return NULL;
+ } else
+ return TAILQ_NEXT(last, link);
+}
+
+static int
+device_probe_child(device_t dev, device_t child)
+{
+ devclass_t dc;
+ driver_t *driver;
+
+ dc = dev->devclass;
+ if (dc == NULL)
+ panic("device_probe_child: parent device has no devclass");
+
+ if (child->state == DS_ALIVE)
+ return 0;
+
+ for (driver = first_matching_driver(dc, child);
+ driver;
+ driver = next_matching_driver(dc, child, driver)) {
+ PDEBUG(("Trying %s", DRIVERNAME(driver)));
+ device_set_driver(child, driver);
+ if (DEVICE_PROBE(child) == 0) {
+ if (!child->devclass)
+ device_set_devclass(child, driver->name);
+ child->state = DS_ALIVE;
+ return 0;
+ }
+ }
+
+ return ENXIO;
+}
+
+device_t
+device_get_parent(device_t dev)
+{
+ return dev->parent;
+}
+
+int
+device_get_children(device_t dev, device_t **devlistp, int *devcountp)
+{
+ int count;
+ device_t child;
+ device_t *list;
+
+ count = 0;
+ for (child = TAILQ_FIRST(&dev->children); child;
+ child = TAILQ_NEXT(child, link))
+ count++;
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT);
+ if (!list)
+ return ENOMEM;
+
+ count = 0;
+ for (child = TAILQ_FIRST(&dev->children); child;
+ child = TAILQ_NEXT(child, link)) {
+ list[count] = child;
+ count++;
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return 0;
+}
+
+driver_t *
+device_get_driver(device_t dev)
+{
+ return dev->driver;
+}
+
+devclass_t
+device_get_devclass(device_t dev)
+{
+ return dev->devclass;
+}
+
+const char *
+device_get_name(device_t dev)
+{
+ if (dev->devclass)
+ return devclass_get_name(dev->devclass);
+ return NULL;
+}
+
+int
+device_get_unit(device_t dev)
+{
+ return dev->unit;
+}
+
+const char *
+device_get_desc(device_t dev)
+{
+ return dev->desc;
+}
+
+void
+device_print_prettyname(device_t dev)
+{
+ const char *name = device_get_name(dev);
+
+ if (name == 0)
+ name = "(no driver assigned)";
+ printf("%s%d: ", name, device_get_unit(dev));
+}
+
+void
+device_printf(device_t dev, const char * fmt, ...)
+{
+ va_list ap;
+
+ device_print_prettyname(dev);
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+}
+
+void
+device_set_desc(device_t dev, const char* desc)
+{
+ dev->desc = desc;
+}
+
+void *
+device_get_softc(device_t dev)
+{
+ return dev->softc;
+}
+
+void *
+device_get_ivars(device_t dev)
+{
+ return dev->ivars;
+}
+
+device_state_t
+device_get_state(device_t dev)
+{
+ return dev->state;
+}
+
+void
+device_enable(device_t dev)
+{
+ dev->flags |= DF_ENABLED;
+}
+
+void
+device_disable(device_t dev)
+{
+ dev->flags &= ~DF_ENABLED;
+}
+
+void
+device_busy(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ panic("device_busy: called for unattached device");
+ if (dev->busy == 0 && dev->parent)
+ device_busy(dev->parent);
+ dev->busy++;
+ dev->state = DS_BUSY;
+}
+
+void
+device_unbusy(device_t dev)
+{
+ if (dev->state != DS_BUSY)
+ panic("device_unbusy: called for non-busy device");
+ dev->busy--;
+ if (dev->busy == 0) {
+ if (dev->parent)
+ device_unbusy(dev->parent);
+ dev->state = DS_ATTACHED;
+ }
+}
+
+int
+device_is_enabled(device_t dev)
+{
+ return (dev->flags & DF_ENABLED) != 0;
+}
+
+int
+device_is_alive(device_t dev)
+{
+ return dev->state >= DS_ALIVE;
+}
+
+int
+device_set_devclass(device_t dev, const char *classname)
+{
+ devclass_t dc;
+
+ if (dev->devclass) {
+ printf("device_set_devclass: device class already set\n");
+ return EINVAL;
+ }
+
+ dc = devclass_find_internal(classname, TRUE);
+ if (!dc)
+ return ENOMEM;
+
+ return devclass_add_device(dc, dev);
+}
+
+int
+device_set_driver(device_t dev, driver_t *driver)
+{
+ if (dev->state >= DS_ATTACHED)
+ return EBUSY;
+
+ if (dev->driver == driver)
+ return 0;
+
+ if (dev->softc) {
+ free(dev->softc, M_DEVBUF);
+ dev->softc = NULL;
+ }
+ dev->ops = &null_ops;
+ dev->driver = driver;
+ if (driver) {
+ dev->ops = driver->ops;
+ dev->softc = malloc(driver->softc, M_DEVBUF, M_NOWAIT);
+ if (!dev->softc) {
+ dev->ops = &null_ops;
+ dev->driver = NULL;
+ return ENOMEM;
+ }
+ bzero(dev->softc, driver->softc);
+ }
+ return 0;
+}
+
+int
+device_probe_and_attach(device_t dev)
+{
+ device_t bus = dev->parent;
+ int error = 0;
+
+ if (dev->state >= DS_ALIVE)
+ return 0;
+
+ if (dev->flags & DF_ENABLED) {
+ error = device_probe_child(bus, dev);
+ if (!error) {
+ device_print_child(bus, dev);
+ error = DEVICE_ATTACH(dev);
+ if (!error)
+ dev->state = DS_ATTACHED;
+ else {
+ printf("device_probe_and_attach: %s%d attach returned %d\n",
+ dev->driver->name, dev->unit, error);
+ device_set_driver(dev, NULL);
+ dev->state = DS_NOTPRESENT;
+ }
+ }
+ } else {
+ device_print_prettyname(dev);
+ printf("not probed (disabled)\n");
+ }
+
+ return error;
+}
+
+int
+device_detach(device_t dev)
+{
+ int error;
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return EBUSY;
+ if (dev->state != DS_ATTACHED)
+ return 0;
+
+ if (error = DEVICE_DETACH(dev))
+ return error;
+
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+
+ dev->state = DS_NOTPRESENT;
+ device_set_driver(dev, NULL);
+
+ return 0;
+}
+
+int
+device_shutdown(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ return 0;
+ return DEVICE_SHUTDOWN(dev);
+}
+
+/*
+ * Access functions for device resources.
+ */
+extern struct config_device devtab[];
+extern int devtab_count;
+
+static int
+resource_match_string(int i, char *resname, char *value)
+{
+ int j;
+ struct config_resource *res;
+
+ for (j = 0, res = devtab[i].resources;
+ j < devtab[i].resource_count; j++, res++)
+ if (!strcmp(res->name, resname)
+ && res->type == RES_STRING
+ && !strcmp(res->u.stringval, value))
+ return TRUE;
+ return FALSE;
+}
+
+static int
+resource_find(const char *name, int unit, char *resname,
+ struct config_resource **result)
+{
+ int i, j;
+ struct config_resource *res;
+
+ /*
+ * First check specific instances, then generic.
+ */
+ for (i = 0; i < devtab_count; i++) {
+ if (devtab[i].unit < 0)
+ continue;
+ if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) {
+ res = devtab[i].resources;
+ for (j = 0; j < devtab[i].resource_count; j++, res++)
+ if (!strcmp(res->name, resname)) {
+ *result = res;
+ return 0;
+ }
+ }
+ }
+ for (i = 0; i < devtab_count; i++) {
+ if (devtab[i].unit >= 0)
+ continue;
+ if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) {
+ res = devtab[i].resources;
+ for (j = 0; j < devtab[i].resource_count; j++, res++)
+ if (!strcmp(res->name, resname)) {
+ *result = res;
+ return 0;
+ }
+ }
+ }
+ return ENOENT;
+}
+
+int
+resource_int_value(const char *name, int unit, char *resname, int *result)
+{
+ int error;
+ struct config_resource *res;
+ if ((error = resource_find(name, unit, resname, &res)) != 0)
+ return error;
+ if (res->type != RES_INT)
+ return EFTYPE;
+ *result = res->u.intval;
+ return 0;
+}
+
+int
+resource_long_value(const char *name, int unit, char *resname, long *result)
+{
+ int error;
+ struct config_resource *res;
+ if ((error = resource_find(name, unit, resname, &res)) != 0)
+ return error;
+ if (res->type != RES_LONG)
+ return EFTYPE;
+ *result = res->u.longval;
+ return 0;
+}
+
+int
+resource_string_value(const char *name, int unit, char *resname, char **result)
+{
+ int error;
+ struct config_resource *res;
+ if ((error = resource_find(name, unit, resname, &res)) != 0)
+ return error;
+ if (res->type != RES_STRING)
+ return EFTYPE;
+ *result = res->u.stringval;
+ return 0;
+}
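+
+/*
+ * Example (a sketch; "foo" and the resource names are hypothetical):
+ * a driver can read its static configuration, compiled into devtab[]
+ * from the kernel config file, as
+ *
+ *	int irq;
+ *	char *flavor;
+ *
+ *	if (resource_int_value("foo", 0, "irq", &irq) == 0)
+ *		... use the wired irq for foo0 ...
+ *	if (resource_string_value("foo", 0, "flavor", &flavor) == 0)
+ *		... use the string-valued hint ...
+ */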
+
+int
+resource_query_string(int i, char *resname, char *value)
+{
+ if (i < 0)
+ i = 0;
+ else
+ i = i + 1;
+ for (; i < devtab_count; i++)
+ if (resource_match_string(i, resname, value))
+ return i;
+ return -1;
+}
+
+char *
+resource_query_name(int i)
+{
+ return devtab[i].name;
+}
+
+int
+resource_query_unit(int i)
+{
+ return devtab[i].unit;
+}
+
+
+/*
+ * Some useful method implementations to make life easier for bus drivers.
+ */
+int
+bus_generic_attach(device_t dev)
+{
+ device_t child;
+
+ for (child = TAILQ_FIRST(&dev->children);
+ child; child = TAILQ_NEXT(child, link))
+ device_probe_and_attach(child);
+
+ return 0;
+}
+
+int
+bus_generic_detach(device_t dev)
+{
+ device_t child;
+ int error;
+
+ if (dev->state != DS_ATTACHED)
+ return EBUSY;
+
+ for (child = TAILQ_FIRST(&dev->children);
+ child; child = TAILQ_NEXT(child, link))
+ if (error = device_detach(child))
+ return error;
+
+ return 0;
+}
+
+int
+bus_generic_shutdown(device_t dev)
+{
+ device_t child;
+
+ for (child = TAILQ_FIRST(&dev->children);
+ child; child = TAILQ_NEXT(child, link))
+ DEVICE_SHUTDOWN(child);
+
+ return 0;
+}
+
+int
+bus_generic_suspend(device_t dev)
+{
+ int error;
+ device_t child, child2;
+
+ for (child = TAILQ_FIRST(&dev->children);
+ child; child = TAILQ_NEXT(child, link)) {
+ error = DEVICE_SUSPEND(child);
+ if (error) {
+ for (child2 = TAILQ_FIRST(&dev->children);
+ child2 && child2 != child;
+ child2 = TAILQ_NEXT(child2, link))
+ DEVICE_RESUME(child2);
+ return (error);
+ }
+ }
+ return 0;
+}
+
+int
+bus_generic_resume(device_t dev)
+{
+ device_t child;
+
+ for (child = TAILQ_FIRST(&dev->children);
+ child; child = TAILQ_NEXT(child, link)) {
+ DEVICE_RESUME(child);
+ /* if resume fails, there's nothing we can usefully do... */
+ }
+ return 0;
+}
+
+void
+bus_generic_print_child(device_t dev, device_t child)
+{
+ printf(" on %s%d", device_get_name(dev), device_get_unit(dev));
+}
+
+int
+bus_generic_read_ivar(device_t dev, device_t child, int index,
+ uintptr_t * result)
+{
+ return ENOENT;
+}
+
+int
+bus_generic_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value)
+{
+ return ENOENT;
+}
+
+int
+bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
+ driver_intr_t *intr, void *arg, void **cookiep)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_SETUP_INTR(dev->parent, child, irq, intr, arg,
+ cookiep));
+ else
+ return (EINVAL);
+}
+
+int
+bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
+ else
+ return (EINVAL);
+}
+
+struct resource *
+bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ u_long start, u_long end, u_long count, u_int flags)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
+ start, end, count, flags));
+ else
+ return (NULL);
+}
+
+int
+bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ else
+ return (EINVAL);
+}
+
+int
+bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ else
+ return (EINVAL);
+}
+
+int
+bus_generic_deactivate_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ else
+ return (EINVAL);
+}
+
+/*
+ * Some convenience functions to make it easier for drivers to use the
+ * resource-management functions. All these really do is hide the
+ * indirection through the parent's method table, making for slightly
+ * less-wordy code. In the future, it might make sense for this code
+ * to maintain some sort of a list of resources allocated by each device.
+ */
+struct resource *
+bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+{
+ if (dev->parent == 0)
+ return (0);
+ return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
+ count, flags));
+}
+
+int
+bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+int
+bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+int
+bus_release_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == 0)
+ return (EINVAL);
+ return (BUS_RELEASE_RESOURCE(dev->parent, dev,
+ type, rid, r));
+}
+
+static void
+root_print_child(device_t dev, device_t child)
+{
+}
+
+static int
+root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg,
+ void **cookiep)
+{
+ /*
+ * If an interrupt mapping gets to here something bad has happened.
+ */
+ panic("root_setup_intr");
+}
+
+static device_method_t root_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_print_child, root_print_child),
+ DEVMETHOD(bus_read_ivar, bus_generic_read_ivar),
+ DEVMETHOD(bus_write_ivar, bus_generic_write_ivar),
+ DEVMETHOD(bus_setup_intr, root_setup_intr),
+
+ { 0, 0 }
+};
+
+static driver_t root_driver = {
+ "root",
+ root_methods,
+ DRIVER_TYPE_MISC,
+ 1, /* no softc */
+};
+
+device_t root_bus;
+devclass_t root_devclass;
+
+static int
+root_bus_module_handler(module_t mod, int what, void* arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ compile_methods(&root_driver);
+ root_bus = make_device(NULL, "root", 0, NULL);
+ root_bus->desc = "System root bus";
+ root_bus->ops = root_driver.ops;
+ root_bus->driver = &root_driver;
+ root_bus->state = DS_ATTACHED;
+ root_devclass = devclass_find_internal("root", FALSE);
+ return 0;
+ }
+
+ return 0;
+}
+
+static moduledata_t root_bus_mod = {
+ "rootbus",
+ root_bus_module_handler,
+ 0
+};
+DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+
+void
+root_bus_configure(void)
+{
+ device_t dev;
+
+ PDEBUG(("."));
+
+ for (dev = TAILQ_FIRST(&root_bus->children); dev;
+ dev = TAILQ_NEXT(dev, link)) {
+ device_probe_and_attach(dev);
+ }
+}
+
+int
+driver_module_handler(module_t mod, int what, void *arg)
+{
+ int error, i;
+ struct driver_module_data *dmd;
+ devclass_t bus_devclass;
+
+ dmd = (struct driver_module_data *)arg;
+ bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE);
+ error = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ for (i = 0; !error && i < dmd->dmd_ndrivers; i++) {
+ PDEBUG(("Loading module: driver %s on bus %s",
+ DRIVERNAME(dmd->dmd_drivers[i]),
+ dmd->dmd_busname));
+ error = devclass_add_driver(bus_devclass,
+ dmd->dmd_drivers[i]);
+ }
+ if (error)
+ break;
+
+ /*
+ * The drivers loaded in this way are assumed to all
+ * implement the same devclass.
+ */
+ *dmd->dmd_devclass =
+ devclass_find_internal(dmd->dmd_drivers[0]->name,
+ TRUE);
+ break;
+
+ case MOD_UNLOAD:
+ for (i = 0; !error && i < dmd->dmd_ndrivers; i++) {
+ PDEBUG(("Unloading module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_drivers[i]),
+ dmd->dmd_busname));
+ error = devclass_delete_driver(bus_devclass,
+ dmd->dmd_drivers[i]);
+ }
+ break;
+ }
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod, what, dmd->dmd_chainarg);
+ return (error);
+}
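+
+/*
+ * Drivers normally reach this handler through the DRIVER_MODULE()
+ * macro rather than calling it directly. A sketch (foo, foo_driver
+ * and foo_devclass are hypothetical names) for a driver attaching to
+ * the pci bus:
+ *
+ *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, 0, 0);
+ *
+ * which declares a driver_module_data with this function as the
+ * module event handler and registers foo_driver with the "pci"
+ * devclass.
+ */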
+
+#ifdef BUS_DEBUG
+
+/* the _short versions avoid iteration by not calling anything that prints
+ * more than oneliners. I love oneliners.
+ */
+
+static void
+print_method_list(device_method_t *m, int indent)
+{
+ int i;
+
+ if (!m)
+ return;
+
+ for (i = 0; m->desc; i++, m++)
+ indentprintf(("method %d: %s, offset=%d\n",
+ i, m->desc->name, m->desc->offset));
+}
+
+static void
+print_device_ops(device_ops_t ops, int indent)
+{
+ int i;
+ int count = 0;
+
+ if (!ops)
+ return;
+
+	/* We present a list of the methods that point to the
+	 * error_method, but ignore the 0'th element; it is always
+	 * error_method.
+ */
+ for (i = 1; i < ops->maxoffset; i++) {
+ if (ops->methods[i] == error_method) {
+ if (count == 0)
+ indentprintf(("error_method:"));
+ printf(" %d", i);
+ count++;
+ }
+ }
+ if (count)
+ printf("\n");
+
+ indentprintf(("(%d method%s, %d valid, %d error_method%s)\n",
+ ops->maxoffset-1, (ops->maxoffset-1 == 1? "":"s"),
+ ops->maxoffset-1-count,
+		count, (count == 1? "":"s")));
+}
+
+static void
+print_device_short(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%sivars,%ssoftc,busy=%d\n",
+ dev->unit, dev->desc,
+ (dev->parent? "":"no "),
+ (TAILQ_EMPTY(&dev->children)? "no ":""),
+ (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
+ (dev->flags&DF_FIXEDCLASS? "fixed,":""),
+ (dev->flags&DF_WILDCARD? "wildcard,":""),
+ (dev->ivars? "":"no "),
+ (dev->softc? "":"no "),
+ dev->busy));
+}
+
+static void
+print_device(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ indentprintf(("Parent:\n"));
+ print_device_short(dev->parent, indent+1);
+ indentprintf(("Methods:\n"));
+ print_device_ops(dev->ops, indent+1);
+ indentprintf(("Driver:\n"));
+ print_driver_short(dev->driver, indent+1);
+ indentprintf(("Devclass:\n"));
+ print_devclass_short(dev->devclass, indent+1);
+}
+
+void
+print_device_tree_short(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ for (child = TAILQ_FIRST(&dev->children); child;
+ child = TAILQ_NEXT(child, link))
+ print_device_tree_short(child, indent+1);
+}
+
+void
+print_device_tree(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device(dev, indent);
+
+ for (child = TAILQ_FIRST(&dev->children); child;
+ child = TAILQ_NEXT(child, link))
+ print_device_tree(child, indent+1);
+}
+
+static void
+print_driver_short(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ indentprintf(("driver %s: type = %s%s%s%s, softc size = %d\n",
+ driver->name,
+ /* yes, I know this looks silly, but going to bed at
+ * two o'clock and having to get up at 7:30 again is silly
+ * as well. As is sticking your head in a bucket of water.
+ */
+ (driver->type == DRIVER_TYPE_TTY? "tty":""),
+ (driver->type == DRIVER_TYPE_BIO? "bio":""),
+ (driver->type == DRIVER_TYPE_NET? "net":""),
+ (driver->type == DRIVER_TYPE_MISC? "misc":""),
+ driver->softc));
+}
+
+static void
+print_driver(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ print_driver_short(driver, indent);
+ indentprintf(("Methods:\n"));
+ print_method_list(driver->methods, indent+1);
+ indentprintf(("Operations:\n"));
+ print_device_ops(driver->ops, indent+1);
+}
+
+
+static void
+print_driver_list(driver_list_t drivers, int indent)
+{
+ driver_t *driver;
+
+ for (driver = TAILQ_FIRST(&drivers); driver;
+ driver = TAILQ_NEXT(driver, link))
+ print_driver(driver, indent);
+}
+
+static void
+print_devclass_short(devclass_t dc, int indent)
+{
+	if (!dc)
+ return;
+
+ indentprintf(("devclass %s: max units = %d, next unit = %d\n",
+ dc->name, dc->maxunit, dc->nextunit));
+}
+
+static void
+print_devclass(devclass_t dc, int indent)
+{
+ int i;
+
+	if (!dc)
+ return;
+
+ print_devclass_short(dc, indent);
+ indentprintf(("Drivers:\n"));
+ print_driver_list(dc->drivers, indent+1);
+
+ indentprintf(("Devices:\n"));
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ print_device(dc->devices[i], indent+1);
+}
+
+void
+print_devclass_list_short(void)
+{
+ devclass_t dc;
+
+ printf("Short listing of devclasses, drivers & devices:\n");
+ for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link))
+ print_devclass_short(dc, 0);
+}
+
+void
+print_devclass_list(void)
+{
+ devclass_t dc;
+
+ printf("Full listing of devclasses, drivers & devices:\n");
+ for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link))
+ print_devclass(dc, 0);
+}
+
+#endif
diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c
new file mode 100644
index 0000000..593d00c
--- /dev/null
+++ b/sys/kern/subr_clist.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $
+ */
+
+/*
+ * clist support routines
+ */
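+
+/*
+ * A clist is a chain of fixed-size cblocks, each holding CBSIZE data
+ * bytes (c_info[]), a per-byte quote bitmap (c_quote[]) and a link
+ * (c_next).  c_cf points at the first queued character and c_cl one
+ * past the last.  cblocks are laid out on (CROUND + 1)-byte
+ * boundaries, so masking a character pointer with ~CROUND recovers
+ * the containing cblock throughout this file.
+ */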
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/tty.h>
+#include <sys/clist.h>
+
+static void clist_init __P((void *));
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
+
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc __P((void));
+static void cblock_alloc_cblocks __P((int number));
+static void cblock_free __P((struct cblock *cblockp));
+static void cblock_free_cblocks __P((int number));
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
+{
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount,
+ cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE);
+}
+#endif /* DDB */
+
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
+ /*
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
+ */
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
+}
+
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static __inline struct cblock *
+cblock_alloc()
+{
+ struct cblock *cblockp;
+
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
+}
+
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static __inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
+{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
+
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
+}
+
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
+{
+ int dcbr;
+
+ /*
+ * Allow for wasted space at the head.
+ */
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
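+	/*
+	 * Illustrative (CBSIZE is configuration-dependent): with
+	 * CBSIZE == 64, ccmax == 100 becomes 163, and
+	 * roundup(163, 64) / 64 == 3 cblocks -- enough for 100 chars
+	 * even if the first one lands on a cblock's last byte.
+	 */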
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
+}
+
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
+void
+clist_free_cblocks(clistp)
+ struct clist *clistp;
+{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
+
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
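+			/*
+			 * If the count (after this release) still covers
+			 * the reservation, the freed cblock was a loan
+			 * from the shared slush pool; credit it back.
+			 */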
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
+}
+
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist.  Return -1 if no
+ * more cblocks are available, or 0 on success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
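+			/*
+			 * c_cl points one byte past the cblock just
+			 * filled, so the masked value is the address
+			 * right after it and (cblockp - 1) is the header
+			 * of the filled cblock, which gets linked to the
+			 * newly allocated cblock below.
+			 */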
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
+
+ /*
+ * If this character is quoted, set the quote bit, if not, clear it.
+ */
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
+ return (0);
+}
+
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+	u_char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
+
+ /*
+ * Avoid allocating an initial cblock and then not using it.
+	 * c_cc == 0 must imply c_cbcount == 0.
+ */
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+		 * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
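+			/*
+			 * Illustrative: for startbit == 3 and endbit == 13,
+			 * startmask == 0xff >> 5 == 0x07 keeps bits 0-2 of
+			 * the first byte, endmask == 0xff << 6 == 0xc0
+			 * keeps bits 6-7 of the last byte, and any whole
+			 * bytes in between are simply zeroed.
+			 */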
+
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so by adding one (cblock),
+ * (which makes the pointer 1 beyond the end of this
+ * cblock) we prepare for the assignment of 'prev'
+ * above.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
+}
+
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
+char *
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
+{
+ struct cblock *cblockp;
+
+ ++cp;
+ /*
+ * See if the next character is beyond the end of
+ * the clist.
+ */
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((intptr_t)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((intptr_t)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+		*dst = (u_char)*cp | (isset(cblockp->c_quote,
+		    cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
+{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
+
+ /*
+ * If there are no more characters on the list, then
+ * free the last cblock.
+ */
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
+void
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
+{
+ int chr, s;
+
+ s = spltty();
+ /*
+	 * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
+
+ /*
+ * XXX This should probably be optimized to more than one
+ * character at a time.
+ */
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
+}
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
new file mode 100644
index 0000000..5fcf88e
--- /dev/null
+++ b/sys/kern/subr_devstat.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 1997, 1998 Kenneth D. Merry.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: subr_devstat.c,v 1.7 1998/12/04 22:54:51 archie Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+#include <sys/devicestat.h>
+
+static int devstat_num_devs;
+static long devstat_generation;
+static int devstat_version = DEVSTAT_VERSION;
+static int devstat_current_devnumber;
+
+STAILQ_HEAD(devstatlist, devstat) device_statq;
+
+/*
+ * Take a malloced and zeroed devstat structure given to us, fill it in
+ * and add it to the queue of devices.
+ */
+void
+devstat_add_entry(struct devstat *ds, const char *dev_name,
+ int unit_number, u_int32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type)
+{
+ int s;
+ struct devstatlist *devstat_head;
+
+ if (ds == NULL)
+ return;
+
+ if (devstat_num_devs == 0)
+ STAILQ_INIT(&device_statq);
+
+ devstat_generation++;
+ devstat_num_devs++;
+
+ devstat_head = &device_statq;
+
+ STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
+
+ ds->device_number = devstat_current_devnumber++;
+ ds->unit_number = unit_number;
+ strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
+ ds->device_name[DEVSTAT_NAME_LEN - 1] = 0;
+ ds->block_size = block_size;
+ ds->flags = flags;
+ ds->device_type = device_type;
+
+ s = splclock();
+ getmicrotime(&ds->dev_creation_time);
+ splx(s);
+}
+
+/*
+ * Remove a devstat structure from the list of devices.
+ */
+void
+devstat_remove_entry(struct devstat *ds)
+{
+ struct devstatlist *devstat_head;
+
+ if (ds == NULL)
+ return;
+
+ devstat_generation++;
+ devstat_num_devs--;
+
+ devstat_head = &device_statq;
+
+ /* Remove this entry from the devstat queue */
+ STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
+}
+
+/*
+ * Record a transaction start.
+ */
+void
+devstat_start_transaction(struct devstat *ds)
+{
+ int s;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ /*
+ * We only want to set the start time when we are going from idle
+ * to busy. The start time is really the start of the latest busy
+ * period.
+ */
+ if (ds->busy_count == 0) {
+ s = splclock();
+ getmicrouptime(&ds->start_time);
+ splx(s);
+ }
+ ds->busy_count++;
+}
+
+/*
+ * Record the ending of a transaction, and increment the various counters.
+ */
+void
+devstat_end_transaction(struct devstat *ds, u_int32_t bytes,
+ devstat_tag_type tag_type, devstat_trans_flags flags)
+{
+ int s;
+ struct timeval busy_time;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ s = splclock();
+ getmicrouptime(&ds->last_comp_time);
+ splx(s);
+
+ ds->busy_count--;
+
+ /*
+ * There might be some transactions (DEVSTAT_NO_DATA) that don't
+ * transfer any data.
+ */
+ if (flags == DEVSTAT_READ) {
+ ds->bytes_read += bytes;
+ ds->num_reads++;
+ } else if (flags == DEVSTAT_WRITE) {
+ ds->bytes_written += bytes;
+ ds->num_writes++;
+ } else
+ ds->num_other++;
+
+ /*
+ * Keep a count of the various tag types sent.
+ */
+ if (tag_type != DEVSTAT_TAG_NONE)
+ ds->tag_types[tag_type]++;
+
+ /*
+ * We only update the busy time when we go idle. Otherwise, this
+ * calculation would require many more clock cycles.
+ */
+ if (ds->busy_count == 0) {
+ /* Calculate how long we were busy */
+ busy_time = ds->last_comp_time;
+ timevalsub(&busy_time, &ds->start_time);
+
+ /* Add our busy time to the total busy time. */
+ timevaladd(&ds->busy_time, &busy_time);
+ } else if (ds->busy_count < 0)
+ printf("devstat_end_transaction: HELP!! busy_count "
+ "for %s%d is < 0 (%d)!\n", ds->device_name,
+ ds->unit_number, ds->busy_count);
+}
+
+/*
+ * This is the sysctl handler for the devstat package. The data pushed out
+ * on the kern.devstat.all sysctl variable consists of the current devstat
+ * generation number, and then an array of devstat structures, one for each
+ * device in the system.
+ *
+ * I'm really not too fond of this method of doing things, but there really
+ * aren't that many alternatives. We must have some method of making sure
+ * that the generation number the user gets corresponds with the data the
+ * user gets. If the user makes a separate sysctl call to get the
+ * generation, and then a sysctl call to get the device statistics, the
+ * device list could have changed in that brief period of time. By
+ * supplying the generation number along with the statistics output, we can
+ * guarantee that the generation number and the statistics match up.
+ */
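+/*
+ * Illustrative userland consumer (a sketch, not part of the kernel):
+ *
+ *	int n;
+ *	long gen;
+ *	size_t len;
+ *	char *buf;
+ *
+ *	len = sizeof(n);
+ *	sysctlbyname("kern.devstat.numdevs", &n, &len, NULL, 0);
+ *	len = sizeof(gen) + n * sizeof(struct devstat);
+ *	buf = malloc(len);
+ *	sysctlbyname("kern.devstat.all", buf, &len, NULL, 0);
+ *	gen = *(long *)buf;
+ *
+ * The devstat array follows the generation number; if a later read of
+ * kern.devstat.generation disagrees with gen, the device list changed
+ * and the buffer should be re-fetched.
+ */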
+static int
+sysctl_devstat SYSCTL_HANDLER_ARGS
+{
+ int error, i;
+ struct devstat *nds;
+ struct devstatlist *devstat_head;
+
+ if (devstat_num_devs == 0)
+ return(EINVAL);
+
+ error = 0;
+ devstat_head = &device_statq;
+
+ /*
+ * First push out the generation number.
+ */
+ error = SYSCTL_OUT(req, &devstat_generation, sizeof(long));
+
+ /*
+ * Now push out all the devices.
+ */
+	for (i = 0, nds = STAILQ_FIRST(devstat_head);
+	    (nds != NULL) && (i < devstat_num_devs) && (error == 0);
+	    nds = STAILQ_NEXT(nds, dev_links), i++)
+ error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
+
+ return(error);
+}
+
+/*
+ * Sysctl entries for devstat. The first one is a node that all the rest
+ * hang off of.
+ */
+SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics");
+
+SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
+ 0, 0, sysctl_devstat, "S,devstat", "All Devices");
+/*
+ * Export the number of devices in the system so that userland utilities
+ * can determine how much memory to allocate to hold all the devices.
+ */
+SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs,
+ 0, "Number of devices in the devstat list");
+SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
+ &devstat_generation, "Devstat list generation");
+SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version,
+ 0, "Devstat list version number");
diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c
new file mode 100644
index 0000000..33f1d2a
--- /dev/null
+++ b/sys/kern/subr_disklabel.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
+ * $Id: ufs_disksubr.c,v 1.38 1998/10/17 07:49:04 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/syslog.h>
+
+/*
+ * Seek sort for disks.
+ *
+ * The buf_queue keeps two queues, sorted in ascending block order.  The first
+ * queue holds those requests which are positioned after the current block
+ * (in the first request); the second, which starts at queue->switch_point,
+ * holds requests which came in after their block number was passed. Thus
+ * we implement a one way scan, retracting after reaching the end of the drive
+ * to the first request on the second queue, at which time it becomes the
+ * first queue.
+ *
+ * A one-way scan is natural because of the way UNIX read-ahead blocks are
+ * allocated.
+ */
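+/*
+ * Illustrative queue state: with last_pblkno == 100 the queue might
+ * hold  120 150 180 | 40 70, where "|" marks switch_point.  A new
+ * request at pblkno 130 is sorted into the first list; one at 60,
+ * which the head has already passed, goes into the second list.
+ */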
+
+void
+bufqdisksort(bufq, bp)
+ struct buf_queue_head *bufq;
+ struct buf *bp;
+{
+ struct buf *bq;
+ struct buf *bn;
+ struct buf *be;
+
+ be = TAILQ_LAST(&bufq->queue, buf_queue);
+ /*
+ * If the queue is empty or we are an
+ * ordered transaction, then it's easy.
+ */
+ if ((bq = bufq_first(bufq)) == NULL
+ || (bp->b_flags & B_ORDERED) != 0) {
+ bufq_insert_tail(bufq, bp);
+ return;
+ } else if (bufq->insert_point != NULL) {
+
+ /*
+ * A certain portion of the list is
+ * "locked" to preserve ordering, so
+ * we can only insert after the insert
+ * point.
+ */
+ bq = bufq->insert_point;
+ } else {
+
+ /*
+ * If we lie before the last removed (currently active)
+ * request, and are not inserting ourselves into the
+ * "locked" portion of the list, then we must add ourselves
+ * to the second request list.
+ */
+ if (bp->b_pblkno < bufq->last_pblkno) {
+
+ bq = bufq->switch_point;
+ /*
+ * If we are starting a new secondary list,
+ * then it's easy.
+ */
+ if (bq == NULL) {
+ bufq->switch_point = bp;
+ bufq_insert_tail(bufq, bp);
+ return;
+ }
+ /*
+ * If we lie ahead of the current switch point,
+ * insert us before the switch point and move
+ * the switch point.
+ */
+ if (bp->b_pblkno < bq->b_pblkno) {
+ bufq->switch_point = bp;
+ TAILQ_INSERT_BEFORE(bq, bp, b_act);
+ return;
+ }
+ } else {
+ if (bufq->switch_point != NULL)
+ be = TAILQ_PREV(bufq->switch_point,
+ buf_queue, b_act);
+ /*
+ * If we lie between last_pblkno and bq,
+ * insert before bq.
+ */
+ if (bp->b_pblkno < bq->b_pblkno) {
+ TAILQ_INSERT_BEFORE(bq, bp, b_act);
+ return;
+ }
+ }
+ }
+
+ /*
+ * Request is at/after our current position in the list.
+ * Optimize for sequential I/O by seeing if we go at the tail.
+ */
+ if (bp->b_pblkno > be->b_pblkno) {
+ TAILQ_INSERT_AFTER(&bufq->queue, be, bp, b_act);
+ return;
+ }
+
+ /* Otherwise, insertion sort */
+ while ((bn = TAILQ_NEXT(bq, b_act)) != NULL) {
+
+ /*
+ * We want to go after the current request if it is the end
+		 * of the first request list, or if the next request is at a
+		 * larger block number than our request.
+ */
+ if (bn == bufq->switch_point
+ || bp->b_pblkno < bn->b_pblkno)
+ break;
+ bq = bn;
+ }
+ TAILQ_INSERT_AFTER(&bufq->queue, bq, bp, b_act);
+}
+
+
+/*
+ * Attempt to read a disk label from a device using the indicated strategy
+ * routine. The label must be partly set up before this: secpercyl, secsize
+ * and anything required in the strategy routine (e.g., dummy bounds for the
+ * partition containing the label) must be filled in before calling us.
+ * Returns NULL on success and an error string on failure.
+ */
+char *
+readdisklabel(dev, strat, lp)
+ dev_t dev;
+ d_strategy_t *strat;
+ register struct disklabel *lp;
+{
+ register struct buf *bp;
+ struct disklabel *dlp;
+ char *msg = NULL;
+
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp))
+ msg = "I/O error";
+ else for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)((char *)bp->b_data +
+ lp->d_secsize - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) {
+ if (msg == NULL)
+ msg = "no disk label";
+ } else if (dlp->d_npartitions > MAXPARTITIONS ||
+ dkcksum(dlp) != 0)
+ msg = "disk label corrupted";
+ else {
+ *lp = *dlp;
+ msg = NULL;
+ break;
+ }
+ }
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (msg);
+}
+
+/*
+ * Check new disk label for sensibility before setting it.
+ */
+int
+setdisklabel(olp, nlp, openmask)
+ register struct disklabel *olp, *nlp;
+ u_long openmask;
+{
+ register int i;
+ register struct partition *opp, *npp;
+
+ /*
+ * Check it is actually a disklabel we are looking at.
+ */
+ if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
+ dkcksum(nlp) != 0)
+ return (EINVAL);
+ /*
+ * For each partition that we think is open,
+ */
+ while ((i = ffs((long)openmask)) != 0) {
+ i--;
+ /*
+ * Check it is not changing....
+ */
+ openmask &= ~(1 << i);
+ if (nlp->d_npartitions <= i)
+ return (EBUSY);
+ opp = &olp->d_partitions[i];
+ npp = &nlp->d_partitions[i];
+ if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size)
+ return (EBUSY);
+ /*
+ * Copy internally-set partition information
+ * if new label doesn't include it. XXX
+ * (If we are using it then we had better stay the same type)
+ * This is possibly dubious, as someone else noted (XXX)
+ */
+ if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
+ npp->p_fstype = opp->p_fstype;
+ npp->p_fsize = opp->p_fsize;
+ npp->p_frag = opp->p_frag;
+ npp->p_cpg = opp->p_cpg;
+ }
+ }
+ nlp->d_checksum = 0;
+ nlp->d_checksum = dkcksum(nlp);
+ *olp = *nlp;
+ return (0);
+}
+
+/*
+ * Write disk label back to device after modification.
+ */
+int
+writedisklabel(dev, strat, lp)
+ dev_t dev;
+ d_strategy_t *strat;
+ register struct disklabel *lp;
+{
+ struct buf *bp;
+ struct disklabel *dlp;
+ int error = 0;
+
+ if (lp->d_partitions[RAW_PART].p_offset != 0)
+ return (EXDEV); /* not quite right */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dev, RAW_PART);
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+#if 1
+ /*
+ * We read the label first to see if it's there,
+ * in which case we will put ours at the same offset into the block..
+ * (I think this is stupid [Julian])
+ * Note that you can't write a label out over a corrupted label!
+ * (also stupid.. how do you write the first one? by raw writes?)
+ */
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ error = biowait(bp);
+ if (error)
+ goto done;
+ for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)
+ ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC &&
+ dkcksum(dlp) == 0) {
+ *dlp = *lp;
+ bp->b_flags &= ~(B_DONE | B_READ);
+ bp->b_flags |= B_BUSY | B_WRITE;
+#ifdef __alpha__
+ alpha_fix_srm_checksum(bp);
+#endif
+ (*strat)(bp);
+ error = biowait(bp);
+ goto done;
+ }
+ }
+ error = ESRCH;
+done:
+#else
+ bzero(bp->b_data, lp->d_secsize);
+ dlp = (struct disklabel *)bp->b_data;
+ *dlp = *lp;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_WRITE;
+ (*strat)(bp);
+ error = biowait(bp);
+#endif
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (error);
+}
+
+/*
+ * Compute checksum for disk label.
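+ * The d_checksum field is set (cf. setdisklabel()) so that all the
+ * 16-bit words of a valid label, up to the last used partition entry,
+ * XOR to zero; callers therefore treat a nonzero result as corruption.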
+ */
+u_int
+dkcksum(lp)
+ register struct disklabel *lp;
+{
+ register u_short *start, *end;
+ register u_short sum = 0;
+
+ start = (u_short *)lp;
+ end = (u_short *)&lp->d_partitions[lp->d_npartitions];
+ while (start < end)
+ sum ^= *start++;
+ return (sum);
+}
+
+/*
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+
+hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
+
+ * if the offset of the error in the transfer and a disk label
+ * are both available. blkdone should be -1 if the position of the error
+ * is unknown; the disklabel pointer may be null from drivers that have not
+ * been converted to use them. The message is printed with printf
+ * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
+ * The message should be completed (with at least a newline) with printf
+ * or addlog, respectively. There is no trailing space.
+ */
+void
+diskerr(bp, dname, what, pri, blkdone, lp)
+ register struct buf *bp;
+ char *dname, *what;
+ int pri, blkdone;
+ register struct disklabel *lp;
+{
+ int unit = dkunit(bp->b_dev);
+ int slice = dkslice(bp->b_dev);
+ int part = dkpart(bp->b_dev);
+ register int (*pr) __P((const char *, ...));
+ char partname[2];
+ char *sname;
+ daddr_t sn;
+
+ if (pri != LOG_PRINTF) {
+ log(pri, "%s", "");
+ pr = addlog;
+ } else
+ pr = printf;
+ sname = dsname(dname, unit, slice, part, partname);
+ (*pr)("%s%s: %s %sing fsbn ", sname, partname, what,
+ bp->b_flags & B_READ ? "read" : "writ");
+ sn = bp->b_blkno;
+ if (bp->b_bcount <= DEV_BSIZE)
+ (*pr)("%ld", (long)sn);
+ else {
+ if (blkdone >= 0) {
+ sn += blkdone;
+ (*pr)("%ld of ", (long)sn);
+ }
+ (*pr)("%ld-%ld", (long)bp->b_blkno,
+ (long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE));
+ }
+ if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
+#ifdef tahoe
+ sn *= DEV_BSIZE / lp->d_secsize; /* XXX */
+#endif
+ sn += lp->d_partitions[part].p_offset;
+ /*
+ * XXX should add slice offset and not print the slice,
+ * but we don't know the slice pointer.
+ * XXX should print bp->b_pblkno so that this will work
+ * independent of slices, labels and bad sector remapping,
+ * but some drivers don't set bp->b_pblkno.
+ */
+ (*pr)(" (%s bn %ld; cn %ld", sname, (long)sn,
+ (long)(sn / lp->d_secpercyl));
+ sn %= (long)lp->d_secpercyl;
+ (*pr)(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors),
+ (long)(sn % lp->d_nsectors));
+ }
+}
diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c
new file mode 100644
index 0000000..adfd39c
--- /dev/null
+++ b/sys/kern/subr_diskmbr.c
@@ -0,0 +1,445 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id: diskslice_machdep.c,v 1.31 1998/08/10 07:22:14 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#define DOSPTYP_EXTENDED 5
+#define DOSPTYP_EXTENDEDX 15
+#define DOSPTYP_ONTRACK 84
+#include <sys/diskslice.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#define TRACE(str) do { if (dsi_debug) printf str; } while (0)
+
+static volatile u_char dsi_debug;
+
+static struct dos_partition historical_bogus_partition_table[NDOSPART] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, },
+};
+
+static int check_part __P((char *sname, struct dos_partition *dp,
+ u_long offset, int nsectors, int ntracks,
+ u_long mbr_offset));
+static void extended __P((char *dname, dev_t dev, d_strategy_t *strat,
+ struct disklabel *lp, struct diskslices *ssp,
+ u_long ext_offset, u_long ext_size,
+ u_long base_ext_offset, int nsectors, int ntracks,
+ u_long mbr_offset));
+
+static int
+check_part(sname, dp, offset, nsectors, ntracks, mbr_offset)
+ char *sname;
+ struct dos_partition *dp;
+ u_long offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+{
+ int chs_ecyl;
+ int chs_esect;
+ int chs_scyl;
+ int chs_ssect;
+ int error;
+ u_long esector;
+ u_long esector1;
+ u_long secpercyl;
+ u_long ssector;
+ u_long ssector1;
+
+ secpercyl = (u_long)nsectors * ntracks;
+ chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect);
+ chs_ssect = DPSECT(dp->dp_ssect);
+ ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl
+ + mbr_offset;
+ ssector1 = offset + dp->dp_start;
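+	/*
+	 * Illustrative: with nsectors == 63 and ntracks == 16,
+	 * secpercyl == 1008, so C/H/S 2/3/4 yields
+	 * (4 - 1) + 3 * 63 + 2 * 1008 == 2208 (plus mbr_offset),
+	 * which is then compared against the absolute start sector
+	 * recorded in the partition entry.
+	 */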
+
+ /*
+ * If ssector1 is on a cylinder >= 1024, then ssector can't be right.
+ * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct
+ * apart from the cylinder being reduced modulo 1024. Always allow
+ * 1023/255/63.
+ */
+ if (ssector < ssector1
+ && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1
+ && chs_scyl == 1023)
+ || (secpercyl != 0
+ && (ssector1 - ssector) % (1024 * secpercyl) == 0))
+ || (dp->dp_scyl == 255 && dp->dp_shd == 255
+ && dp->dp_ssect == 255)) {
+ TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1));
+ ssector = ssector1;
+ }
+
+ chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect);
+ chs_esect = DPSECT(dp->dp_esect);
+ esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl
+ + mbr_offset;
+ esector1 = ssector1 + dp->dp_size - 1;
+
+ /* Allow certain bogus C/H/S values for esector, as above. */
+ if (esector < esector1
+ && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1
+ && chs_ecyl == 1023)
+ || (secpercyl != 0
+ && (esector1 - esector) % (1024 * secpercyl) == 0))
+ || (dp->dp_ecyl == 255 && dp->dp_ehd == 255
+ && dp->dp_esect == 255)) {
+ TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1));
+ esector = esector1;
+ }
+
+ error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL;
+ if (bootverbose)
+ printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n",
+ sname, dp->dp_typ, ssector1, esector1,
+ (u_long)dp->dp_size, error ? "" : ": OK");
+ if (ssector != ssector1 && bootverbose)
+ printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect,
+ ssector, ssector1);
+ if (esector != esector1 && bootverbose)
+ printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect,
+ esector, esector1);
+ return (error);
+}
+
+int
+dsinit(dname, dev, strat, lp, sspp)
+ char *dname;
+ dev_t dev;
+ d_strategy_t *strat;
+ struct disklabel *lp;
+ struct diskslices **sspp;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ struct dos_partition *dp0;
+ int error;
+ int max_ncyls;
+ int max_nsectors;
+ int max_ntracks;
+ u_long mbr_offset;
+ char partname[2];
+ u_long secpercyl;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ mbr_offset = DOSBBSECTOR;
+reread_mbr:
+ /* Read master boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ bp->b_blkno = mbr_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp) != 0) {
+ diskerr(bp, dname, "error reading primary partition table",
+ LOG_PRINTF, 0, (struct disklabel *)NULL);
+ printf("\n");
+ error = EIO;
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_data;
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART,
+ partname);
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ if (bootverbose)
+ printf("%s: invalid primary partition table: no magic\n",
+ sname);
+ error = EINVAL;
+ goto done;
+ }
+ dp0 = (struct dos_partition *)(cp + DOSPARTOFF);
+
+ /* Check for "Ontrack Diskmanager". */
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_typ == DOSPTYP_ONTRACK) {
+ if (bootverbose)
+ printf(
+ "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname);
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ mbr_offset = 63;
+ goto reread_mbr;
+ }
+ }
+
+ if (bcmp(dp0, historical_bogus_partition_table,
+ sizeof historical_bogus_partition_table) == 0) {
+ TRACE(("%s: invalid primary partition table: historical\n",
+ sname));
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Guess the geometry. */
+ /*
+ * TODO:
+ * Perhaps skip entries with 0 size.
+ * Perhaps only look at entries of type DOSPTYP_386BSD.
+ */
+ max_ncyls = 0;
+ max_nsectors = 0;
+ max_ntracks = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ int ncyls;
+ int nsectors;
+ int ntracks;
+
+ ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1;
+ if (max_ncyls < ncyls)
+ max_ncyls = ncyls;
+ nsectors = DPSECT(dp->dp_esect);
+ if (max_nsectors < nsectors)
+ max_nsectors = nsectors;
+ ntracks = dp->dp_ehd + 1;
+ if (max_ntracks < ntracks)
+ max_ntracks = ntracks;
+ }
+
+ /*
+ * Check that we have guessed the geometry right by checking the
+ * partition entries.
+ */
+ /*
+ * TODO:
+ * As above.
+ * Check for overlaps.
+ * Check against d_secperunit if the latter is reliable.
+ */
+ error = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart,
+ RAW_PART, partname);
+
+ /*
+ * Temporarily ignore errors from this check. We could
+		 * simplify things by accepting the table earlier if we
+ * always ignore errors here. Perhaps we should always
+ * accept the table if the magic is right but not let
+ * bad entries affect the geometry.
+ */
+ check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks,
+ mbr_offset);
+ }
+ if (error != 0)
+ goto done;
+
+ /*
+ * Accept the DOS partition table.
+ * First adjust the label (we have been careful not to change it
+ * before we can guarantee success).
+ */
+ secpercyl = (u_long)max_nsectors * max_ntracks;
+ if (secpercyl != 0) {
+ u_long secperunit;
+
+ lp->d_nsectors = max_nsectors;
+ lp->d_ntracks = max_ntracks;
+ lp->d_secpercyl = secpercyl;
+ secperunit = secpercyl * max_ncyls;
+ if (lp->d_secperunit < secperunit)
+ lp->d_secperunit = secperunit;
+ lp->d_ncylinders = lp->d_secperunit / secpercyl;
+ }
+
+ /*
+ * We are passed a pointer to a suitably initialized minimal
+ * slices "struct" with no dangling pointers in it. Replace it
+ * by a maximal one. This usually oversizes the "struct", but
+ * enlarging it while searching for logical drives would be
+ * inconvenient.
+ */
+ free(*sspp, M_DEVBUF);
+ ssp = dsmakeslicestruct(MAX_SLICES, lp);
+ *sspp = ssp;
+
+ /* Initialize normal slices. */
+ sp = &ssp->dss_slices[BASE_SLICE];
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) {
+ sp->ds_offset = mbr_offset + dp->dp_start;
+ sp->ds_size = dp->dp_size;
+ sp->ds_type = dp->dp_typ;
+#if 0
+ lp->d_subtype |= (lp->d_subtype & 3) | dospart
+ | DSTYPE_INDOSPART;
+#endif
+ }
+ ssp->dss_nslices = BASE_SLICE + NDOSPART;
+
+ /* Handle extended partitions. */
+ sp -= NDOSPART;
+ for (dospart = 0; dospart < NDOSPART; dospart++, sp++)
+ if (sp->ds_type == DOSPTYP_EXTENDED ||
+ sp->ds_type == DOSPTYP_EXTENDEDX)
+ extended(dname, bp->b_dev, strat, lp, ssp,
+ sp->ds_offset, sp->ds_size, sp->ds_offset,
+ max_nsectors, max_ntracks, mbr_offset);
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ if (error == EINVAL)
+ error = 0;
+ return (error);
+}
+
+static void
+extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset,
+	 nsectors, ntracks, mbr_offset)
+	char *dname;
+	dev_t dev;
+	d_strategy_t *strat;
+	struct disklabel *lp;
+ struct diskslices *ssp;
+ u_long ext_offset;
+ u_long ext_size;
+ u_long base_ext_offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ u_long ext_offsets[NDOSPART];
+ u_long ext_sizes[NDOSPART];
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+
+ /* Read extended boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = ext_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp) != 0) {
+ diskerr(bp, dname, "error reading extended partition table",
+ LOG_PRINTF, 0, (struct disklabel *)NULL);
+ printf("\n");
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_data;
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART,
+ partname);
+ if (bootverbose)
+ printf("%s: invalid extended partition table: no magic\n",
+ sname);
+ goto done;
+ }
+
+ for (dospart = 0,
+ dp = (struct dos_partition *)(bp->b_data + DOSPARTOFF),
+ slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice];
+ dospart < NDOSPART; dospart++, dp++) {
+ ext_sizes[dospart] = 0;
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ if (dp->dp_typ == DOSPTYP_EXTENDED ||
+ dp->dp_typ == DOSPTYP_EXTENDEDX) {
+ char buf[32];
+
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE,
+ RAW_PART, partname);
+ snprintf(buf, sizeof(buf), "%s", sname);
+ if (strlen(buf) < sizeof buf - 11)
+ strcat(buf, "<extended>");
+ check_part(buf, dp, base_ext_offset, nsectors,
+ ntracks, mbr_offset);
+ ext_offsets[dospart] = base_ext_offset + dp->dp_start;
+ ext_sizes[dospart] = dp->dp_size;
+ } else {
+ sname = dsname(dname, dkunit(dev), slice, RAW_PART,
+ partname);
+ check_part(sname, dp, ext_offset, nsectors, ntracks,
+ mbr_offset);
+ if (slice >= MAX_SLICES) {
+ printf("%s: too many slices\n", sname);
+ slice++;
+ continue;
+ }
+ sp->ds_offset = ext_offset + dp->dp_start;
+ sp->ds_size = dp->dp_size;
+ sp->ds_type = dp->dp_typ;
+ ssp->dss_nslices++;
+ slice++;
+ sp++;
+ }
+ }
+
+ /* If we found any more slices, recursively find all the subslices. */
+ for (dospart = 0; dospart < NDOSPART; dospart++)
+ if (ext_sizes[dospart] != 0)
+ extended(dname, dev, strat, lp, ssp,
+ ext_offsets[dospart], ext_sizes[dospart],
+ base_ext_offset, nsectors, ntracks,
+ mbr_offset);
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+}
diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c
new file mode 100644
index 0000000..fa0e4a4
--- /dev/null
+++ b/sys/kern/subr_diskslice.c
@@ -0,0 +1,1192 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)wd.c 7.2 (Berkeley) 5/9/91
+ * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id: subr_diskslice.c,v 1.60 1998/12/04 22:54:51 archie Exp $
+ */
+
+#include "opt_devfs.h"
+
+#include <stddef.h>
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/dkbad.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/stat.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <ufs/ffs/fs.h>
+
+#define TRACE(str) do { if (ds_debug) printf str; } while (0)
+
+typedef u_char bool_t;
+
+static volatile bool_t ds_debug;
+
+static struct disklabel *clone_label __P((struct disklabel *lp));
+static void dsiodone __P((struct buf *bp));
+static char *fixlabel __P((char *sname, struct diskslice *sp,
+ struct disklabel *lp, int writeflag));
+static void free_ds_label __P((struct diskslices *ssp, int slice));
+#ifdef DEVFS
+static void free_ds_labeldevs __P((struct diskslices *ssp, int slice));
+#endif
+static void partition_info __P((char *sname, int part, struct partition *pp));
+static void slice_info __P((char *sname, struct diskslice *sp));
+static void set_ds_bad __P((struct diskslices *ssp, int slice,
+ struct dkbad_intern *btp));
+static void set_ds_label __P((struct diskslices *ssp, int slice,
+ struct disklabel *lp));
+#ifdef DEVFS
+static void set_ds_labeldevs __P((char *dname, dev_t dev,
+ struct diskslices *ssp));
+static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev,
+ struct diskslices *ssp));
+#endif
+static void set_ds_wlabel __P((struct diskslices *ssp, int slice,
+ int wlabel));
+
+/*
+ * Duplicate a label for the whole disk, and initialize defaults in the
+ * copy for fields that are not already initialized. The caller only
+ * needs to initialize d_secsize and d_secperunit, and zero the fields
+ * that are to be defaulted.
+ */
+static struct disklabel *
+clone_label(lp)
+ struct disklabel *lp;
+{
+ struct disklabel *lp1;
+
+ lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
+ *lp1 = *lp;
+ lp = NULL;
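+	/* done with the source label; clear it to catch accidental reuse */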
+ if (lp1->d_typename[0] == '\0')
+ strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename));
+ if (lp1->d_packname[0] == '\0')
+ strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname));
+ if (lp1->d_nsectors == 0)
+ lp1->d_nsectors = 32;
+ if (lp1->d_ntracks == 0)
+ lp1->d_ntracks = 64;
+ lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks;
+ lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl;
+ if (lp1->d_rpm == 0)
+ lp1->d_rpm = 3600;
+ if (lp1->d_interleave == 0)
+ lp1->d_interleave = 1;
+ if (lp1->d_npartitions < RAW_PART + 1)
+ lp1->d_npartitions = MAXPARTITIONS;
+ if (lp1->d_bbsize == 0)
+ lp1->d_bbsize = BBSIZE;
+ if (lp1->d_sbsize == 0)
+ lp1->d_sbsize = SBSIZE;
+ lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit;
+ lp1->d_magic = DISKMAGIC;
+ lp1->d_magic2 = DISKMAGIC;
+ lp1->d_checksum = dkcksum(lp1);
+ return (lp1);
+}
+
+/*
+ * Determine the size of the transfer, and make sure it is
+ * within the boundaries of the partition. Adjust transfer
+ * if needed, and signal errors or early completion.
+ *
+ * XXX TODO:
+ * o Do bad sector remapping. May need to split buffer.
+ * o Split buffers that are too big for the device.
+ * o Check for overflow.
+ * o Finish cleaning this up.
+ */
+int
+dscheck(bp, ssp)
+ struct buf *bp;
+ struct diskslices *ssp;
+{
+ daddr_t blkno;
+ u_long endsecno;
+ daddr_t labelsect;
+ struct disklabel *lp;
+ char *msg;
+ long nsec;
+ struct partition *pp;
+ daddr_t secno;
+ daddr_t slicerel_secno;
+ struct diskslice *sp;
+ int s;
+
+ blkno = bp->b_blkno;
+ if (blkno < 0) {
+ printf("dscheck: negative b_blkno %ld\n", (long)blkno);
+ bp->b_error = EINVAL;
+ goto bad;
+ }
+ sp = &ssp->dss_slices[dkslice(bp->b_dev)];
+ lp = sp->ds_label;
+ if (ssp->dss_secmult == 1) {
+ if (bp->b_bcount % (u_long)DEV_BSIZE)
+ goto bad_bcount;
+ secno = blkno;
+ nsec = bp->b_bcount >> DEV_BSHIFT;
+ } else if (ssp->dss_secshift != -1) {
+ if (bp->b_bcount & (ssp->dss_secsize - 1))
+ goto bad_bcount;
+ if (blkno & (ssp->dss_secmult - 1))
+ goto bad_blkno;
+ secno = blkno >> ssp->dss_secshift;
+ nsec = bp->b_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
+ } else {
+ if (bp->b_bcount % ssp->dss_secsize)
+ goto bad_bcount;
+ if (blkno % ssp->dss_secmult)
+ goto bad_blkno;
+ secno = blkno / ssp->dss_secmult;
+ nsec = bp->b_bcount / ssp->dss_secsize;
+ }
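+	/*
+	 * Illustrative: on a device with 2048-byte sectors,
+	 * dss_secmult == 4 and dss_secshift == 2, so b_blkno 8
+	 * (in DEV_BSIZE units) maps to device sector 2 and a
+	 * 4096-byte b_bcount to nsec == 2.
+	 */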
+ if (lp == NULL) {
+ labelsect = -LABELSECTOR - 1;
+ endsecno = sp->ds_size;
+ slicerel_secno = secno;
+ } else {
+ labelsect = lp->d_partitions[LABEL_PART].p_offset;
+if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
+ pp = &lp->d_partitions[dkpart(bp->b_dev)];
+ endsecno = pp->p_size;
+ slicerel_secno = pp->p_offset + secno;
+ if (sp->ds_bad != NULL && ds_debug) {
+ daddr_t newsecno;
+
+ newsecno = transbad144(sp->ds_bad, slicerel_secno);
+ if (newsecno != slicerel_secno)
+ printf("should map bad sector %ld -> %ld\n",
+ (long)slicerel_secno, (long)newsecno);
+ }
+ }
+
+ /* overwriting disk label ? */
+ /* XXX should also protect bootstrap in first 8K */
+ if (slicerel_secno <= LABELSECTOR + labelsect &&
+#if LABELSECTOR != 0
+ slicerel_secno + nsec > LABELSECTOR + labelsect &&
+#endif
+ (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
+ bp->b_error = EROFS;
+ goto bad;
+ }
+
+#if defined(DOSBBSECTOR) && defined(notyet)
+ /* overwriting master boot record? */
+ if (slicerel_secno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
+ sp->ds_wlabel == 0) {
+ bp->b_error = EROFS;
+ goto bad;
+ }
+#endif
+
+ /* beyond partition? */
+ if (secno + nsec > endsecno) {
+ /* if exactly at end of disk, return an EOF */
+ if (secno == endsecno) {
+ bp->b_resid = bp->b_bcount;
+ return (0);
+ }
+ /* or truncate if part of it fits */
+ nsec = endsecno - secno;
+ if (nsec <= 0) {
+ bp->b_error = EINVAL;
+ goto bad;
+ }
+ bp->b_bcount = nsec * ssp->dss_secsize;
+ }
+
+ bp->b_pblkno = sp->ds_offset + slicerel_secno;
+
+ /*
+ * Snoop on label accesses if the slice offset is nonzero. Fudge
+ * offsets in the label to keep the in-core label coherent with
+ * the on-disk one.
+ */
+ if (slicerel_secno <= LABELSECTOR + labelsect
+#if LABELSECTOR != 0
+ && slicerel_secno + nsec > LABELSECTOR + labelsect
+#endif
+ && sp->ds_offset != 0) {
+ struct iodone_chain *ic;
+
+ ic = malloc(sizeof *ic, M_DEVBUF, M_WAITOK);
+ ic->ic_prev_flags = bp->b_flags;
+ ic->ic_prev_iodone = bp->b_iodone;
+ ic->ic_prev_iodone_chain = bp->b_iodone_chain;
+ ic->ic_args[0].ia_long = (LABELSECTOR + labelsect -
+ slicerel_secno) * ssp->dss_secsize;
+ ic->ic_args[1].ia_ptr = sp;
+ bp->b_flags |= B_CALL;
+ bp->b_iodone = dsiodone;
+ bp->b_iodone_chain = ic;
+ if (!(bp->b_flags & B_READ)) {
+ /*
+ * XXX even disklabel(8) writes directly so we need
+ * to adjust writes. Perhaps we should drop support
+ * for DIOCWLABEL (always write protect labels) and
+ * require the use of DIOCWDINFO.
+ *
+ * XXX probably need to copy the data to avoid even
+ * temporarily corrupting the in-core copy.
+ */
+ if (bp->b_vp != NULL) {
+ s = splbio();
+ bp->b_vp->v_numoutput++;
+ splx(s);
+ }
+ /* XXX need name here. */
+ msg = fixlabel((char *)NULL, sp,
+ (struct disklabel *)
+ (bp->b_data + ic->ic_args[0].ia_long),
+ TRUE);
+ if (msg != NULL) {
+ printf("%s\n", msg);
+ bp->b_error = EROFS;
+ goto bad;
+ }
+ }
+ }
+ return (1);
+
+bad_bcount:
+ printf("dscheck: b_bcount %ld is not on a sector boundary (ssize %d)\n",
+ bp->b_bcount, ssp->dss_secsize);
+ bp->b_error = EINVAL;
+ goto bad;
+
+bad_blkno:
+ printf("dscheck: b_blkno %ld is not on a sector boundary (ssize %d)\n",
+ (long)blkno, ssp->dss_secsize);
+ bp->b_error = EINVAL;
+ goto bad;
+
+bad:
+ bp->b_resid = bp->b_bcount;
+ bp->b_flags |= B_ERROR;
+ return (-1);
+}
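+
+/*
+ * Illustrative sketch, not compiled: how a driver strategy routine is
+ * expected to use dscheck().  The "xd" driver and its xd_slices array
+ * are invented; a return of 1 means start the transfer, while 0 or -1
+ * mean the buf is already finished (early completion or error, with
+ * b_error/b_resid set) and only needs biodone().
+ */
+#ifdef notdef
+static void
+xdstrategy(bp)
+ struct buf *bp;
+{
+ struct diskslices *ssp;
+
+ ssp = xd_slices[dkunit(bp->b_dev)];
+ if (dscheck(bp, ssp) <= 0) {
+ biodone(bp); /* b_error/b_resid were set by dscheck() */
+ return;
+ }
+ /* ... start the hardware using bp->b_pblkno ... */
+}
+#endif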
+
+void
+dsclose(dev, mode, ssp)
+ dev_t dev;
+ int mode;
+ struct diskslices *ssp;
+{
+ u_char mask;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[dkslice(dev)];
+ mask = 1 << dkpart(dev);
+ switch (mode) {
+ case S_IFBLK:
+ sp->ds_bopenmask &= ~mask;
+ break;
+ case S_IFCHR:
+ sp->ds_copenmask &= ~mask;
+ break;
+ }
+ sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
+}
+
+void
+dsgone(sspp)
+ struct diskslices **sspp;
+{
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_bad != NULL) {
+ free(sp->ds_bad, M_DEVBUF);
+ set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL);
+ }
+#ifdef DEVFS
+ if (sp->ds_bdev != NULL)
+ devfs_remove_dev(sp->ds_bdev);
+ if (sp->ds_cdev != NULL)
+ devfs_remove_dev(sp->ds_cdev);
+#endif
+ free_ds_label(ssp, slice);
+ }
+ free(ssp, M_DEVBUF);
+ *sspp = NULL;
+}
+
+/*
+ * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this
+ * is subject to the same restriction as dsopen().
+ */
+int
+dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom)
+ char *dname;
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flags;
+ struct diskslices **sspp;
+ d_strategy_t *strat;
+ ds_setgeom_t *setgeom;
+{
+ int error;
+ struct disklabel *lp;
+ int old_wlabel;
+ u_char openmask;
+ int part;
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ slice = dkslice(dev);
+ ssp = *sspp;
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ switch (cmd) {
+
+ case DIOCGDINFO:
+ if (lp == NULL)
+ return (EINVAL);
+ *(struct disklabel *)data = *lp;
+ return (0);
+
+#ifdef notyet
+ case DIOCGDINFOP:
+ if (lp == NULL)
+ return (EINVAL);
+ *(struct disklabel **)data = lp;
+ return (0);
+#endif
+
+ case DIOCGPART:
+ if (lp == NULL)
+ return (EINVAL);
+ ((struct partinfo *)data)->disklab = lp;
+ ((struct partinfo *)data)->part
+ = &lp->d_partitions[dkpart(dev)];
+ return (0);
+
+ case DIOCGSLICEINFO:
+ bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] -
+ (char *)ssp);
+ return (0);
+
+ case DIOCSBAD:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ if (lp == NULL)
+ return (EINVAL);
+ if (sp->ds_bad != NULL)
+ free(sp->ds_bad, M_DEVBUF);
+ set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp));
+ return (0);
+
+ case DIOCSDINFO:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ if (sp->ds_label == NULL)
+ bzero(lp, sizeof *lp);
+ else
+ bcopy(sp->ds_label, lp, sizeof *lp);
+ if (sp->ds_label == NULL)
+ openmask = 0;
+ else {
+ openmask = sp->ds_openmask;
+ if (slice == COMPATIBILITY_SLICE)
+ openmask |= ssp->dss_slices[
+ ssp->dss_first_bsd_slice].ds_openmask;
+ else if (slice == ssp->dss_first_bsd_slice)
+ openmask |= ssp->dss_slices[
+ COMPATIBILITY_SLICE].ds_openmask;
+ }
+ error = setdisklabel(lp, (struct disklabel *)data,
+ (u_long)openmask);
+ /* XXX why doesn't setdisklabel() check this? */
+ if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0)
+ error = EXDEV;
+ if (error == 0) {
+ if (lp->d_secperunit > sp->ds_size)
+ error = ENOSPC;
+ for (part = 0; part < lp->d_npartitions; part++)
+ if (lp->d_partitions[part].p_size > sp->ds_size)
+ error = ENOSPC;
+ }
+#if 0 /* XXX */
+ if (error != 0 && setgeom != NULL)
+ error = setgeom(lp);
+#endif
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ return (error);
+ }
+ free_ds_label(ssp, slice);
+ set_ds_label(ssp, slice, lp);
+#ifdef DEVFS
+ set_ds_labeldevs(dname, dev, ssp);
+#endif
+ return (0);
+
+ case DIOCSYNCSLICEINFO:
+ if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART)
+ return (EINVAL);
+ if (!*(int *)data)
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ openmask = ssp->dss_slices[slice].ds_openmask;
+ if (openmask
+ && (slice != WHOLE_DISK_SLICE
+ || openmask & ~(1 << RAW_PART)))
+ return (EBUSY);
+ }
+
+ /*
+ * Temporarily forget the current slices struct and read
+ * the current one.
+ * XXX should wait for current accesses on this disk to
+ * complete, then lock out future accesses and opens.
+ */
+ *sspp = NULL;
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
+ error = dsopen(dname, dev,
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask
+ & (1 << RAW_PART) ? S_IFCHR : S_IFBLK,
+ ssp->dss_oflags, sspp, lp, strat, setgeom,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ *sspp = ssp;
+ return (error);
+ }
+
+ /*
+ * Reopen everything. This is a no-op except in the "force"
+ * case and when the raw bdev and cdev are both open. Abort
+ * if anything fails.
+ */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ for (openmask = ssp->dss_slices[slice].ds_bopenmask,
+ part = 0; openmask; openmask >>= 1, part++) {
+ if (!(openmask & 1))
+ continue;
+ error = dsopen(dname,
+ dkmodslice(dkmodpart(dev, part),
+ slice),
+ S_IFBLK, ssp->dss_oflags, sspp,
+ lp, strat, setgeom,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ /* XXX should free devfs toks. */
+ free(lp, M_DEVBUF);
+ /* XXX should restore devfs toks. */
+ *sspp = ssp;
+ return (EBUSY);
+ }
+ }
+ for (openmask = ssp->dss_slices[slice].ds_copenmask,
+ part = 0; openmask; openmask >>= 1, part++) {
+ if (!(openmask & 1))
+ continue;
+ error = dsopen(dname,
+ dkmodslice(dkmodpart(dev, part),
+ slice),
+ S_IFCHR, ssp->dss_oflags, sspp,
+ lp, strat, setgeom,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ /* XXX should free devfs toks. */
+ free(lp, M_DEVBUF);
+ /* XXX should restore devfs toks. */
+ *sspp = ssp;
+ return (EBUSY);
+ }
+ }
+ }
+
+ /* XXX devfs tokens? */
+ free(lp, M_DEVBUF);
+ dsgone(&ssp);
+ return (0);
+
+ case DIOCWDINFO:
+ error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp,
+ strat, setgeom);
+ if (error != 0)
+ return (error);
+ /*
+ * XXX this used to hack on dk_openpart to fake opening
+ * partition 0 in case that is used instead of dkpart(dev).
+ */
+ old_wlabel = sp->ds_wlabel;
+ set_ds_wlabel(ssp, slice, TRUE);
+ error = writedisklabel(dev, strat, sp->ds_label);
+ /* XXX should invalidate in-core label if write failed. */
+ set_ds_wlabel(ssp, slice, old_wlabel);
+ return (error);
+
+ case DIOCWLABEL:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ set_ds_wlabel(ssp, slice, *(int *)data != 0);
+ return (0);
+
+ default:
+ return (ENOIOCTL);
+ }
+}
+
+static void
+dsiodone(bp)
+ struct buf *bp;
+{
+ struct iodone_chain *ic;
+ char *msg;
+
+ ic = bp->b_iodone_chain;
+ bp->b_flags = (ic->ic_prev_flags & B_CALL)
+ | (bp->b_flags & ~(B_CALL | B_DONE));
+ bp->b_iodone = ic->ic_prev_iodone;
+ bp->b_iodone_chain = ic->ic_prev_iodone_chain;
+ if (!(bp->b_flags & B_READ)
+ || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) {
+ msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr,
+ (struct disklabel *)
+ (bp->b_data + ic->ic_args[0].ia_long),
+ FALSE);
+ if (msg != NULL)
+ printf("%s\n", msg);
+ }
+ free(ic, M_DEVBUF);
+ biodone(bp);
+}
+
+int
+dsisopen(ssp)
+ struct diskslices *ssp;
+{
+ int slice;
+
+ if (ssp == NULL)
+ return (0);
+ for (slice = 0; slice < ssp->dss_nslices; slice++)
+ if (ssp->dss_slices[slice].ds_openmask)
+ return (1);
+ return (0);
+}
+
+/*
+ * Allocate a slices "struct" and initialize it to contain only an empty
+ * compatibility slice (pointing to itself), a whole disk slice (covering
+ * the disk as described by the label), and (nslices - BASE_SLICE) empty
+ * slices beginning at BASE_SLICE.
+ */
+struct diskslices *
+dsmakeslicestruct(nslices, lp)
+ int nslices;
+ struct disklabel *lp;
+{
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ ssp = malloc(offsetof(struct diskslices, dss_slices) +
+ nslices * sizeof *sp, M_DEVBUF, M_WAITOK);
+ ssp->dss_cdevsw = NULL;
+ ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE;
+ ssp->dss_nslices = nslices;
+ ssp->dss_oflags = 0;
+ ssp->dss_secmult = lp->d_secsize / DEV_BSIZE;
+ if (ssp->dss_secmult & (ssp->dss_secmult - 1))
+ ssp->dss_secshift = -1;
+ else
+ ssp->dss_secshift = ffs(ssp->dss_secmult) - 1;
+ ssp->dss_secsize = lp->d_secsize;
+ sp = &ssp->dss_slices[0];
+ bzero(sp, nslices * sizeof *sp);
+ sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit;
+ return (ssp);
+}
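+
+/*
+ * Example of the secmult/secshift setup above: with DEV_BSIZE = 512, a
+ * 2048-byte sector gives dss_secmult = 4 and dss_secshift = ffs(4) - 1
+ * = 2, so dscheck() can convert block numbers with shifts; a
+ * non-power-of-two multiple (e.g. a 1536-byte sector, secmult 3)
+ * forces dss_secshift = -1 and the slower division path.
+ */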
+
+char *
+dsname(dname, unit, slice, part, partname)
+ char *dname;
+ int unit;
+ int slice;
+ int part;
+ char *partname;
+{
+ static char name[32];
+
+ if (strlen(dname) > 16)
+ dname = "nametoolong";
+ snprintf(name, sizeof(name), "%s%d", dname, unit);
+ partname[0] = '\0';
+ if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
+ partname[0] = 'a' + part;
+ partname[1] = '\0';
+ if (slice != COMPATIBILITY_SLICE)
+ snprintf(name + strlen(name),
+ sizeof(name) - strlen(name), "s%d", slice - 1);
+ }
+ return (name);
+}
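+
+/*
+ * Naming examples for a hypothetical "wd" disk, unit 0: the whole-disk
+ * raw device is "wd0" with an empty partname; compatibility-slice
+ * partition 0 is "wd0" plus partname "a" ("wd0a"); slice 2, partition
+ * RAW_PART is "wd0s1" plus partname "c" ("wd0s1c"), since printed
+ * slice numbers are offset by one.
+ */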
+
+/*
+ * This should only be called when the unit is inactive and the strategy
+ * routine should not allow it to become active unless we call it. Our
+ * strategy routine must be special to allow activity.
+ */
+int
+dsopen(dname, dev, mode, flags, sspp, lp, strat, setgeom, cdevsw)
+ char *dname;
+ dev_t dev;
+ int mode;
+ u_int flags;
+ struct diskslices **sspp;
+ struct disklabel *lp;
+ d_strategy_t *strat;
+ ds_setgeom_t *setgeom;
+ struct cdevsw *cdevsw;
+{
+ struct dkbad *btp;
+ dev_t dev1;
+ int error;
+ struct disklabel *lp1;
+ char *msg;
+ u_char mask;
+#ifdef DEVFS
+ int mynor;
+#endif
+ bool_t need_init;
+ int part;
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+ int unit;
+
+ if (lp->d_secsize % DEV_BSIZE)
+ return (EINVAL);
+
+ /*
+ * XXX reinitialize the slice table unless there is an open device
+ * on the unit. This should only be done if the media has changed.
+ */
+ ssp = *sspp;
+ need_init = !dsisopen(ssp);
+ if (ssp != NULL && need_init)
+ dsgone(sspp);
+ if (need_init) {
+ /*
+ * Allocate a minimal slices "struct". This will become
+ * the final slices "struct" if we don't want real slices
+ * or if we can't find any real slices.
+ */
+ *sspp = dsmakeslicestruct(BASE_SLICE, lp);
+
+ if (!(flags & DSO_ONESLICE)) {
+ TRACE(("dsinit\n"));
+ error = dsinit(dname, dev, strat, lp, sspp);
+ if (error != 0) {
+ dsgone(sspp);
+ return (error);
+ }
+ }
+ ssp = *sspp;
+ ssp->dss_oflags = flags;
+#ifdef DEVFS
+ ssp->dss_cdevsw = cdevsw;
+#endif
+
+ /*
+ * If there are no real slices, then make the compatibility
+ * slice cover the whole disk.
+ */
+ if (ssp->dss_nslices == BASE_SLICE)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = lp->d_secperunit;
+
+ /* Point the compatibility slice at the BSD slice, if any. */
+ for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */) {
+ ssp->dss_first_bsd_slice = slice;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset
+ = sp->ds_offset;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = sp->ds_size;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_type
+ = sp->ds_type;
+ break;
+ }
+ }
+
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp);
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE;
+ if (setgeom != NULL) {
+ error = setgeom(lp);
+ if (error != 0) {
+ dsgone(sspp);
+ return (error);
+ }
+ }
+ }
+
+ unit = dkunit(dev);
+
+ /*
+ * Initialize secondary info for all slices. It is needed for more
+ * than the current slice in the DEVFS case.
+ */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_label != NULL)
+ continue;
+ dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice);
+ sname = dsname(dname, unit, slice, RAW_PART, partname);
+#ifdef DEVFS
+ if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL
+ && sp->ds_size != 0) {
+ mynor = minor(dev1);
+ sp->ds_bdev =
+ devfs_add_devswf(bdevsw, mynor, DV_BLK,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "%s", sname);
+ sp->ds_cdev =
+ devfs_add_devswf(cdevsw, mynor, DV_CHR,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "r%s", sname);
+ }
+#endif
+ /*
+ * XXX this should probably only be done for the need_init
+ * case, but there may be a problem with DIOCSYNCSLICEINFO.
+ */
+ set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */
+ lp1 = clone_label(lp);
+ TRACE(("readdisklabel\n"));
+ if (flags & DSO_NOLABELS)
+ msg = NULL;
+ else
+ msg = readdisklabel(dev1, strat, lp1);
+#if 0 /* XXX */
+ if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0)
+ msg = "setgeom failed";
+#endif
+ if (msg == NULL)
+ msg = fixlabel(sname, sp, lp1, FALSE);
+ if (msg == NULL && lp1->d_secsize != ssp->dss_secsize)
+ msg = "inconsistent sector size";
+ if (msg != NULL) {
+ free(lp1, M_DEVBUF);
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */)
+ log(LOG_WARNING, "%s: cannot find label (%s)\n",
+ sname, msg);
+ continue;
+ }
+ if (lp1->d_flags & D_BADSECT) {
+ btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK);
+ TRACE(("readbad144\n"));
+ msg = readbad144(dev1, strat, lp1, btp);
+ if (msg != NULL) {
+ log(LOG_WARNING,
+ "%s: cannot find bad sector table (%s)\n",
+ sname, msg);
+ free(btp, M_DEVBUF);
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ set_ds_bad(ssp, slice, internbad144(btp, lp1));
+ free(btp, M_DEVBUF);
+ if (sp->ds_bad == NULL) {
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ }
+ set_ds_label(ssp, slice, lp1);
+#ifdef DEVFS
+ set_ds_labeldevs(dname, dev1, ssp);
+#endif
+ set_ds_wlabel(ssp, slice, FALSE);
+ }
+
+ slice = dkslice(dev);
+ if (slice >= ssp->dss_nslices)
+ return (ENXIO);
+ sp = &ssp->dss_slices[slice];
+ part = dkpart(dev);
+ if (part != RAW_PART
+ && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions))
+ return (EINVAL); /* XXX needs translation */
+ mask = 1 << part;
+ switch (mode) {
+ case S_IFBLK:
+ sp->ds_bopenmask |= mask;
+ break;
+ case S_IFCHR:
+ sp->ds_copenmask |= mask;
+ break;
+ }
+ sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
+ return (0);
+}
+
+int
+dssize(dev, sspp, dopen, dclose)
+ dev_t dev;
+ struct diskslices **sspp;
+ d_open_t dopen;
+ d_close_t dclose;
+{
+ struct disklabel *lp;
+ int part;
+ int slice;
+ struct diskslices *ssp;
+
+ slice = dkslice(dev);
+ part = dkpart(dev);
+ ssp = *sspp;
+ if (ssp == NULL || slice >= ssp->dss_nslices
+ || !(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) {
+ if (dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0)
+ return (-1);
+ dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL);
+ ssp = *sspp;
+ }
+ lp = ssp->dss_slices[slice].ds_label;
+ if (lp == NULL)
+ return (-1);
+ return ((int)lp->d_partitions[part].p_size);
+}
+
+static void
+free_ds_label(ssp, slice)
+ struct diskslices *ssp;
+ int slice;
+{
+ struct disklabel *lp;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ if (lp == NULL)
+ return;
+#ifdef DEVFS
+ free_ds_labeldevs(ssp, slice);
+ if (slice == COMPATIBILITY_SLICE)
+ free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice);
+ else if (slice == ssp->dss_first_bsd_slice)
+ free_ds_labeldevs(ssp, COMPATIBILITY_SLICE);
+#endif
+ free(lp, M_DEVBUF);
+ set_ds_label(ssp, slice, (struct disklabel *)NULL);
+}
+
+#ifdef DEVFS
+static void
+free_ds_labeldevs(ssp, slice)
+ struct diskslices *ssp;
+ int slice;
+{
+ struct disklabel *lp;
+ int part;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ if (lp == NULL)
+ return;
+ for (part = 0; part < lp->d_npartitions; part++) {
+ if (sp->ds_bdevs[part] != NULL) {
+ devfs_remove_dev(sp->ds_bdevs[part]);
+ sp->ds_bdevs[part] = NULL;
+ }
+ if (sp->ds_cdevs[part] != NULL) {
+ devfs_remove_dev(sp->ds_cdevs[part]);
+ sp->ds_cdevs[part] = NULL;
+ }
+ }
+}
+#endif
+
+static char *
+fixlabel(sname, sp, lp, writeflag)
+ char *sname;
+ struct diskslice *sp;
+ struct disklabel *lp;
+ int writeflag;
+{
+ u_long end;
+ u_long offset;
+ int part;
+ struct partition *pp;
+ u_long start;
+ bool_t warned;
+
+ /* These errors "can't happen" so don't bother reporting details. */
+ if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC)
+ return ("fixlabel: invalid magic");
+ if (dkcksum(lp) != 0)
+ return ("fixlabel: invalid checksum");
+
+ pp = &lp->d_partitions[RAW_PART];
+ if (writeflag) {
+ start = 0;
+ offset = sp->ds_offset;
+ } else {
+ start = sp->ds_offset;
+ offset = -sp->ds_offset;
+ }
+ if (pp->p_offset != start) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting BSD label: raw partition offset != slice offset\n",
+ sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ return ("fixlabel: raw partition offset != slice offset");
+ }
+ if (pp->p_size != sp->ds_size) {
+ if (sname != NULL) {
+ printf("%s: raw partition size != slice size\n", sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ if (pp->p_size > sp->ds_size) {
+ if (sname == NULL)
+ return ("fixlabel: raw partition size > slice size");
+ printf("%s: truncating raw partition\n", sname);
+ pp->p_size = sp->ds_size;
+ }
+ }
+ end = start + sp->ds_size;
+ if (start > end)
+ return ("fixlabel: slice wraps");
+ if (lp->d_secpercyl <= 0)
+ return ("fixlabel: d_secpercyl <= 0");
+ pp -= RAW_PART;
+ warned = FALSE;
+ for (part = 0; part < lp->d_npartitions; part++, pp++) {
+ if (pp->p_offset != 0 || pp->p_size != 0) {
+ if (pp->p_offset < start
+ || pp->p_offset + pp->p_size > end
+ || pp->p_offset + pp->p_size < pp->p_offset) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting partition in BSD label: it isn't entirely within the slice\n",
+ sname);
+ if (!warned) {
+ slice_info(sname, sp);
+ warned = TRUE;
+ }
+ partition_info(sname, part, pp);
+ }
+ /* XXX else silently discard junk. */
+ bzero(pp, sizeof *pp);
+ } else
+ pp->p_offset += offset;
+ }
+ }
+ lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
+ lp->d_secperunit = sp->ds_size;
+ lp->d_checksum = 0;
+ lp->d_checksum = dkcksum(lp);
+ return (NULL);
+}
+
+static void
+partition_info(sname, part, pp)
+ char *sname;
+ int part;
+ struct partition *pp;
+{
+ printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part,
+ (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1),
+ (u_long)pp->p_size);
+}
+
+static void
+slice_info(sname, sp)
+ char *sname;
+ struct diskslice *sp;
+{
+ printf("%s: start %lu, end %lu, size %lu\n", sname,
+ sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size);
+}
+
+/*
+ * Most changes to ds_bad, ds_label and ds_wlabel are made using the
+ * following functions to ensure coherency of the compatibility slice
+ * with the first BSD slice. The openmask fields are _not_ shared and
+ * the other fields (ds_offset and ds_size) aren't changed after they
+ * are initialized.
+ */
+static void
+set_ds_bad(ssp, slice, btp)
+ struct diskslices *ssp;
+ int slice;
+ struct dkbad_intern *btp;
+{
+ ssp->dss_slices[slice].ds_bad = btp;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp;
+}
+
+static void
+set_ds_label(ssp, slice, lp)
+ struct diskslices *ssp;
+ int slice;
+ struct disklabel *lp;
+{
+ ssp->dss_slices[slice].ds_label = lp;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp;
+}
+
+#ifdef DEVFS
+static void
+set_ds_labeldevs(dname, dev, ssp)
+ char *dname;
+ dev_t dev;
+ struct diskslices *ssp;
+{
+ int slice;
+
+ set_ds_labeldevs_unaliased(dname, dev, ssp);
+ if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE)
+ return;
+ slice = dkslice(dev);
+ if (slice == COMPATIBILITY_SLICE)
+ set_ds_labeldevs_unaliased(dname,
+ dkmodslice(dev, ssp->dss_first_bsd_slice), ssp);
+ else if (slice == ssp->dss_first_bsd_slice)
+ set_ds_labeldevs_unaliased(dname,
+ dkmodslice(dev, COMPATIBILITY_SLICE), ssp);
+}
+
+static void
+set_ds_labeldevs_unaliased(dname, dev, ssp)
+ char *dname;
+ dev_t dev;
+ struct diskslices *ssp;
+{
+ struct disklabel *lp;
+ int mynor;
+ int part;
+ char partname[2];
+ struct partition *pp;
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+
+ slice = dkslice(dev);
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_size == 0)
+ return;
+ lp = sp->ds_label;
+ for (part = 0; part < lp->d_npartitions; part++) {
+ pp = &lp->d_partitions[part];
+ if (pp->p_size == 0)
+ continue;
+ sname = dsname(dname, dkunit(dev), slice, part, partname);
+ if (part == RAW_PART && sp->ds_bdev != NULL) {
+ sp->ds_bdevs[part] =
+ devfs_makelink(sp->ds_bdev,
+ "%s%s", sname, partname);
+ sp->ds_cdevs[part] =
+ devfs_makelink(sp->ds_cdev,
+ "r%s%s", sname, partname);
+ } else {
+ mynor = minor(dkmodpart(dev, part));
+ sp->ds_bdevs[part] =
+ devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_BLK,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "%s%s", sname, partname);
+ sp->ds_cdevs[part] =
+ devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "r%s%s", sname, partname);
+ }
+ }
+}
+#endif /* DEVFS */
+
+static void
+set_ds_wlabel(ssp, slice, wlabel)
+ struct diskslices *ssp;
+ int slice;
+ int wlabel;
+{
+ ssp->dss_slices[slice].ds_wlabel = wlabel;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel;
+}
diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c
new file mode 100644
index 0000000..4686a17
--- /dev/null
+++ b/sys/kern/subr_dkbad.c
@@ -0,0 +1,160 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)wd.c 7.2 (Berkeley) 5/9/91
+ * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id: subr_dkbad.c,v 1.7 1997/11/24 04:14:21 dyson Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#include <sys/dkbad.h>
+#include <sys/malloc.h>
+
+/*
+ * Internalize the bad sector table.
+ * TODO:
+ * o Fix types.
+ * Type long should be daddr_t since we compare with blkno's.
+ * Sentinel -1 should be ((daddr_t)-1).
+ * o Can remove explicit test for sentinel if it is a positive
+ * (unsigned or not) value larger than all possible blkno's.
+ * o Check that the table is sorted.
+ * o Use faster searches.
+ * o Use the internal table in wddump().
+ * o Don't duplicate so much code.
+ * o Do all bad block handling in a driver-independent file.
+ * o Remove limit of 126 spare sectors.
+ */
+struct dkbad_intern *
+internbad144(btp, lp)
+ struct dkbad *btp;
+ struct disklabel *lp;
+{
+ struct dkbad_intern *bip;
+ int i;
+
+ bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK);
+ /*
+ * Spare sectors are allocated beginning with the last sector of
+ * the second last track of the disk (the last track is used for
+ * the bad sector list).
+ */
+ bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1;
+ bip->bi_nbad = DKBAD_MAXBAD;
+ for (i = 0; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++)
+ bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl
+ + (btp->bt_bad[i].bt_trksec >> 8)
+ * lp->d_nsectors
+ + (btp->bt_bad[i].bt_trksec & 0x00ff);
+ bip->bi_bad[i] = -1;
+ return (bip);
+}
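+
+/*
+ * Worked example of the conversion above (invented geometry): with
+ * d_secpercyl = 2048 and d_nsectors = 32, a table entry for cylinder 3,
+ * track 1, sector 5 (bt_trksec = (1 << 8) | 5) internalizes to
+ * 3 * 2048 + 1 * 32 + 5 = 6181, an absolute sector number in the slice.
+ */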
+
+char *
+readbad144(dev, strat, lp, bdp)
+ dev_t dev;
+ d_strategy_t *strat;
+ struct disklabel *lp;
+ struct dkbad *bdp;
+{
+ struct buf *bp;
+ struct dkbad *db;
+ int i;
+ char *msg;
+
+ bp = geteblk((int)lp->d_secsize);
+ i = 0;
+ do {
+ /* Read a bad sector table. */
+ bp->b_dev = dev;
+ bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i;
+ if (lp->d_secsize > DEV_BSIZE)
+ bp->b_blkno *= lp->d_secsize / DEV_BSIZE;
+ else
+ bp->b_blkno /= DEV_BSIZE / lp->d_secsize;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags |= B_BUSY | B_READ;
+ bp->b_flags &= ~B_ERROR;
+ (*strat)(bp);
+
+ /* If successful, validate, otherwise try another. */
+ if (biowait(bp) == 0) {
+ db = (struct dkbad *)(bp->b_data);
+ if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) {
+ msg = NULL;
+ *bdp = *db;
+ break;
+ }
+ msg = "bad sector table corrupted";
+ } else
+ msg = "bad sector table I/O error";
+ } while ((bp->b_flags & B_ERROR) && (i += 2) < 10 &&
+ i < lp->d_nsectors);
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (msg);
+}
+
+daddr_t
+transbad144(bip, blkno)
+ struct dkbad_intern *bip;
+ daddr_t blkno;
+{
+ int i;
+
+ /*
+ * List is sorted, so the search can terminate when it is past our
+ * sector.
+ */
+ for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++)
+ if (bip->bi_bad[i] == blkno)
+ /*
+ * Spare sectors are allocated in decreasing order.
+ */
+ return (bip->bi_maxspare - i);
+ return (blkno);
+}
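+
+/*
+ * Worked example (invented geometry): with d_secperunit = 20480 and
+ * d_nsectors = 32, bi_maxspare is 20480 - 32 - 1 = 20447.  A blkno
+ * matching bi_bad[0] maps to sector 20447, bi_bad[1] to 20446, and so
+ * on backward through the spare area.
+ */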
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
new file mode 100644
index 0000000..1204376
--- /dev/null
+++ b/sys/kern/subr_log.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_log.c 8.1 (Berkeley) 6/10/93
+ * $Id: subr_log.c,v 1.32 1998/11/11 10:55:56 truckman Exp $
+ */
+
+/*
+ * Error log buffer for kernel printf's.
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/msgbuf.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/poll.h>
+#include <sys/filedesc.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#define LOG_RDPRI (PZERO + 1)
+
+#define LOG_ASYNC 0x04
+#define LOG_RDWAIT 0x08
+
+static d_open_t logopen;
+static d_close_t logclose;
+static d_read_t logread;
+static d_ioctl_t logioctl;
+static d_poll_t logpoll;
+
+#define CDEV_MAJOR 7
+static struct cdevsw log_cdevsw =
+ { logopen, logclose, logread, nowrite, /*7*/
+ logioctl, nostop, nullreset, nodevtotty,/* klog */
+ logpoll, nommap, NULL, "log", NULL, -1 };
+
+static struct logsoftc {
+ int sc_state; /* see above for possibilities */
+ struct selinfo sc_selp; /* process waiting on select call */
+ struct sigio *sc_sigio; /* information for async I/O */
+} logsoftc;
+
+int log_open; /* also used in log() */
+
+/*ARGSUSED*/
+static int
+logopen(dev, flags, mode, p)
+ dev_t dev;
+ int flags, mode;
+ struct proc *p;
+{
+ if (log_open)
+ return (EBUSY);
+ log_open = 1;
+ fsetown(p->p_pid, &logsoftc.sc_sigio); /* signal process only */
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logclose(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+
+ log_open = 0;
+ logsoftc.sc_state = 0;
+ funsetown(logsoftc.sc_sigio);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ register struct msgbuf *mbp = msgbufp;
+ register long l;
+ register int s;
+ int error = 0;
+
+ s = splhigh();
+ while (mbp->msg_bufr == mbp->msg_bufx) {
+ if (flag & IO_NDELAY) {
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ logsoftc.sc_state |= LOG_RDWAIT;
+ if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH,
+ "klog", 0))) {
+ splx(s);
+ return (error);
+ }
+ }
+ splx(s);
+ logsoftc.sc_state &= ~LOG_RDWAIT;
+
+ while (uio->uio_resid > 0) {
+ l = mbp->msg_bufx - mbp->msg_bufr;
+ if (l < 0)
+ l = mbp->msg_size - mbp->msg_bufr;
+ l = min(l, uio->uio_resid);
+ if (l == 0)
+ break;
+ error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr,
+ (int)l, uio);
+ if (error)
+ break;
+ mbp->msg_bufr += l;
+ if (mbp->msg_bufr >= mbp->msg_size)
+ mbp->msg_bufr = 0;
+ }
+ return (error);
+}
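+
+/*
+ * Example of the chunk arithmetic above: with msg_size = 4096,
+ * msg_bufr = 4000 and msg_bufx = 100, the first pass copies the
+ * 4096 - 4000 = 96 byte tail, msg_bufr wraps to 0, and the next pass
+ * copies the remaining 100 bytes from the start of the buffer.
+ */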
+
+/*ARGSUSED*/
+static int
+logpoll(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+ int s;
+ int revents = 0;
+
+ s = splhigh();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if (msgbufp->msg_bufr != msgbufp->msg_bufx)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(p, &logsoftc.sc_selp);
+
+ splx(s);
+ return (revents);
+}
+
+void
+logwakeup()
+{
+ if (!log_open)
+ return;
+ selwakeup(&logsoftc.sc_selp);
+ if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL)
+ pgsigio(logsoftc.sc_sigio, SIGIO, 0);
+ if (logsoftc.sc_state & LOG_RDWAIT) {
+ wakeup((caddr_t)msgbufp);
+ logsoftc.sc_state &= ~LOG_RDWAIT;
+ }
+}
+
+/*ARGSUSED*/
+static int
+logioctl(dev, com, data, flag, p)
+ dev_t dev;
+ u_long com;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ long l;
+ int s;
+
+ switch (com) {
+
+ /* return number of characters immediately available */
+ case FIONREAD:
+ s = splhigh();
+ l = msgbufp->msg_bufx - msgbufp->msg_bufr;
+ splx(s);
+ if (l < 0)
+ l += msgbufp->msg_size;
+ *(int *)data = l;
+ break;
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ if (*(int *)data)
+ logsoftc.sc_state |= LOG_ASYNC;
+ else
+ logsoftc.sc_state &= ~LOG_ASYNC;
+ break;
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &logsoftc.sc_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(logsoftc.sc_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &logsoftc.sc_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead. */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(logsoftc.sc_sigio);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+static int log_devsw_installed;
+#ifdef DEVFS
+static void *log_devfs_token;
+#endif
+
+static void log_drvinit __P((void *unused));
+static void
+log_drvinit(unused)
+ void *unused;
+{
+ dev_t dev;
+
+ if( ! log_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR,0);
+ cdevsw_add(&dev,&log_cdevsw,NULL);
+ log_devsw_installed = 1;
+#ifdef DEVFS
+ log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0600,
+ "klog");
+#endif
+ }
+}
+
+SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL)
diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c
new file mode 100644
index 0000000..7eb635a
--- /dev/null
+++ b/sys/kern/subr_module.c
@@ -0,0 +1,267 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: subr_module.c,v 1.3 1998/10/12 09:03:48 peter Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/linker.h>
+
+/*
+ * Preloaded module support
+ */
+
+caddr_t preload_metadata;
+
+/*
+ * Search for the preloaded module (name)
+ */
+caddr_t
+preload_search_by_name(const char *name)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if ((hdr[0] == MODINFO_NAME) &&
+ !strcmp(name, curp + sizeof(u_int32_t) * 2))
+ return(curp);
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Search for the first preloaded module of (type)
+ */
+caddr_t
+preload_search_by_type(const char *type)
+{
+ caddr_t curp, lname;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ lname = NULL;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* remember the start of each record */
+ if (hdr[0] == MODINFO_NAME)
+ lname = curp;
+
+ /* Search for a MODINFO_TYPE field */
+ if ((hdr[0] == MODINFO_TYPE) &&
+ !strcmp(type, curp + sizeof(u_int32_t) * 2))
+ return(lname);
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Walk through the preloaded module list
+ */
+caddr_t
+preload_search_next_name(caddr_t base)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ /* Pick up where we left off last time */
+ if (base) {
+ /* skip to next field */
+ curp = base;
+ hdr = (u_int32_t *)curp;
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ } else
+ curp = preload_metadata;
+
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Found a new record? */
+ if (hdr[0] == MODINFO_NAME)
+ return curp;
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Given a preloaded module handle (mod), return a pointer
+ * to the data for the attribute (inf).
+ */
+caddr_t
+preload_search_info(caddr_t mod, int inf)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ u_int32_t type = 0;
+ int next;
+
+ curp = mod;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ /* end of module data? */
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+ /*
+ * We give up once we've looped back to what we were looking at
+ * first - this should normally be a MODINFO_NAME field.
+ */
+ if (type == 0) {
+ type = hdr[0];
+ } else {
+ if (hdr[0] == type)
+ break;
+ }
+
+ /*
+ * Attribute match? Return pointer to data.
+ * Consumer may safely assume that size value precedes
+ * data.
+ */
+ if (hdr[0] == inf)
+ return(curp + (sizeof(u_int32_t) * 2));
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ return(NULL);
+}
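+
+/*
+ * Illustrative sketch, not compiled: walking the metadata from a
+ * consumer's side.  Each record is two u_int32_t words (type, data
+ * length) followed by the data, padded to a u_long boundary, so the
+ * size always precedes the data.  The "elf kernel" type string and
+ * the printf are only examples.
+ */
+#ifdef notdef
+static void
+preload_example(void)
+{
+ caddr_t kmdp, addr;
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp != NULL) {
+ addr = preload_search_info(kmdp, MODINFO_ADDR);
+ if (addr != NULL)
+ printf("kernel loaded at %p\n", *(void **)addr);
+ }
+}
+#endif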
+
+/*
+ * Delete a preload record by name.
+ */
+void
+preload_delete_name(const char *name)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ int next;
+ int clearing;
+
+ if (preload_metadata != NULL) {
+
+ clearing = 0;
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if (hdr[0] == MODINFO_NAME) {
+ if (!strcmp(name, curp + sizeof(u_int32_t) * 2))
+ clearing = 1; /* got it, start clearing */
+ else if (clearing)
+ clearing = 0; /* at next one now.. better stop */
+ }
+ if (clearing)
+ hdr[0] = MODINFO_EMPTY;
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
+
+/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */
+void
+preload_bootstrap_relocate(vm_offset_t offset)
+{
+ caddr_t curp;
+ u_int32_t *hdr;
+ vm_offset_t *ptr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (u_int32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Deal with the ones that we know we have to fix */
+ switch (hdr[0]) {
+ case MODINFO_ADDR:
+ case MODINFO_METADATA|MODINFOMD_SSYM:
+ case MODINFO_METADATA|MODINFOMD_ESYM:
+ ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2));
+ *ptr += offset;
+ break;
+ }
+ /* The rest is beyond us for now */
+
+ /* skip to next field */
+ next = sizeof(u_int32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
new file mode 100644
index 0000000..ef98c59
--- /dev/null
+++ b/sys/kern/subr_param.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 1980, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.c 8.3 (Berkeley) 8/20/94
+ * $Id: param.c,v 1.31 1998/11/05 14:28:17 dg Exp $
+ */
+
+#include <stddef.h>
+
+#include "opt_sysvipc.h"
+#include "opt_param.h"
+
+#include <sys/param.h>
+
+#ifdef SYSVSHM
+#include <machine/vmparam.h>
+#include <sys/shm.h>
+#endif
+#ifdef SYSVSEM
+#include <sys/sem.h>
+#endif
+#ifdef SYSVMSG
+#include <sys/msg.h>
+#endif
+
+/*
+ * System parameter formulae.
+ *
+ * This file is copied into each directory where we compile
+ * the kernel; it should be modified there to suit local taste
+ * if necessary.
+ *
+ * Compiled with -DMAXUSERS=xx
+ */
+
+#ifndef HZ
+#define HZ 100
+#endif
+int hz = HZ;
+int tick = 1000000 / HZ;
+int tickadj = howmany(30000, 60 * HZ); /* can adjust 30ms in 60s */
+#define NPROC (20 + 16 * MAXUSERS)
+#define MAXFILES (NPROC*2)
+int maxproc = NPROC; /* maximum # of processes */
+int maxprocperuid = NPROC-1; /* maximum # of processes per user */
+int maxfiles = MAXFILES; /* system wide open files limit */
+int maxfilesperproc = MAXFILES; /* per-process open files limit */
+int ncallout = 16 + NPROC + MAXFILES; /* maximum # of timer events */
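+
+/*
+ * For example, a kernel built with -DMAXUSERS=32 gets NPROC = 20 +
+ * 16 * 32 = 532, MAXFILES = 1064 and ncallout = 16 + 532 + 1064 = 1612.
+ */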
+
+/* maximum # of mbuf clusters */
+#ifndef NMBCLUSTERS
+#define NMBCLUSTERS (512 + MAXUSERS * 16)
+#endif
+int nmbclusters = NMBCLUSTERS;
+
+#if MAXFILES > NMBCLUSTERS
+#define MAXSOCKETS MAXFILES
+#else
+#define MAXSOCKETS NMBCLUSTERS
+#endif
+int maxsockets = MAXSOCKETS;
+
+/* allocate 1/4th amount of virtual address space for mbufs XXX */
+int nmbufs = NMBCLUSTERS * 4;
+
+/* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers) */
+#ifndef NSFBUFS
+#define NSFBUFS (512 + MAXUSERS * 16)
+#endif
+int nsfbufs = NSFBUFS;
+
+/*
+ * Values in support of System V compatible shared memory. XXX
+ */
+#ifdef SYSVSHM
+#ifndef SHMMAX
+#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
+#endif
+#ifndef SHMMIN
+#define SHMMIN 1
+#endif
+#ifndef SHMMNI
+#define SHMMNI 32 /* <= SHMMMNI in shm.h */
+#endif
+#ifndef SHMSEG
+#define SHMSEG 8
+#endif
+#ifndef SHMALL
+#define SHMALL (SHMMAXPGS)
+#endif
+
+struct shminfo shminfo = {
+ SHMMAX,
+ SHMMIN,
+ SHMMNI,
+ SHMSEG,
+ SHMALL
+};
+#endif
+
+/*
+ * Values in support of System V compatible semaphores.
+ */
+
+#ifdef SYSVSEM
+
+struct seminfo seminfo = {
+ SEMMAP, /* # of entries in semaphore map */
+ SEMMNI, /* # of semaphore identifiers */
+ SEMMNS, /* # of semaphores in system */
+ SEMMNU, /* # of undo structures in system */
+ SEMMSL, /* max # of semaphores per id */
+ SEMOPM, /* max # of operations per semop call */
+ SEMUME, /* max # of undo entries per process */
+ SEMUSZ, /* size in bytes of undo structure */
+ SEMVMX, /* semaphore maximum value */
+ SEMAEM /* adjust on exit max value */
+};
+#endif
+
+/*
+ * Values in support of System V compatible messages.
+ */
+
+#ifdef SYSVMSG
+
+struct msginfo msginfo = {
+ MSGMAX, /* max chars in a message */
+ MSGMNI, /* # of message queue identifiers */
+ MSGMNB, /* max chars in a queue */
+ MSGTQL, /* max messages in system */
+ MSGSSZ, /* size of a message segment */
+ /* (must be small power of 2 greater than 4) */
+ MSGSEG /* number of message segments */
+};
+#endif
+
+/*
+ * These may be set to nonzero here or by patching.
+ * If they are zero at bootstrap time then they are
+ * initialized to values dependent on the memory size.
+ */
+#ifdef NBUF
+int nbuf = NBUF;
+#else
+int nbuf = 0;
+#endif
+int nswbuf = 0;
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct buf *swbuf;
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
new file mode 100644
index 0000000..424ac9f
--- /dev/null
+++ b/sys/kern/subr_prf.c
@@ -0,0 +1,716 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ * $Id: subr_prf.c,v 1.50 1998/09/06 06:25:04 ache Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/msgbuf.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/tprintf.h>
+#include <sys/syslog.h>
+#include <machine/cons.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro is used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define TOCONS 0x01
+#define TOTTY 0x02
+#define TOLOG 0x04
+
+struct tty *constty; /* pointer to console "window" tty */
+
+struct putchar_arg {
+ int flags;
+ struct tty *tty;
+};
+
+struct snprintf_arg {
+ char *str;
+ size_t remain;
+};
+
+static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */
+static void logpri __P((int level));
+static void msglogchar(int c, void *dummyarg);
+static void putchar __P((int ch, void *arg));
+static char *ksprintn __P((u_long num, int base, int *len));
+static void snprintf_func __P((int ch, void *arg));
+
+static int consintr = 1; /* Ok to handle console interrupts? */
+static int msgbufmapped; /* Set when safe to use msgbuf */
+
+/*
+ * Warn that a system table is full.
+ */
+void
+tablefull(tab)
+ const char *tab;
+{
+
+ log(LOG_ERR, "%s: table is full\n", tab);
+}
+
+/*
+ * Uprintf prints to the controlling terminal for the current process.
+ * It may block if the tty queue is overfull. No message is printed if
+ * the queue does not clear in a reasonable time.
+ */
+void
+uprintf(const char *fmt, ...)
+{
+ struct proc *p = curproc;
+ va_list ap;
+ struct putchar_arg pca;
+
+ if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ va_start(ap, fmt);
+ pca.tty = p->p_session->s_ttyp;
+ pca.flags = TOTTY;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ }
+}
+
+tpr_t
+tprintf_open(p)
+ register struct proc *p;
+{
+
+ if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ SESSHOLD(p->p_session);
+ return ((tpr_t) p->p_session);
+ }
+ return ((tpr_t) NULL);
+}
+
+void
+tprintf_close(sess)
+ tpr_t sess;
+{
+
+ if (sess)
+ SESSRELE((struct session *) sess);
+}
+
+/*
+ * tprintf prints on the controlling terminal associated
+ * with the given session.
+ */
+void
+tprintf(tpr_t tpr, const char *fmt, ...)
+{
+ register struct session *sess = (struct session *)tpr;
+ struct tty *tp = NULL;
+ int flags = TOLOG;
+ va_list ap;
+ struct putchar_arg pca;
+
+ logpri(LOG_INFO);
+ if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) {
+ flags |= TOTTY;
+ tp = sess->s_ttyp;
+ }
+ va_start(ap, fmt);
+ pca.tty = tp;
+ pca.flags = flags;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ logwakeup();
+}
+
+/*
+ * Ttyprintf displays a message on a tty; it should be used only by
+ * the tty driver, or anything that knows the underlying tty will not
+ * be revoke(2)'d away. Other callers should use tprintf.
+ */
+void
+ttyprintf(struct tty *tp, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ va_start(ap, fmt);
+ pca.tty = tp;
+ pca.flags = TOTTY;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+}
+
+extern int log_open;
+
+/*
+ * Log writes to the log buffer, and guarantees not to sleep (so can be
+ * called by interrupt routines). If there is no process reading the
+ * log yet, it writes to the console also.
+ */
+void
+log(int level, const char *fmt, ...)
+{
+ register int s;
+ va_list ap;
+
+ s = splhigh();
+ logpri(level);
+ va_start(ap, fmt);
+
+ kvprintf(fmt, msglogchar, NULL, 10, ap);
+ va_end(ap);
+
+ splx(s);
+ if (!log_open) {
+ struct putchar_arg pca;
+ va_start(ap, fmt);
+ pca.tty = NULL;
+ pca.flags = TOCONS;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ }
+ logwakeup();
+}
+
+static void
+logpri(level)
+ int level;
+{
+ register char *p;
+
+ msglogchar('<', NULL);
+ for (p = ksprintn((u_long)level, 10, NULL); *p;)
+ msglogchar(*p--, NULL);
+ msglogchar('>', NULL);
+}
+
+int
+addlog(const char *fmt, ...)
+{
+ register int s;
+ va_list ap;
+ int retval;
+
+ s = splhigh();
+ va_start(ap, fmt);
+ retval = kvprintf(fmt, msglogchar, NULL, 10, ap);
+ splx(s);
+ va_end(ap);
+ if (!log_open) {
+ struct putchar_arg pca;
+ va_start(ap, fmt);
+ pca.tty = NULL;
+ pca.flags = TOCONS;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ }
+ logwakeup();
+ return (retval);
+}
+
+int
+printf(const char *fmt, ...)
+{
+ va_list ap;
+ register int savintr;
+ struct putchar_arg pca;
+ int retval;
+
+ savintr = consintr; /* disable interrupts */
+ consintr = 0;
+ va_start(ap, fmt);
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ if (!panicstr)
+ logwakeup();
+ consintr = savintr; /* reenable interrupts */
+ return retval;
+}
+
+void
+vprintf(const char *fmt, va_list ap)
+{
+ register int savintr;
+ struct putchar_arg pca;
+
+ savintr = consintr; /* disable interrupts */
+ consintr = 0;
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ if (!panicstr)
+ logwakeup();
+ consintr = savintr; /* reenable interrupts */
+}
+
+/*
+ * Print a character on console or the user's terminal. If destination is
+ * the console then the last bunch of characters are saved in msgbuf for
+ * inspection later.
+ */
+static void
+putchar(int c, void *arg)
+{
+ struct putchar_arg *ap = (struct putchar_arg*) arg;
+ int flags = ap->flags;
+ struct tty *tp = ap->tty;
+ if (panicstr)
+ constty = NULL;
+ if ((flags & TOCONS) && tp == NULL && constty) {
+ tp = constty;
+ flags |= TOTTY;
+ }
+ if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
+ (flags & TOCONS) && tp == constty)
+ constty = NULL;
+ if ((flags & TOLOG))
+ msglogchar(c, NULL);
+ if ((flags & TOCONS) && constty == NULL && c != '\0')
+ (*v_putc)(c);
+}
+
+/*
+ * Scaled down version of sprintf(3).
+ */
+int
+sprintf(char *buf, const char *cfmt, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, cfmt);
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ va_end(ap);
+ return retval;
+}
+
+/*
+ * Scaled down version of vsprintf(3).
+ */
+int
+vsprintf(char *buf, const char *cfmt, va_list ap)
+{
+ int retval;
+
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ return retval;
+}
+
+/*
+ * Scaled down version of snprintf(3).
+ */
+int
+snprintf(char *str, size_t size, const char *format, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, format);
+ retval = vsnprintf(str, size, format, ap);
+ va_end(ap);
+ return(retval);
+}
+
+/*
+ * Scaled down version of vsnprintf(3).
+ */
+int
+vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, 10, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return retval;
+}
+
+static void
+snprintf_func(int ch, void *arg)
+{
+ struct snprintf_arg *const info = arg;
+
+ if (info->remain >= 2) {
+ *info->str++ = ch;
+ info->remain--;
+ }
+}
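+
+/*
+ * Usage sketch for the bounded variants above (buffer and format are
+ * hypothetical). The last byte is reserved for the terminator, and the
+ * return value is the length the full output would have had, so
+ * truncation can be detected:
+ *
+ *	char name[8];
+ *	int want = snprintf(name, sizeof(name), "unit%d", 12345);
+ *
+ * leaves name = "unit123" and want = 9; want >= sizeof(name) signals
+ * truncation.
+ */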
+
+/*
+ * Put a number (base <= 36) in a buffer in reverse order; return an
+ * optional length and a pointer to the last digit written. The
+ * terminating NUL precedes the digits (the string is built backwards),
+ * so callers print with *p--.
+ */
+static char *
+ksprintn(ul, base, lenp)
+ register u_long ul;
+ register int base, *lenp;
+{ /* A long in base 8, plus NULL. */
+ static char buf[sizeof(long) * NBBY / 3 + 2];
+ register char *p;
+
+ p = buf;
+ do {
+ *++p = hex2ascii(ul % base);
+ } while (ul /= base);
+ if (lenp)
+ *lenp = p - buf;
+ return (p);
+}
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ * printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of the register. Thus:
+ *
+ * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ * reg=3<BITTWO,BITONE>
+ *
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ * ("%*D", len, ptr, " " -> XX XX XX XX ...
+ */
+int
+kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
+{
+#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
+ char *p, *q, *d;
+ u_char *up;
+ int ch, n;
+ u_long ul;
+ int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+ int dwidth;
+ char padc;
+ int retval = 0;
+
+ if (!func)
+ d = (char *) arg;
+ else
+ d = NULL;
+
+ if (fmt == NULL)
+ fmt = "(fmt null)\n";
+
+ if (radix < 2 || radix > 36)
+ radix = 10;
+
+ for (;;) {
+ padc = ' ';
+ width = 0;
+ while ((ch = (u_char)*fmt++) != '%') {
+ if (ch == '\0')
+ return retval;
+ PCHAR(ch);
+ }
+ lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
+ sign = 0; dot = 0; dwidth = 0;
+reswitch: switch (ch = (u_char)*fmt++) {
+ case '.':
+ dot = 1;
+ goto reswitch;
+ case '#':
+ sharpflag = 1;
+ goto reswitch;
+ case '+':
+ sign = 1;
+ goto reswitch;
+ case '-':
+ ladjust = 1;
+ goto reswitch;
+ case '%':
+ PCHAR(ch);
+ break;
+ case '*':
+ if (!dot) {
+ width = va_arg(ap, int);
+ if (width < 0) {
+ ladjust = !ladjust;
+ width = -width;
+ }
+ } else {
+ dwidth = va_arg(ap, int);
+ }
+ goto reswitch;
+ case '0':
+ if (!dot) {
+ padc = '0';
+ goto reswitch;
+ }
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ for (n = 0;; ++fmt) {
+ n = n * 10 + ch - '0';
+ ch = *fmt;
+ if (ch < '0' || ch > '9')
+ break;
+ }
+ if (dot)
+ dwidth = n;
+ else
+ width = n;
+ goto reswitch;
+ case 'b':
+ ul = va_arg(ap, int);
+ p = va_arg(ap, char *);
+ for (q = ksprintn(ul, *p++, NULL); *q;)
+ PCHAR(*q--);
+
+ if (!ul)
+ break;
+
+ for (tmp = 0; *p;) {
+ n = *p++;
+ if (ul & (1 << (n - 1))) {
+ PCHAR(tmp ? ',' : '<');
+ for (; (n = *p) > ' '; ++p)
+ PCHAR(n);
+ tmp = 1;
+ } else
+ for (; *p > ' '; ++p)
+ continue;
+ }
+ if (tmp)
+ PCHAR('>');
+ break;
+ case 'c':
+ PCHAR(va_arg(ap, int));
+ break;
+ case 'D':
+ up = va_arg(ap, u_char *);
+ p = va_arg(ap, char *);
+ if (!width)
+ width = 16;
+ while(width--) {
+ PCHAR(hex2ascii(*up >> 4));
+ PCHAR(hex2ascii(*up & 0x0f));
+ up++;
+ if (width)
+ for (q=p;*q;q++)
+ PCHAR(*q);
+ }
+ break;
+ case 'd':
+ ul = lflag ? va_arg(ap, long) : va_arg(ap, int);
+ sign = 1;
+ base = 10;
+ goto number;
+ case 'l':
+ lflag = 1;
+ goto reswitch;
+ case 'o':
+ ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
+ base = 8;
+ goto nosign;
+ case 'p':
+ ul = (uintptr_t)va_arg(ap, void *);
+ base = 16;
+ sharpflag = (width == 0);
+ goto nosign;
+ case 'n':
+ case 'r':
+ ul = lflag ? va_arg(ap, u_long) :
+ sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int);
+ base = radix;
+ goto number;
+ case 's':
+ p = va_arg(ap, char *);
+ if (p == NULL)
+ p = "(null)";
+ if (!dot)
+ n = strlen (p);
+ else
+ for (n = 0; n < dwidth && p[n]; n++)
+ continue;
+
+ width -= n;
+
+ if (!ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ while (n--)
+ PCHAR(*p++);
+ if (ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ break;
+ case 'u':
+ ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
+ base = 10;
+ goto nosign;
+ case 'x':
+ ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
+ base = 16;
+ goto nosign;
+ case 'z':
+ ul = lflag ? va_arg(ap, u_long) :
+ sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int);
+ base = 16;
+ goto number;
+nosign: sign = 0;
+number: if (sign && (long)ul < 0L) {
+ neg = 1;
+ ul = -(long)ul;
+ }
+ p = ksprintn(ul, base, &tmp);
+ if (sharpflag && ul != 0) {
+ if (base == 8)
+ tmp++;
+ else if (base == 16)
+ tmp += 2;
+ }
+ if (neg)
+ tmp++;
+
+ if (!ladjust && width && (width -= tmp) > 0)
+ while (width--)
+ PCHAR(padc);
+ if (neg)
+ PCHAR('-');
+ if (sharpflag && ul != 0) {
+ if (base == 8) {
+ PCHAR('0');
+ } else if (base == 16) {
+ PCHAR('0');
+ PCHAR('x');
+ }
+ }
+
+ while (*p)
+ PCHAR(*p--);
+
+ if (ladjust && width && (width -= tmp) > 0)
+ while (width--)
+ PCHAR(padc);
+
+ break;
+ default:
+ PCHAR('%');
+ if (lflag)
+ PCHAR('l');
+ PCHAR(ch);
+ break;
+ }
+ }
+#undef PCHAR
+}
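+
+/*
+ * Usage sketch for the %b and %D extensions (register layout, names and
+ * address bytes are hypothetical):
+ *
+ *	printf("status=%b\n", status, "\20\3ERROR\2BUSY\1DONE");
+ *
+ * prints "status=5<ERROR,DONE>" when status == 5 (\20 selects base 16),
+ * and
+ *
+ *	printf("ether=%6D\n", enaddr, ":");
+ *
+ * dumps six bytes at enaddr as "ether=00:a0:c9:12:34:56".
+ */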
+
+/*
+ * Put character in log buffer.
+ */
+static void
+msglogchar(int c, void *dummyarg)
+{
+ struct msgbuf *mbp;
+
+ if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) {
+ mbp = msgbufp;
+ mbp->msg_ptr[mbp->msg_bufx++] = c;
+ if (mbp->msg_bufx >= mbp->msg_size)
+ mbp->msg_bufx = 0;
+ /* If the buffer is full, keep the most recent data. */
+ if (mbp->msg_bufr == mbp->msg_bufx) {
+ if (++mbp->msg_bufr >= mbp->msg_size)
+ mbp->msg_bufr = 0;
+ }
+ }
+}
+
+void
+msgbufinit(void *ptr, size_t size)
+{
+ char *cp;
+
+ cp = (char *)ptr;
+ msgbufp = (struct msgbuf *) (cp + size - sizeof(*msgbufp));
+ if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_ptr != cp) {
+ bzero(cp, size);
+ msgbufp->msg_magic = MSG_MAGIC;
+ msgbufp->msg_size = (char *)msgbufp - cp;
+ msgbufp->msg_ptr = cp;
+ }
+ msgbufmapped = 1;
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
+{
+ int i, j;
+
+ if (!msgbufmapped) {
+ db_printf("msgbuf not mapped yet\n");
+ return;
+ }
+ db_printf("msgbufp = %p\n", msgbufp);
+ db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n",
+ msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr,
+ msgbufp->msg_bufx, msgbufp->msg_ptr);
+ for (i = 0; i < msgbufp->msg_size; i++) {
+ j = (i + msgbufp->msg_bufr) % msgbufp->msg_size;
+ db_printf("%c", msgbufp->msg_ptr[j]);
+ }
+ db_printf("\n");
+}
+
+#endif /* DDB */
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
new file mode 100644
index 0000000..d0ecad7
--- /dev/null
+++ b/sys/kern/subr_prof.c
@@ -0,0 +1,457 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ * $Id: subr_prof.c,v 1.27 1998/07/14 05:09:46 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+#ifdef GPROF
+#include <sys/malloc.h>
+#include <sys/gmon.h>
+
+static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
+
+static void kmstartup __P((void *));
+SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL)
+
+struct gmonparam _gmonparam = { GMON_PROF_OFF };
+
+#ifdef GUPROF
+void
+nullfunc_loop_profiled()
+{
+ int i;
+
+ for (i = 0; i < CALIB_SCALE; i++)
+ nullfunc_profiled();
+}
+
+#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
+
+void
+nullfunc_profiled()
+{
+}
+#endif /* GUPROF */
+
+static void
+kmstartup(dummy)
+ void *dummy;
+{
+ char *cp;
+ struct gmonparam *p = &_gmonparam;
+#ifdef GUPROF
+ int cputime_overhead;
+ int empty_loop_time;
+ int i;
+ int mcount_overhead;
+ int mexitcount_overhead;
+ int nullfunc_loop_overhead;
+ int nullfunc_loop_profiled_time;
+ uintfptr_t tmp_addr;
+#endif
+
+ /*
+ * Round lowpc and highpc to multiples of the density we're using
+ * so the rest of the scaling (here and in gprof) stays in ints.
+ */
+ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->textsize = p->highpc - p->lowpc;
+ printf("Profiling kernel, textsize=%lu [%x..%x]\n",
+ p->textsize, p->lowpc, p->highpc);
+ p->kcountsize = p->textsize / HISTFRACTION;
+ p->hashfraction = HASHFRACTION;
+ p->fromssize = p->textsize / HASHFRACTION;
+ p->tolimit = p->textsize * ARCDENSITY / 100;
+ if (p->tolimit < MINARCS)
+ p->tolimit = MINARCS;
+ else if (p->tolimit > MAXARCS)
+ p->tolimit = MAXARCS;
+ p->tossize = p->tolimit * sizeof(struct tostruct);
+ cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
+ M_GPROF, M_NOWAIT);
+ if (cp == 0) {
+ printf("No memory for profiling.\n");
+ return;
+ }
+ bzero(cp, p->kcountsize + p->tossize + p->fromssize);
+ p->tos = (struct tostruct *)cp;
+ cp += p->tossize;
+ p->kcount = (HISTCOUNTER *)cp;
+ cp += p->kcountsize;
+ p->froms = (u_short *)cp;
+
+#ifdef GUPROF
+ /* Initialize pointers to overhead counters. */
+ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
+ p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
+ p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
+
+ /*
+ * Disable interrupts to avoid interference while we calibrate
+ * things.
+ */
+ disable_intr();
+
+ /*
+ * Determine overheads.
+ * XXX this needs to be repeated for each useful timer/counter.
+ */
+ cputime_overhead = 0;
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ cputime_overhead += cputime();
+
+ empty_loop();
+ startguprof(p);
+ empty_loop();
+ empty_loop_time = cputime();
+
+ nullfunc_loop_profiled();
+
+ /*
+ * Start profiling. There won't be any normal function calls since
+ * interrupts are disabled, but we will call the profiling routines
+ * directly to determine their overheads.
+ */
+ p->state = GMON_PROF_HIRES;
+
+ startguprof(p);
+ nullfunc_loop_profiled();
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(__i386__) && __GNUC__ >= 2
+ __asm("pushl %0; call __mcount; popl %%ecx"
+ :
+ : "i" (profil)
+ : "ax", "bx", "cx", "dx", "memory");
+#else
+#error
+#endif
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, profil));
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(__i386__) && __GNUC__ >= 2
+ __asm("call mexitcount; 1:"
+ : : : "ax", "bx", "cx", "dx", "memory");
+ __asm("movl $1b,%0" : "=rm" (tmp_addr));
+#else
+#error
+#endif
+ mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
+
+ p->state = GMON_PROF_OFF;
+ stopguprof(p);
+
+ enable_intr();
+
+ nullfunc_loop_profiled_time = 0;
+ for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
+ tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
+ tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
+ nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
+#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
+#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
+ printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
+ CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
+ CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
+ CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
+ cputime_overhead -= empty_loop_time;
+ mcount_overhead -= empty_loop_time;
+ mexitcount_overhead -= empty_loop_time;
+
+ /*-
+ * Profiling overheads are determined by the times between the
+ * following events:
+ * MC1: mcount() is called
+ * MC2: cputime() (called from mcount()) latches the timer
+ * MC3: mcount() completes
+ * ME1: mexitcount() is called
+ * ME2: cputime() (called from mexitcount()) latches the timer
+ * ME3: mexitcount() completes.
+ * The times between the events vary slightly depending on instruction
+ * combination and cache misses, etc. Attempt to determine the
+ * minimum times. These can be subtracted from the profiling times
+ * without much risk of reducing the profiling times below what they
+ * would be when profiling is not configured. Abbreviate:
+ * ab = minimum time between MC1 and MC3
+ * a = minimum time between MC1 and MC2
+ * b = minimum time between MC2 and MC3
+ * cd = minimum time between ME1 and ME3
+ * c = minimum time between ME1 and ME2
+ * d = minimum time between ME2 and ME3.
+ * These satisfy the relations:
+ * ab <= mcount_overhead (just measured)
+ * a + b <= ab
+ * cd <= mexitcount_overhead (just measured)
+ * c + d <= cd
+ * a + d <= nullfunc_loop_profiled_time (just measured)
+ * a >= 0, b >= 0, c >= 0, d >= 0.
+ * Assume that ab and cd are equal to the minimums.
+ */
+ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
+ p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
+ p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
+ - cputime_overhead);
+ nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
+ p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
+ - nullfunc_loop_overhead)
+ / 4);
+ p->mexitcount_pre_overhead = p->mexitcount_overhead
+ + p->cputime_overhead
+ - p->mexitcount_post_overhead;
+ p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
+ - p->mexitcount_post_overhead;
+ p->mcount_post_overhead = p->mcount_overhead
+ + p->cputime_overhead
+ - p->mcount_pre_overhead;
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mcount_overhead, p->profrate),
+ c2n(p->mcount_pre_overhead, p->profrate),
+ c2n(p->mcount_post_overhead, p->profrate),
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mexitcount_overhead, p->profrate),
+ c2n(p->mexitcount_pre_overhead, p->profrate),
+ c2n(p->mexitcount_post_overhead, p->profrate));
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
+ p->cputime_overhead, p->mcount_overhead,
+ p->mcount_pre_overhead, p->mcount_post_overhead,
+ p->cputime_overhead, p->mexitcount_overhead,
+ p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
+#endif /* GUPROF */
+}
+
+/*
+ * Return kernel profiling information.
+ */
+static int
+sysctl_kern_prof SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ struct gmonparam *gp = &_gmonparam;
+ int error;
+ int state;
+
+ /* all sysctl names at this level are terminal */
+ if (namelen != 1)
+ return (ENOTDIR); /* overloaded */
+
+ switch (name[0]) {
+ case GPROF_STATE:
+ state = gp->state;
+ error = sysctl_handle_int(oidp, &state, 0, req);
+ if (error)
+ return (error);
+ if (!req->newptr)
+ return (0);
+ if (state == GMON_PROF_OFF) {
+ gp->state = state;
+ stopprofclock(&proc0);
+ stopguprof(gp);
+ } else if (state == GMON_PROF_ON) {
+ gp->state = GMON_PROF_OFF;
+ stopguprof(gp);
+ gp->profrate = profhz;
+ startprofclock(&proc0);
+ gp->state = state;
+#ifdef GUPROF
+ } else if (state == GMON_PROF_HIRES) {
+ gp->state = GMON_PROF_OFF;
+ stopprofclock(&proc0);
+ startguprof(gp);
+ gp->state = state;
+#endif
+ } else if (state != gp->state)
+ return (EINVAL);
+ return (0);
+ case GPROF_COUNT:
+ return (sysctl_handle_opaque(oidp,
+ gp->kcount, gp->kcountsize, req));
+ case GPROF_FROMS:
+ return (sysctl_handle_opaque(oidp,
+ gp->froms, gp->fromssize, req));
+ case GPROF_TOS:
+ return (sysctl_handle_opaque(oidp,
+ gp->tos, gp->tossize, req));
+ case GPROF_GMONPARAM:
+ return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
+ default:
+ return (EOPNOTSUPP);
+ }
+ /* NOTREACHED */
+}
+
+SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
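+
+/*
+ * Userland sketch for driving the node above (roughly what kgmon(8)
+ * does); error handling omitted:
+ *
+ *	int mib[3] = { CTL_KERN, KERN_PROF, GPROF_STATE };
+ *	int state = GMON_PROF_ON;
+ *
+ *	sysctl(mib, 3, NULL, NULL, &state, sizeof(state));
+ */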
+#endif /* GPROF */
+
+/*
+ * Profiling system call.
+ *
+ * The scale factor is a fixed point number with 16 bits of fraction, so that
+ * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct profil_args {
+ caddr_t samples;
+ size_t size;
+ size_t offset;
+ u_int scale;
+};
+#endif
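+
+/*
+ * Userland sketch of the call (buffer and text range hypothetical):
+ * with scale 0x8000 (0.5 in 16.16 fixed point) each 16-bit counter
+ * covers four bytes of text, so a 64KB text segment needs 0x4000
+ * counters; a scale of 0 turns profiling back off:
+ *
+ *	u_short bins[0x4000];
+ *
+ *	profil((caddr_t)bins, sizeof(bins), text_start, 0x8000);
+ */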
+/* ARGSUSED */
+int
+profil(p, uap)
+ struct proc *p;
+ register struct profil_args *uap;
+{
+ register struct uprof *upp;
+ int s;
+
+ if (uap->scale > (1 << 16))
+ return (EINVAL);
+ if (uap->scale == 0) {
+ stopprofclock(p);
+ return (0);
+ }
+ upp = &p->p_stats->p_prof;
+
+ /* Block profile interrupts while changing state. */
+ s = splstatclock();
+ upp->pr_off = uap->offset;
+ upp->pr_scale = uap->scale;
+ upp->pr_base = uap->samples;
+ upp->pr_size = uap->size;
+ startprofclock(p);
+ splx(s);
+
+ return (0);
+}
+
+/*
+ * Scale is a fixed-point number with the binary point 16 bits
+ * into the value, and is <= 1.0. pc is at most 32 bits, so the
+ * intermediate result is at most 48 bits.
+ */
+#define PC_TO_INDEX(pc, prof) \
+ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
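+
+/*
+ * Worked example (values hypothetical): with pr_off = 0x1000 and
+ * pr_scale = 0x8000 (0.5), pc = 0x1234 gives
+ *
+ *	((0x234 * 0x8000) >> 16) & ~1 = 0x11a
+ *
+ * as the even byte offset of the u_short counter to bump, so at this
+ * scale each counter accumulates ticks for four bytes of text.
+ */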
+
+/*
+ * Collect user-level profiling statistics; called on a profiling tick,
+ * when a process is running in user-mode. This routine may be called
+ * from an interrupt context. We try to update the user profiling buffers
+ * cheaply with fuswintr() and suswintr(). If that fails, we revert to
+ * an AST that will vector us to trap() with a context in which copyin
+ * and copyout will work. Trap will then call addupc_task().
+ *
+ * Note that we may (rarely) not get around to the AST soon enough, and
+ * lose profile ticks when the next tick overwrites this one, but in this
+ * case the system is overloaded and the profile is probably already
+ * inaccurate.
+ */
+void
+addupc_intr(p, pc, ticks)
+ register struct proc *p;
+ register u_long pc;
+ u_int ticks;
+{
+ register struct uprof *prof;
+ register caddr_t addr;
+ register u_int i;
+ register int v;
+
+ if (ticks == 0)
+ return;
+ prof = &p->p_stats->p_prof;
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+ return; /* out of range; ignore */
+
+ addr = prof->pr_base + i;
+ if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
+ prof->pr_addr = pc;
+ prof->pr_ticks = ticks;
+ need_proftick(p);
+ }
+}
+
+/*
+ * Much like before, but we can afford to take faults here. If the
+ * update fails, we simply turn off profiling.
+ */
+void
+addupc_task(p, pc, ticks)
+ register struct proc *p;
+ register u_long pc;
+ u_int ticks;
+{
+ register struct uprof *prof;
+ register caddr_t addr;
+ register u_int i;
+ u_short v;
+
+ /* Testing P_PROFIL may be unnecessary, but is certainly safe. */
+ if ((p->p_flag & P_PROFIL) == 0 || ticks == 0)
+ return;
+
+ prof = &p->p_stats->p_prof;
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+ return;
+
+ addr = prof->pr_base + i;
+ if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) {
+ v += ticks;
+ if (copyout((caddr_t)&v, addr, sizeof(v)) == 0)
+ return;
+ }
+ stopprofclock(p);
+}
diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c
index 3adf5a8..80a39cf 100644
--- a/sys/kern/subr_rlist.c
+++ b/sys/kern/subr_rlist.c
@@ -12,25 +12,25 @@
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
- * This software is a component of "386BSD" developed by
- William F. Jolitz, TeleMuse.
+ * This software is a component of "386BSD" developed by
+ * William F. Jolitz, TeleMuse.
* 4. Neither the name of the developer nor the name "386BSD"
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
- * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ
- * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS
- * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT.
- * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT
+ * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ
+ * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS
+ * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT.
+ * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT
* NOT MAKE USE THIS WORK.
*
* FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED
- * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN
- * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES
- * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING
- * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND
- * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE
- * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS
+ * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN
+ * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES
+ * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING
+ * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND
+ * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE
+ * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS
* OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992.
*
* THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND
@@ -46,99 +46,185 @@
* SUCH DAMAGE.
*
*/
-static char rcsid[] = "$Header: /usr/bill/working/sys/kern/RCS/subr_rlist.c,v 1.2 92/01/21 21:29:31 william Exp $";
+/*
+ * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may
+ * be used, modified, copied, distributed, and sold, in both source and
+ * binary form provided that the above copyright and these terms are
+ * retained. Under no circumstances is the author responsible for the proper
+ * functioning of this software, nor does the author assume any responsibility
+ * for damages incurred with its use.
+ *
+ * --------- DEPRECATED ---------
+ *
+ * $Id: subr_rlist.c,v 1.30 1999/01/21 08:29:04 dillon Exp $
+ */
-#include "sys/param.h"
-#include "sys/cdefs.h"
-#include "sys/malloc.h"
-#include "rlist.h"
+#if 0
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/rlist.h>
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
/*
* Resource lists.
*/
-/*
- * Add space to a resource list. Used to either
- * initialize a list or return free space to it.
- */
-rlist_free (rlp, start, end)
-register struct rlist **rlp; unsigned start, end; {
- struct rlist *head;
-
- head = *rlp;
-
-loop:
- /* if nothing here, insert (tail of list) */
- if (*rlp == 0) {
- *rlp = (struct rlist *)malloc(sizeof(**rlp), M_TEMP, M_NOWAIT);
- (*rlp)->rl_start = start;
- (*rlp)->rl_end = end;
- (*rlp)->rl_next = 0;
- return;
- }
+#define RLIST_MIN 128
+static int rlist_count=0;
+static struct rlist *rlfree;
- /* if new region overlaps something currently present, panic */
- if (start >= (*rlp)->rl_start && start <= (*rlp)->rl_end) {
- printf("Frag %d:%d, ent %d:%d ", start, end,
- (*rlp)->rl_start, (*rlp)->rl_end);
- panic("overlapping front rlist_free: freed twice?");
+static struct rlist *rlist_malloc __P((void));
+static __inline void rlist_mfree __P((struct rlist *rl));
+
+static struct rlist *
+rlist_malloc()
+{
+ struct rlist *rl;
+ int i;
+ while( rlist_count < RLIST_MIN) {
+ int s = splhigh();
+ rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE);
+ splx(s);
+ if( !rl)
+ break;
+
+ for(i=0;i<(PAGE_SIZE/(sizeof *rl));i++) {
+ rl->rl_next = rlfree;
+ rlfree = rl;
+ rlist_count++;
+ rl++;
+ }
}
- if (end >= (*rlp)->rl_start && end <= (*rlp)->rl_end) {
- printf("Frag %d:%d, ent %d:%d ", start, end,
- (*rlp)->rl_start, (*rlp)->rl_end);
- panic("overlapping tail rlist_free: freed twice?");
+
+ if( (rl = rlfree) == 0 )
+ panic("Cannot get an rlist entry");
+
+ --rlist_count;
+ rlfree = rl->rl_next;
+ return rl;
+}
+
+static __inline void
+rlist_mfree(rl)
+ struct rlist *rl;
+{
+ rl->rl_next = rlfree;
+ rlfree = rl;
+ ++rlist_count;
+}
+
+void
+rlist_free(rlh, start, end)
+ struct rlisthdr *rlh;
+ u_int start, end;
+{
+ struct rlist **rlp = &rlh->rlh_list;
+ struct rlist *prev_rlp = NULL, *cur_rlp, *next_rlp = NULL;
+ int s;
+
+ s = splhigh();
+ while (rlh->rlh_lock & RLH_LOCKED) {
+ rlh->rlh_lock |= RLH_DESIRED;
+ tsleep(rlh, PSWP, "rlistf", 0);
}
+ rlh->rlh_lock |= RLH_LOCKED;
+ splx(s);
- /* are we adjacent to this element? (in front) */
- if (end+1 == (*rlp)->rl_start) {
- /* coalesce */
- (*rlp)->rl_start = start;
- goto scan;
+ /*
+ * Traverse the list looking for an entry after the one we want
+ * to insert.
+ */
+ cur_rlp = *rlp;
+ while (cur_rlp != NULL) {
+ if (start < cur_rlp->rl_start)
+ break;
+ if (prev_rlp) {
+ KASSERT(prev_rlp->rl_end + 1 != cur_rlp->rl_start,
+ ("rlist_free: missed coalesce opportunity"));
+ KASSERT(prev_rlp->rl_end != cur_rlp->rl_start,
+ ("rlist_free: entries overlap"));
+ KASSERT(prev_rlp->rl_end <= cur_rlp->rl_start,
+ ("entries out of order"));
+ }
+ prev_rlp = cur_rlp;
+ cur_rlp = cur_rlp->rl_next;
}
- /* are we before this element? */
- if (end < (*rlp)->rl_start) {
- register struct rlist *nlp;
+ if (cur_rlp != NULL) {
+
+ if (end >= cur_rlp->rl_start)
+ panic("rlist_free: free end overlaps already freed area");
- nlp = (struct rlist *)malloc(sizeof(*nlp), M_TEMP, M_NOWAIT);
- nlp->rl_start = start;
- nlp->rl_end = end;
- nlp->rl_next = *rlp;
- *rlp = nlp;
- return;
+ if (prev_rlp) {
+ if (start <= prev_rlp->rl_end)
+ panic("rlist_free: free start overlaps already freed area");
+ /*
+ * Attempt to append
+ */
+ if (prev_rlp->rl_end + 1 == start) {
+ prev_rlp->rl_end = end;
+ /*
+ * Attempt to prepend and coalesce
+ */
+ if (end + 1 == cur_rlp->rl_start) {
+ prev_rlp->rl_end = cur_rlp->rl_end;
+ prev_rlp->rl_next = cur_rlp->rl_next;
+ rlist_mfree(cur_rlp);
+ }
+ goto done;
+ }
+ }
+ /*
+ * Attempt to prepend
+ */
+ if (end + 1 == cur_rlp->rl_start) {
+ cur_rlp->rl_start = start;
+ goto done;
+ }
+ }
+ /*
+ * Reached the end of the list without finding a larger entry.
+ * Append to last entry if there is one and it's adjacent.
+ */
+ if (prev_rlp) {
+ if (start <= prev_rlp->rl_end)
+ panic("rlist_free: free start overlaps already freed area at list tail");
+ /*
+ * Attempt to append
+ */
+ if (prev_rlp->rl_end + 1 == start) {
+ prev_rlp->rl_end = end;
+ goto done;
+ }
}
- /* are we adjacent to this element? (at tail) */
- if ((*rlp)->rl_end + 1 == start) {
- /* coalesce */
- (*rlp)->rl_end = end;
- goto scan;
+ /*
+ * Could neither append nor prepend; allocate a new entry.
+ */
+ next_rlp = cur_rlp;
+ cur_rlp = rlist_malloc();
+ cur_rlp->rl_start = start;
+ cur_rlp->rl_end = end;
+ cur_rlp->rl_next = next_rlp;
+ if (prev_rlp) {
+ prev_rlp->rl_next = cur_rlp;
+ } else {
+ /*
+ * No previous - this entry is the new list head.
+ */
+ *rlp = cur_rlp;
}
- /* are we after this element */
- if (start > (*rlp)->rl_end) {
- rlp = &((*rlp)->rl_next);
- goto loop;
- } else
- panic("rlist_free: can't happen");
-
-scan:
- /* can we coalesce list now that we've filled a void? */
- {
- register struct rlist *lp, *lpn;
-
- for (lp = head; lp->rl_next ;) {
- lpn = lp->rl_next;
-
- /* coalesce ? */
- if (lp->rl_end + 1 == lpn->rl_start) {
- lp->rl_end = lpn->rl_end;
- lp->rl_next = lpn->rl_next;
- free(lpn, M_TEMP);
- } else
- lp = lp->rl_next;
- }
+done:
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
}
+ return;
}
/*
@@ -147,10 +233,23 @@ scan:
* return a value of 1 and set resource start location with
* "*loc". (Note: loc can be zero if we don't wish the value)
*/
-int rlist_alloc (rlp, size, loc)
-struct rlist **rlp; unsigned size, *loc; {
+int
+rlist_alloc (rlh, size, loc)
+ struct rlisthdr *rlh;
+ unsigned size, *loc;
+{
+ struct rlist **rlp = &rlh->rlh_list;
register struct rlist *lp;
+ int s;
+ register struct rlist *olp = 0;
+ s = splhigh();
+ while (rlh->rlh_lock & RLH_LOCKED) {
+ rlh->rlh_lock |= RLH_DESIRED;
+ tsleep(rlh, PSWP, "rlistf", 0);
+ }
+ rlh->rlh_lock |= RLH_LOCKED;
+ splx(s);
/* walk list, allocating first thing that's big enough (first fit) */
for (; *rlp; rlp = &((*rlp)->rl_next))
@@ -163,13 +262,33 @@ struct rlist **rlp; unsigned size, *loc; {
/* did we eat this element entirely? */
if ((*rlp)->rl_start > (*rlp)->rl_end) {
lp = (*rlp)->rl_next;
- free (*rlp, M_TEMP);
- *rlp = lp;
+ rlist_mfree(*rlp);
+ /*
+ * if the deleted element was at the front
+ * of the list, adjust *rlp, else don't.
+ */
+ if (olp) {
+ olp->rl_next = lp;
+ } else {
+ *rlp = lp;
+ }
}
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
+ }
return (1);
+ } else {
+ olp = *rlp;
}
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
+ }
/* nothing in list that's big enough */
return (0);
}
@@ -178,14 +297,20 @@ struct rlist **rlp; unsigned size, *loc; {
* Finished with this resource list, reclaim all space and
* mark it as being empty.
*/
-rlist_destroy (rlp)
-struct rlist **rlp; {
+void
+rlist_destroy (rlh)
+ struct rlisthdr *rlh;
+{
+ struct rlist **rlp = &rlh->rlh_list;
struct rlist *lp, *nlp;
lp = *rlp;
*rlp = 0;
for (; lp; lp = nlp) {
nlp = lp->rl_next;
- free (lp, M_TEMP);
+ rlist_mfree(lp);
}
}
+
+#endif
+
diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c
new file mode 100644
index 0000000..e0526bb
--- /dev/null
+++ b/sys/kern/subr_rman.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: subr_rman.c,v 1.3 1998/12/07 21:58:29 archie Exp $
+ */
+
+/*
+ * The kernel resource manager. This code is responsible for keeping track
+ * of hardware resources which are apportioned out to various drivers.
+ * It does not actually assign those resources, and it is not expected
+ * that end-device drivers will call into this code directly. Rather,
+ * the code which implements the buses that those devices are attached to,
+ * and the code which manages CPU resources, will call this code, and the
+ * end-device drivers will make upcalls to that code to actually perform
+ * the allocation.
+ *
+ * There are two sorts of resources managed by this code. The first is
+ * the more familiar array (RMAN_ARRAY) type; resources in this class
+ * consist of a sequence of individually-allocatable objects which have
+ * been numbered in some well-defined order. Most of the resources
+ * are of this type, as it is the most familiar. The second type is
+ * called a gauge (RMAN_GAUGE), and models fungible resources (i.e.,
+ * resources in which each instance is indistinguishable from every
+ * other instance). The principal anticipated application of gauges
+ * is in the context of power consumption, where a bus may have a specific
+ * power budget which all attached devices share. RMAN_GAUGE is not
+ * implemented yet.
+ *
+ * For array resources, we make one simplifying assumption: two clients
+ * sharing the same resource must use the same range of indices. That
+ * is to say, sharing of overlapping-but-not-identical regions is not
+ * permitted.
+ */
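+
+/*
+ * Usage sketch for a bus driver (names and ranges hypothetical):
+ * describe a resource space once at attach time, then carve pieces
+ * out of it for children:
+ *
+ *	static struct rman io_rman;
+ *
+ *	io_rman.rm_type = RMAN_ARRAY;
+ *	io_rman.rm_descr = "I/O ports";
+ *	if (rman_init(&io_rman) || rman_manage_region(&io_rman, 0, 0xffff))
+ *		panic("io_rman");
+ *	r = rman_reserve_resource(&io_rman, 0x300, 0x3ff, 0x20,
+ *	    RF_ACTIVE, dev);
+ */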
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/rman.h>
+#include <sys/bus.h> /* XXX debugging */
+
+MALLOC_DEFINE(M_RMAN, "rman", "Resource manager");
+
+struct rman_head rman_head;
+#ifndef NULL_SIMPLELOCKS
+static struct simplelock rman_lock; /* mutex to protect rman_head */
+#endif
+static int int_rman_activate_resource(struct rman *rm, struct resource *r,
+ struct resource **whohas);
+static int int_rman_release_resource(struct rman *rm, struct resource *r);
+
+#define CIRCLEQ_TERMCOND(var, head) (var == (void *)&(head))
+
+int
+rman_init(struct rman *rm)
+{
+ static int once;
+
+ if (once == 0) {
+ once = 1;
+ TAILQ_INIT(&rman_head);
+ simple_lock_init(&rman_lock);
+ }
+
+ if (rm->rm_type == RMAN_UNINIT)
+ panic("rman_init");
+ if (rm->rm_type == RMAN_GAUGE)
+ panic("implement RMAN_GAUGE");
+
+ CIRCLEQ_INIT(&rm->rm_list);
+ rm->rm_slock = malloc(sizeof *rm->rm_slock, M_RMAN, M_NOWAIT);
+ if (rm->rm_slock == 0)
+ return ENOMEM;
+ simple_lock_init(rm->rm_slock);
+
+ simple_lock(&rman_lock);
+ TAILQ_INSERT_TAIL(&rman_head, rm, rm_link);
+ simple_unlock(&rman_lock);
+ return 0;
+}
+
+/*
+ * NB: this interface is not robust against programming errors which
+ * add multiple copies of the same region.
+ */
+int
+rman_manage_region(struct rman *rm, u_long start, u_long end)
+{
+ struct resource *r, *s;
+
+ r = malloc(sizeof *r, M_RMAN, M_NOWAIT);
+ if (r == 0)
+ return ENOMEM;
+ r->r_sharehead = 0;
+ r->r_start = start;
+ r->r_end = end;
+ r->r_flags = 0;
+ r->r_dev = 0;
+ r->r_rm = rm;
+
+ simple_lock(rm->rm_slock);
+ for (s = rm->rm_list.cqh_first;
+ !CIRCLEQ_TERMCOND(s, rm->rm_list) && s->r_end < r->r_start;
+ s = s->r_link.cqe_next)
+ ;
+
+ if (CIRCLEQ_TERMCOND(s, rm->rm_list)) {
+ CIRCLEQ_INSERT_TAIL(&rm->rm_list, r, r_link);
+ } else {
+ CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, r, r_link);
+ }
+
+ simple_unlock(rm->rm_slock);
+ return 0;
+}
+
+int
+rman_fini(struct rman *rm)
+{
+ struct resource *r;
+
+ simple_lock(rm->rm_slock);
+ for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list);
+ r = r->r_link.cqe_next) {
+ if (r->r_flags & RF_ALLOCATED)
+ return EBUSY;
+ }
+
+ /*
+ * There really should only be one of these if we are in this
+ * state and the code is working properly, but it can't hurt.
+ */
+ for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list);
+ r = rm->rm_list.cqh_first) {
+ CIRCLEQ_REMOVE(&rm->rm_list, r, r_link);
+ free(r, M_RMAN);
+ }
+ simple_unlock(rm->rm_slock);
+ simple_lock(&rman_lock);
+ TAILQ_REMOVE(&rman_head, rm, rm_link);
+ simple_unlock(&rman_lock);
+ free(rm->rm_slock, M_RMAN);
+
+ return 0;
+}
+
+struct resource *
+rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count,
+ u_int flags, struct device *dev)
+{
+ u_int want_activate;
+ struct resource *r, *s, *rv;
+ u_long rstart, rend;
+
+ rv = 0;
+
+#ifdef RMAN_DEBUG
+ printf("rman_reserve_resource: <%s> request: [%#lx, %#lx], length "
+ "%#lx, flags %u, device %s%d\n", rm->rm_descr, start, end,
+ count, flags, device_get_name(dev), device_get_unit(dev));
+#endif /* RMAN_DEBUG */
+ want_activate = (flags & RF_ACTIVE);
+ flags &= ~RF_ACTIVE;
+
+ simple_lock(rm->rm_slock);
+
+ for (r = rm->rm_list.cqh_first;
+ !CIRCLEQ_TERMCOND(r, rm->rm_list) && r->r_end < start;
+ r = r->r_link.cqe_next)
+ ;
+
+ if (CIRCLEQ_TERMCOND(r, rm->rm_list)) {
+#ifdef RMAN_DEBUG
+ printf("could not find a region\n");
+#endif /* RMAN_DEBUG */
+ goto out;
+ }
+
+ /*
+ * First try to find an acceptable totally-unshared region.
+ */
+ for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list);
+ s = s->r_link.cqe_next) {
+#ifdef RMAN_DEBUG
+ printf("considering [%#lx, %#lx]\n", s->r_start, s->r_end);
+#endif /* RMAN_DEBUG */
+ if (s->r_start > end) {
+#ifdef RMAN_DEBUG
+ printf("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end);
+#endif /* RMAN_DEBUG */
+ break;
+ }
+ if (s->r_flags & RF_ALLOCATED) {
+#ifdef RMAN_DEBUG
+ printf("region is allocated\n");
+#endif /* RMAN_DEBUG */
+ continue;
+ }
+ rstart = max(s->r_start, start);
+ rend = min(s->r_end, max(start + count, end));
+#ifdef RMAN_DEBUG
+ printf("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
+ rstart, rend, (rend - rstart + 1), count);
+#endif /* RMAN_DEBUG */
+
+ if ((rend - rstart + 1) >= count) {
+#ifdef RMAN_DEBUG
+ printf("candidate region: [%#lx, %#lx], size %#lx\n",
+ rend, rstart, (rend - rstart + 1));
+#endif /* RMAN_DEBUG */
+ if ((s->r_end - s->r_start + 1) == count) {
+#ifdef RMAN_DEBUG
+ printf("candidate region is entire chunk\n");
+#endif /* RMAN_DEBUG */
+ rv = s;
+ rv->r_flags |= RF_ALLOCATED;
+ rv->r_dev = dev;
+ goto out;
+ }
+
+ /*
+ * If s->r_start < rstart and
+ * s->r_end > rstart + count - 1, then
+ * we need to split the region into three pieces
+ * (the middle one will get returned to the user).
+ * Otherwise, we are allocating at either the
+ * beginning or the end of s, so we only need to
+ * split it in two. The first case requires
+ * two new allocations; the second requires but one.
+ */
+ rv = malloc(sizeof *r, M_RMAN, M_NOWAIT);
+ if (rv == 0)
+ goto out;
+ rv->r_start = rstart;
+ rv->r_end = rstart + count - 1;
+ rv->r_flags = flags | RF_ALLOCATED;
+ rv->r_dev = dev;
+ rv->r_sharehead = 0;
+
+ if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
+#ifdef RMAN_DEBUG
+ printf("splitting region in three parts: "
+ "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
+ s->r_start, rv->r_start - 1,
+ rv->r_start, rv->r_end,
+ rv->r_end + 1, s->r_end);
+#endif /* RMAN_DEBUG */
+ /*
+ * We are allocating in the middle.
+ */
+ r = malloc(sizeof *r, M_RMAN, M_NOWAIT);
+ if (r == 0) {
+ free(rv, M_RMAN);
+ rv = 0;
+ goto out;
+ }
+ r->r_start = rv->r_end + 1;
+ r->r_end = s->r_end;
+ r->r_flags = s->r_flags;
+ r->r_dev = 0;
+ r->r_sharehead = 0;
+ s->r_end = rv->r_start - 1;
+ CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ CIRCLEQ_INSERT_AFTER(&rm->rm_list, rv, r,
+ r_link);
+ } else if (s->r_start == rv->r_start) {
+#ifdef RMAN_DEBUG
+ printf("allocating from the beginning\n");
+#endif /* RMAN_DEBUG */
+ /*
+ * We are allocating at the beginning.
+ */
+ s->r_start = rv->r_end + 1;
+ CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, rv,
+ r_link);
+ } else {
+#ifdef RMAN_DEBUG
+ printf("allocating at the end\n");
+#endif /* RMAN_DEBUG */
+ /*
+ * We are allocating at the end.
+ */
+ s->r_end = rv->r_start - 1;
+ CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ }
+ goto out;
+ }
+ }
+
+ /*
+ * Now find an acceptable shared region, if the client's requirements
+ * allow sharing. By our implementation restriction, a candidate
+ * region must match exactly by both size and sharing type in order
+ * to be considered compatible with the client's request. (The
+ * former restriction could probably be lifted without too much
+ * additional work, but this does not seem warranted.)
+ */
+#ifdef RMAN_DEBUG
+ printf("no unshared regions found\n");
+#endif /* RMAN_DEBUG */
+ if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0)
+ goto out;
+
+ for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list);
+ s = s->r_link.cqe_next) {
+ if (s->r_start > end)
+ break;
+ if ((s->r_flags & flags) != flags)
+ continue;
+ rstart = max(s->r_start, start);
+ rend = min(s->r_end, max(start + count, end));
+ if (s->r_start >= start && s->r_end <= end
+ && (s->r_end - s->r_start + 1) == count) {
+ rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT);
+ if (rv == 0)
+ goto out;
+ rv->r_start = s->r_start;
+ rv->r_end = s->r_end;
+ rv->r_flags = s->r_flags &
+ (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE);
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+ if (s->r_sharehead == 0) {
+ s->r_sharehead = malloc(sizeof *s->r_sharehead,
+ M_RMAN, M_NOWAIT);
+ if (s->r_sharehead == 0) {
+ free(rv, M_RMAN);
+ rv = 0;
+ goto out;
+ }
+ LIST_INIT(s->r_sharehead);
+ LIST_INSERT_HEAD(s->r_sharehead, s,
+ r_sharelink);
+ s->r_flags = RF_FIRSTSHARE;
+ }
+ rv->r_sharehead = s->r_sharehead;
+ LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink);
+ goto out;
+ }
+ }
+
+ /*
+ * We couldn't find anything.
+ */
+out:
+ /*
+ * If the user specified RF_ACTIVE in the initial flags,
+ * which is reflected in `want_activate', we attempt to atomically
+ * activate the resource. If this fails, we release the resource
+ * and indicate overall failure. (This behavior probably doesn't
+ * make sense for RF_TIMESHARE-type resources.)
+ */
+ if (rv && want_activate) {
+ struct resource *whohas;
+ if (int_rman_activate_resource(rm, rv, &whohas)) {
+ int_rman_release_resource(rm, rv);
+ rv = 0;
+ }
+ }
+
+ simple_unlock(rm->rm_slock);
+ return (rv);
+}
+
+static int
+int_rman_activate_resource(struct rman *rm, struct resource *r,
+ struct resource **whohas)
+{
+ struct resource *s;
+ int ok;
+
+ /*
+ * If we are not timesharing, then there is nothing much to do.
+ * If we already have the resource, then there is nothing at all to do.
+ * If we are not on a sharing list with anybody else, then there is
+ * little to do.
+ */
+ if ((r->r_flags & RF_TIMESHARE) == 0
+ || (r->r_flags & RF_ACTIVE) != 0
+ || r->r_sharehead == 0) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+
+ ok = 1;
+ for (s = r->r_sharehead->lh_first; s && ok;
+ s = s->r_sharelink.le_next) {
+ if ((s->r_flags & RF_ACTIVE) != 0) {
+ ok = 0;
+ *whohas = s;
+ }
+ }
+ if (ok) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+ return EBUSY;
+}
+
+int
+rman_activate_resource(struct resource *r)
+{
+ int rv;
+ struct resource *whohas;
+ struct rman *rm;
+
+ rm = r->r_rm;
+ simple_lock(rm->rm_slock);
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ simple_unlock(rm->rm_slock);
+ return rv;
+}
+
+int
+rman_await_resource(struct resource *r, int pri, int timo)
+{
+ int rv, s;
+ struct resource *whohas;
+ struct rman *rm;
+
+ rm = r->r_rm;
+ for (;;) {
+ simple_lock(rm->rm_slock);
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ if (rv != EBUSY)
+ return (rv);
+
+ if (r->r_sharehead == 0)
+ panic("rman_await_resource");
+ /*
+ * splhigh hopefully will prevent a race between
+ * simple_unlock and tsleep where a process
+ * could conceivably get in and release the resource
+ * before we have a chance to sleep on it.
+ */
+ s = splhigh();
+ whohas->r_flags |= RF_WANTED;
+ simple_unlock(rm->rm_slock);
+ rv = tsleep(r->r_sharehead, pri, "rmwait", timo);
+ if (rv) {
+ splx(s);
+ return rv;
+ }
+ simple_lock(rm->rm_slock);
+ splx(s);
+ }
+}
+
+int
+rman_deactivate_resource(struct resource *r)
+{
+ struct rman *rm;
+
+ rm = r->r_rm;
+ simple_lock(rm->rm_slock);
+ r->r_flags &= ~RF_ACTIVE;
+ if (r->r_flags & RF_WANTED) {
+ r->r_flags &= ~RF_WANTED;
+ wakeup(r->r_sharehead);
+ }
+ simple_unlock(rm->rm_slock);
+ return 0;
+}
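+
+/*
+ * Usage sketch for a time-shared resource (priority and error handling
+ * are illustrative): a client sleeps until the current holder is done,
+ * then hands the hardware on to the next waiter:
+ *
+ *	if ((error = rman_await_resource(r, PZERO, 0)) != 0)
+ *		return (error);
+ *	... use the device ...
+ *	rman_deactivate_resource(r);
+ */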
+
+static int
+int_rman_release_resource(struct rman *rm, struct resource *r)
+{
+ struct resource *s, *t;
+
+ if (r->r_flags & RF_ACTIVE)
+ return EBUSY;
+
+ /*
+ * Check for a sharing list first. If there is one, then we don't
+ * have to think as hard.
+ */
+ if (r->r_sharehead) {
+ /*
+ * If a sharing list exists, then we know there are at
+ * least two sharers.
+ *
+ * If we are in the main circleq, appoint someone else.
+ */
+ LIST_REMOVE(r, r_sharelink);
+ s = r->r_sharehead->lh_first;
+ if (r->r_flags & RF_FIRSTSHARE) {
+ s->r_flags |= RF_FIRSTSHARE;
+ CIRCLEQ_INSERT_BEFORE(&rm->rm_list, r, s, r_link);
+ CIRCLEQ_REMOVE(&rm->rm_list, r, r_link);
+ }
+
+ /*
+ * Make sure that the sharing list goes away completely
+ * if the resource is no longer being shared at all.
+ */
+ if (s->r_sharelink.le_next == 0) {
+ free(s->r_sharehead, M_RMAN);
+ s->r_sharehead = 0;
+ s->r_flags &= ~RF_FIRSTSHARE;
+ }
+ goto out;
+ }
+
+ /*
+ * Look at the adjacent resources in the list and see if our
+ * segment can be merged with any of them.
+ */
+ s = r->r_link.cqe_prev;
+ t = r->r_link.cqe_next;
+
+ if (s != (void *)&rm->rm_list && (s->r_flags & RF_ALLOCATED) == 0
+ && t != (void *)&rm->rm_list && (t->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge all three segments.
+ */
+ s->r_end = t->r_end;
+ CIRCLEQ_REMOVE(&rm->rm_list, r, r_link);
+ CIRCLEQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else if (s != (void *)&rm->rm_list
+ && (s->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge previous segment with ours.
+ */
+ s->r_end = r->r_end;
+ CIRCLEQ_REMOVE(&rm->rm_list, r, r_link);
+ } else if (t != (void *)&rm->rm_list
+ && (t->r_flags & RF_ALLOCATED) == 0) {
+ /*
+ * Merge next segment with ours.
+ */
+ t->r_start = r->r_start;
+ CIRCLEQ_REMOVE(&rm->rm_list, r, r_link);
+ } else {
+ /*
+ * At this point, we know there is nothing we
+ * can potentially merge with, because on each
+ * side, there is either nothing there or what is
+ * there is still allocated. In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *r)
+{
+ int rv;
+ struct rman *rm = r->r_rm;
+
+ simple_lock(rm->rm_slock);
+ rv = int_rman_release_resource(rm, r);
+ simple_unlock(rm->rm_slock);
+ return (rv);
+}
diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c
new file mode 100644
index 0000000..24f8846
--- /dev/null
+++ b/sys/kern/subr_scanf.c
@@ -0,0 +1,793 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <machine/limits.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro is used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define BUF 32 /* Maximum length of numeric string. */
+
+/*
+ * Flags used during conversion.
+ */
+#define LONG 0x01 /* l: long or double */
+#define SHORT 0x04 /* h: short */
+#define SUPPRESS 0x08 /* suppress assignment */
+#define POINTER 0x10 /* weird %p pointer (`fake hex') */
+#define NOSKIP 0x20 /* do not skip blanks */
+#define QUAD 0x400
+
+/*
+ * The following are used in numeric conversions only:
+ * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ */
+#define SIGNOK 0x40 /* +/- is (still) legal */
+#define NDIGITS 0x80 /* no digits detected */
+
+#define DPTOK 0x100 /* (float) decimal point is still legal */
+#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */
+
+#define PFXOK 0x100 /* 0x prefix is (still) legal */
+#define NZDIGITS 0x200 /* no zero digits detected */
+
+/*
+ * Conversion types.
+ */
+#define CT_CHAR 0 /* %c conversion */
+#define CT_CCL 1 /* %[...] conversion */
+#define CT_STRING 2 /* %s conversion */
+#define CT_INT 3 /* integer, i.e., strtoq or strtouq */
+typedef u_quad_t (*ccfntype)(const char *, char **, int);
+
+#define isspace(c) ((c) == ' ' || (c) == '\t' || \
+ (c) == '\r' || (c) == '\n')
+#define isascii(c) (((c) & ~0x7f) == 0)
+#define isupper(c) ((c) >= 'A' && (c) <= 'Z')
+#define islower(c) ((c) >= 'a' && (c) <= 'z')
+#define isalpha(c) (isupper(c) || (islower(c)))
+#define isdigit(c) ((c) >= '0' && (c) <= '9')
+
+static u_char *__sccl(char *, u_char *);
+
+int
+sscanf(const char *ibuf, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = vsscanf(ibuf, fmt, ap);
+ va_end(ap);
+ return(ret);
+}
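+
+/*
+ * Usage sketch (format and source string hypothetical): pull a dotted
+ * revision out of a device-supplied identification string:
+ *
+ *	int maj, min;
+ *
+ *	if (sscanf(revstr, "rev %d.%d", &maj, &min) == 2)
+ *		printf("firmware %d.%d\n", maj, min);
+ */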
+
+int
+vsscanf(const char *inp, char const *fmt0, va_list ap)
+{
+ int inr;
+ u_char *fmt = (u_char *)fmt0;
+ int c; /* character from format, or conversion */
+ size_t width; /* field width, or 0 */
+ char *p; /* points into all kinds of strings */
+ int n; /* handy integer */
+ int flags; /* flags as defined above */
+ char *p0; /* saves original value of p when necessary */
+ int nassigned; /* number of fields assigned */
+ int nconversions; /* number of conversions */
+ int nread; /* number of characters consumed from fp */
+ int base; /* base argument to strtoq/strtouq */
+ ccfntype ccfn; /* conversion function (strtoq/strtouq) */
+ char ccltab[256]; /* character class table for %[...] */
+ char buf[BUF]; /* buffer for numeric conversions */
+
+ /* `basefix' is used to avoid `if' tests in the integer scanner */
+ static short basefix[17] =
+ { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+
+ inr = strlen(inp);
+
+ nassigned = 0;
+ nconversions = 0;
+ nread = 0;
+ base = 0; /* XXX just to keep gcc happy */
+ ccfn = NULL; /* XXX just to keep gcc happy */
+ for (;;) {
+ c = *fmt++;
+ if (c == 0)
+ return (nassigned);
+ if (isspace(c)) {
+ while (inr > 0 && isspace(*inp))
+ nread++, inr--, inp++;
+ continue;
+ }
+ if (c != '%')
+ goto literal;
+ width = 0;
+ flags = 0;
+ /*
+ * switch on the format. continue if done;
+ * break once format type is derived.
+ */
+again: c = *fmt++;
+ switch (c) {
+ case '%':
+literal:
+ if (inr <= 0)
+ goto input_failure;
+ if (*inp != c)
+ goto match_failure;
+ inr--, inp++;
+ nread++;
+ continue;
+
+ case '*':
+ flags |= SUPPRESS;
+ goto again;
+ case 'l':
+ flags |= LONG;
+ goto again;
+ case 'q':
+ flags |= QUAD;
+ goto again;
+ case 'h':
+ flags |= SHORT;
+ goto again;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ width = width * 10 + c - '0';
+ goto again;
+
+ /*
+ * Conversions.
+ *
+ */
+ case 'd':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 10;
+ break;
+
+ case 'i':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 0;
+ break;
+
+ case 'o':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 8;
+ break;
+
+ case 'u':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 10;
+ break;
+
+ case 'x':
+ flags |= PFXOK; /* enable 0x prefixing */
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 's':
+ c = CT_STRING;
+ break;
+
+ case '[':
+ fmt = __sccl(ccltab, fmt);
+ flags |= NOSKIP;
+ c = CT_CCL;
+ break;
+
+ case 'c':
+ flags |= NOSKIP;
+ c = CT_CHAR;
+ break;
+
+ case 'p': /* pointer format is like hex */
+ flags |= POINTER | PFXOK;
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 'n':
+ nconversions++;
+ if (flags & SUPPRESS) /* ??? */
+ continue;
+ if (flags & SHORT)
+ *va_arg(ap, short *) = nread;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = nread;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = nread;
+ else
+ *va_arg(ap, int *) = nread;
+ continue;
+ }
+
+ /*
+ * We have a conversion that requires input.
+ */
+ if (inr <= 0)
+ goto input_failure;
+
+ /*
+ * Consume leading white space, except for formats
+ * that suppress this.
+ */
+ if ((flags & NOSKIP) == 0) {
+ while (isspace(*inp)) {
+ nread++;
+ if (--inr > 0)
+ inp++;
+ else
+ goto input_failure;
+ }
+ /*
+ * Note that there is at least one character in
+ * the buffer, so conversions that do not set NOSKIP
+ * can no longer result in an input failure.
+ */
+ }
+
+ /*
+ * Do the conversion.
+ */
+ switch (c) {
+
+ case CT_CHAR:
+ /* scan arbitrary characters (sets NOSKIP) */
+ if (width == 0)
+ width = 1;
+ if (flags & SUPPRESS) {
+ size_t sum = 0;
+ for (;;) {
+ if ((n = inr) < width) {
+ sum += n;
+ width -= n;
+ inp += n;
+ if (sum == 0)
+ goto input_failure;
+ break;
+ } else {
+ sum += width;
+ inr -= width;
+ inp += width;
+ break;
+ }
+ }
+ nread += sum;
+ } else {
+ bcopy(inp, va_arg(ap, char *), width);
+ inr -= width;
+ inp += width;
+ nread += width;
+ nassigned++;
+ }
+ nconversions++;
+ break;
+
+ case CT_CCL:
+ /* scan a (nonempty) character class (sets NOSKIP) */
+ if (width == 0)
+ width = (size_t)~0; /* `infinity' */
+ /* take only those things in the class */
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (ccltab[*inp]) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (n == 0)
+ goto input_failure;
+ break;
+ }
+ }
+ if (n == 0)
+ goto match_failure;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (ccltab[*inp]) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (p == p0)
+ goto input_failure;
+ break;
+ }
+ }
+ n = p - p0;
+ if (n == 0)
+ goto match_failure;
+ *p = 0;
+ nassigned++;
+ }
+ nread += n;
+ nconversions++;
+ break;
+
+ case CT_STRING:
+ /* like CCL, but zero-length string OK, & no NOSKIP */
+ if (width == 0)
+ width = (size_t)~0;
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (!isspace(*inp)) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ nread += n;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (!isspace(*inp)) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ *p = 0;
+ nread += p - p0;
+ nassigned++;
+ }
+ nconversions++;
+ continue;
+
+ case CT_INT:
+ /* scan an integer as if by strtoq/strtouq */
+#ifdef hardway
+ if (width == 0 || width > sizeof(buf) - 1)
+ width = sizeof(buf) - 1;
+#else
+ /* size_t is unsigned, hence this optimisation */
+ if (--width > sizeof(buf) - 2)
+ width = sizeof(buf) - 2;
+ width++;
+#endif
+ flags |= SIGNOK | NDIGITS | NZDIGITS;
+ for (p = buf; width; width--) {
+ c = *inp;
+ /*
+ * Switch on the character; `goto ok'
+ * if we accept it as a part of number.
+ */
+ switch (c) {
+
+ /*
+ * The digit 0 is always legal, but is
+ * special. For %i conversions, if no
+ * digits (zero or nonzero) have been
+ * scanned (only signs), we will have
+ * base==0. In that case, we should set
+ * it to 8 and enable 0x prefixing.
+ * Also, if we have not scanned zero digits
+ * before this, do not turn off prefixing
+ * (someone else will turn it off if we
+ * have scanned any nonzero digits).
+ */
+ case '0':
+ if (base == 0) {
+ base = 8;
+ flags |= PFXOK;
+ }
+ if (flags & NZDIGITS)
+ flags &= ~(SIGNOK|NZDIGITS|NDIGITS);
+ else
+ flags &= ~(SIGNOK|PFXOK|NDIGITS);
+ goto ok;
+
+ /* 1 through 7 always legal */
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ base = basefix[base];
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* digits 8 and 9 ok iff decimal or hex */
+ case '8': case '9':
+ base = basefix[base];
+ if (base <= 8)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* letters ok iff hex */
+ case 'A': case 'B': case 'C':
+ case 'D': case 'E': case 'F':
+ case 'a': case 'b': case 'c':
+ case 'd': case 'e': case 'f':
+ /* no need to fix base here */
+ if (base <= 10)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* sign ok only as first character */
+ case '+': case '-':
+ if (flags & SIGNOK) {
+ flags &= ~SIGNOK;
+ goto ok;
+ }
+ break;
+
+ /* x ok iff flag still set & 2nd char */
+ case 'x': case 'X':
+ if (flags & PFXOK && p == buf + 1) {
+ base = 16; /* if %i */
+ flags &= ~PFXOK;
+ goto ok;
+ }
+ break;
+ }
+
+ /*
+ * If we got here, c is not a legal character
+ * for a number. Stop accumulating digits.
+ */
+ break;
+ ok:
+ /*
+ * c is legal: store it and look at the next.
+ */
+ *p++ = c;
+ if (--inr > 0)
+ inp++;
+ else
+ break; /* end of input */
+ }
+ /*
+ * If we had only a sign, it is no good; push
+ * back the sign. If the number ends in `x',
+ * it was [sign] '0' 'x', so push back the x
+ * and treat it as [sign] '0'.
+ */
+ if (flags & NDIGITS) {
+ if (p > buf) {
+ inp--;
+ inr++;
+ }
+ goto match_failure;
+ }
+ c = ((u_char *)p)[-1];
+ if (c == 'x' || c == 'X') {
+ --p;
+ inp--;
+ inr++;
+ }
+ if ((flags & SUPPRESS) == 0) {
+ u_quad_t res;
+
+ *p = 0;
+ res = (*ccfn)(buf, (char **)NULL, base);
+ if (flags & POINTER)
+ *va_arg(ap, void **) =
+ (void *)(u_long)res;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = res;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = res;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = res;
+ else
+ *va_arg(ap, int *) = res;
+ nassigned++;
+ }
+ nread += p - buf;
+ nconversions++;
+ break;
+
+ }
+ }
+input_failure:
+ return (nconversions != 0 ? nassigned : -1);
+match_failure:
+ return (nassigned);
+}
+
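+/*
+ * Illustrative sketch of the CT_INT behaviour above (assuming the sscanf()
+ * wrapper defined earlier in this file): a %i conversion infers its base
+ * from the prefix just like strtoq(), and a bare "0x" has the 'x' pushed
+ * back and scans as the single digit '0':
+ *
+ *	int v;
+ *
+ *	sscanf("0x1f", "%i", &v);	v == 31 (hex, 0x prefix)
+ *	sscanf("017", "%i", &v);	v == 15 (octal, leading 0)
+ *	sscanf("17", "%i", &v);		v == 17 (decimal)
+ *	sscanf("0x", "%i", &v);		v == 0  ('x' pushed back)
+ */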
+/*
+ * Fill in the given table from the scanset at the given format
+ * (just after `['). Return a pointer to the character past the
+ * closing `]'. The table has a 1 wherever characters should be
+ * considered part of the scanset.
+ */
+static u_char *
+__sccl(char *tab, u_char *fmt)
+{
+ int c, n, v;
+
+ /* first `clear' the whole table */
+ c = *fmt++; /* first char hat => negated scanset */
+ if (c == '^') {
+ v = 1; /* default => accept */
+ c = *fmt++; /* get new first char */
+ } else
+ v = 0; /* default => reject */
+
+ /* XXX: Will not work if sizeof(tab*) > sizeof(char) */
+ for (n = 0; n < 256; n++)
+ tab[n] = v; /* memset(tab, v, 256) */
+
+ if (c == 0)
+ return (fmt - 1);/* format ended before closing ] */
+
+ /*
+ * Now set the entries corresponding to the actual scanset
+ * to the opposite of the above.
+ *
+ * The first character may be ']' (or '-') without being special;
+ * the last character may be '-'.
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+		 * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
+
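+/*
+ * Illustrative sketch of what __sccl() produces (not part of the scanner
+ * itself): given the tail of a "%[a-c]" directive, the table gets a 1 for
+ * 'a', 'b' and 'c' and a 0 everywhere else, and the returned pointer
+ * addresses the character after the ']':
+ *
+ *	char tab[256];
+ *	u_char *rest = __sccl(tab, (u_char *)"a-c]x");
+ *
+ *	tab['a'] == tab['b'] == tab['c'] == 1, all other entries 0,
+ *	and *rest == 'x'.  A leading '^' inverts every entry.
+ */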
+/*
+ * Convert a string to an unsigned quad integer.
+ *
+ * Ignores `locale' stuff. Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ */
+u_quad_t
+strtouq(const char *nptr, char **endptr, int base)
+{
+ const char *s = nptr;
+ u_quad_t acc;
+ unsigned char c;
+ u_quad_t qbase, cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * See strtoq for comments as to the logic used.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace(c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+ qbase = (unsigned)base;
+ cutoff = (u_quad_t)UQUAD_MAX / qbase;
+ cutlim = (u_quad_t)UQUAD_MAX % qbase;
+ for (acc = 0, any = 0;; c = *s++) {
+ if (!isascii(c))
+ break;
+ if (isdigit(c))
+ c -= '0';
+ else if (isalpha(c))
+ c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= qbase;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = UQUAD_MAX;
+ } else if (neg)
+ acc = -acc;
+ if (endptr != 0)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
+
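+/*
+ * Illustrative sketch (hypothetical caller): on return, *endptr addresses
+ * the first unconverted character, or nptr itself if no digits were
+ * consumed, so "no number" and "0" remain distinguishable:
+ *
+ *	char *ep;
+ *	u_quad_t v = strtouq("0x1fzz", &ep, 0);
+ *
+ *	v == 0x1f and ep points at the first 'z', while
+ *	strtouq("zz", &ep, 0) returns 0 with ep == the passed string.
+ */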
+/*
+ * Convert a string to a quad integer.
+ *
+ * Ignores `locale' stuff. Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ */
+quad_t
+strtoq(const char *nptr, char **endptr, int base)
+{
+ const char *s;
+ u_quad_t acc;
+ unsigned char c;
+ u_quad_t qbase, cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * Skip white space and pick up leading +/- sign if any.
+ * If base is 0, allow 0x for hex and 0 for octal, else
+ * assume decimal; if base is already 16, allow 0x.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace(c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+
+ /*
+ * Compute the cutoff value between legal numbers and illegal
+ * numbers. That is the largest legal value, divided by the
+ * base. An input number that is greater than this value, if
+ * followed by a legal input character, is too big. One that
+ * is equal to this value may be valid or not; the limit
+ * between valid and invalid numbers is then based on the last
+ * digit. For instance, if the range for quads is
+ * [-9223372036854775808..9223372036854775807] and the input base
+ * is 10, cutoff will be set to 922337203685477580 and cutlim to
+ * either 7 (neg==0) or 8 (neg==1), meaning that if we have
+ * accumulated a value > 922337203685477580, or equal but the
+ * next digit is > 7 (or 8), the number is too big, and we will
+ * return a range error.
+ *
+ * Set any if any `digits' consumed; make it negative to indicate
+ * overflow.
+ */
+ qbase = (unsigned)base;
+ cutoff = neg ? (u_quad_t)-(QUAD_MIN + QUAD_MAX) + QUAD_MAX : QUAD_MAX;
+ cutlim = cutoff % qbase;
+ cutoff /= qbase;
+ for (acc = 0, any = 0;; c = *s++) {
+ if (!isascii(c))
+ break;
+ if (isdigit(c))
+ c -= '0';
+ else if (isalpha(c))
+ c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= qbase;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = neg ? QUAD_MIN : QUAD_MAX;
+ } else if (neg)
+ acc = -acc;
+ if (endptr != 0)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
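+
+/*
+ * Illustrative sketch: because cutoff is widened for negative input, the
+ * most negative quad parses exactly instead of clamping:
+ *
+ *	strtoq("-9223372036854775808", NULL, 10) == QUAD_MIN
+ *	strtoq("9223372036854775808", NULL, 10) == QUAD_MAX (overflow)
+ */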
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..569f04b
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,2663 @@
+/*
+ * Copyright (c) 1996, by Steve Passe
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: mp_machdep.c,v 1.87 1999/01/12 00:19:31 eivind Exp $
+ */
+
+#include "opt_smp.h"
+#include "opt_vm86.h"
+#include "opt_cpu.h"
+#include "opt_user_ldt.h"
+
+#ifdef SMP
+#include <machine/smptests.h>
+#else
+#error
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#ifdef BETTER_CLOCK
+#include <sys/dkstat.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#ifdef BETTER_CLOCK
+#include <sys/lock.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+#endif
+
+#include <machine/smp.h>
+#include <machine/apic.h>
+#include <machine/mpapic.h>
+#include <machine/segments.h>
+#include <machine/smptests.h> /** TEST_DEFAULT_CONFIG, TEST_TEST1 */
+#include <machine/tss.h>
+#include <machine/specialreg.h>
+#include <machine/cputypes.h>
+#include <machine/globaldata.h>
+
+#include <i386/i386/cons.h> /* cngetc() */
+
+#if defined(APIC_IO)
+#include <machine/md_var.h> /* setidt() */
+#include <i386/isa/icu.h> /* IPIs */
+#include <i386/isa/intr_machdep.h> /* IPIs */
+#endif /* APIC_IO */
+
+#if defined(TEST_DEFAULT_CONFIG)
+#define MPFPS_MPFB1 TEST_DEFAULT_CONFIG
+#else
+#define MPFPS_MPFB1 mpfps->mpfb1
+#endif /* TEST_DEFAULT_CONFIG */
+
+#define WARMBOOT_TARGET 0
+#define WARMBOOT_OFF (KERNBASE + 0x0467)
+#define WARMBOOT_SEG (KERNBASE + 0x0469)
+
+#ifdef PC98
+#define BIOS_BASE (0xe8000)
+#define BIOS_SIZE (0x18000)
+#else
+#define BIOS_BASE (0xf0000)
+#define BIOS_SIZE (0x10000)
+#endif
+#define BIOS_COUNT (BIOS_SIZE/4)
+
+#define CMOS_REG (0x70)
+#define CMOS_DATA (0x71)
+#define BIOS_RESET (0x0f)
+#define BIOS_WARM (0x0a)
+
+#define PROCENTRY_FLAG_EN 0x01
+#define PROCENTRY_FLAG_BP 0x02
+#define IOAPICENTRY_FLAG_EN 0x01
+
+
+/* MP Floating Pointer Structure */
+typedef struct MPFPS {
+ char signature[4];
+ void *pap;
+ u_char length;
+ u_char spec_rev;
+ u_char checksum;
+ u_char mpfb1;
+ u_char mpfb2;
+ u_char mpfb3;
+ u_char mpfb4;
+ u_char mpfb5;
+} *mpfps_t;
+
+/* MP Configuration Table Header */
+typedef struct MPCTH {
+ char signature[4];
+ u_short base_table_length;
+ u_char spec_rev;
+ u_char checksum;
+ u_char oem_id[8];
+ u_char product_id[12];
+ void *oem_table_pointer;
+ u_short oem_table_size;
+ u_short entry_count;
+ void *apic_address;
+ u_short extended_table_length;
+ u_char extended_table_checksum;
+ u_char reserved;
+} *mpcth_t;
+
+
+typedef struct PROCENTRY {
+ u_char type;
+ u_char apic_id;
+ u_char apic_version;
+ u_char cpu_flags;
+ u_long cpu_signature;
+ u_long feature_flags;
+ u_long reserved1;
+ u_long reserved2;
+} *proc_entry_ptr;
+
+typedef struct BUSENTRY {
+ u_char type;
+ u_char bus_id;
+ char bus_type[6];
+} *bus_entry_ptr;
+
+typedef struct IOAPICENTRY {
+ u_char type;
+ u_char apic_id;
+ u_char apic_version;
+ u_char apic_flags;
+ void *apic_address;
+} *io_apic_entry_ptr;
+
+typedef struct INTENTRY {
+ u_char type;
+ u_char int_type;
+ u_short int_flags;
+ u_char src_bus_id;
+ u_char src_bus_irq;
+ u_char dst_apic_id;
+ u_char dst_apic_int;
+} *int_entry_ptr;
+
+/* descriptions of MP basetable entries */
+typedef struct BASETABLE_ENTRY {
+ u_char type;
+ u_char length;
+ char name[16];
+} basetable_entry;
+
+/*
+ * This code MUST be enabled here and in mpboot.s.
+ * It follows the very early stages of AP boot by placing values in CMOS RAM.
+ * It is NORMALLY never needed, hence the primitive method of enabling it.
+ *
+#define CHECK_POINTS
+ */
+
+#if defined(CHECK_POINTS) && !defined(PC98)
+#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA))
+#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
+
+#define CHECK_INIT(D); \
+ CHECK_WRITE(0x34, (D)); \
+ CHECK_WRITE(0x35, (D)); \
+ CHECK_WRITE(0x36, (D)); \
+ CHECK_WRITE(0x37, (D)); \
+ CHECK_WRITE(0x38, (D)); \
+ CHECK_WRITE(0x39, (D));
+
+#define CHECK_PRINT(S); \
+ printf("%s: %d, %d, %d, %d, %d, %d\n", \
+ (S), \
+ CHECK_READ(0x34), \
+ CHECK_READ(0x35), \
+ CHECK_READ(0x36), \
+ CHECK_READ(0x37), \
+ CHECK_READ(0x38), \
+ CHECK_READ(0x39));
+
+#else /* CHECK_POINTS */
+
+#define CHECK_INIT(D)
+#define CHECK_PRINT(S)
+
+#endif /* CHECK_POINTS */
+
+/*
+ * Values to send to the POST hardware.
+ */
+#define MP_BOOTADDRESS_POST 0x10
+#define MP_PROBE_POST 0x11
+#define MPTABLE_PASS1_POST 0x12
+
+#define MP_START_POST 0x13
+#define MP_ENABLE_POST 0x14
+#define MPTABLE_PASS2_POST 0x15
+
+#define START_ALL_APS_POST 0x16
+#define INSTALL_AP_TRAMP_POST 0x17
+#define START_AP_POST 0x18
+
+#define MP_ANNOUNCE_POST 0x19
+
+
+/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
+int current_postcode;
+
+/** XXX FIXME: what system files declare these??? */
+extern struct region_descriptor r_gdt, r_idt;
+
+int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
+int mp_ncpus; /* # of CPUs, including BSP */
+int	mp_naps;		/* # of Application Processors */
+int mp_nbusses; /* # of busses */
+int mp_napics; /* # of IO APICs */
+int boot_cpu_id; /* designated BSP */
+vm_offset_t cpu_apic_address;
+vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */
+extern int nkpt;
+
+u_int32_t cpu_apic_versions[NCPU];
+u_int32_t io_apic_versions[NAPIC];
+
+#ifdef APIC_INTR_DIAGNOSTIC
+int apic_itrace_enter[32];
+int apic_itrace_tryisrlock[32];
+int apic_itrace_gotisrlock[32];
+int apic_itrace_active[32];
+int apic_itrace_masked[32];
+int apic_itrace_noisrlock[32];
+int apic_itrace_masked2[32];
+int apic_itrace_unmask[32];
+int apic_itrace_noforward[32];
+int apic_itrace_leave[32];
+int apic_itrace_enter2[32];
+int apic_itrace_doreti[32];
+int apic_itrace_splz[32];
+int apic_itrace_eoi[32];
+#ifdef APIC_INTR_DIAGNOSTIC_IRQ
+unsigned short apic_itrace_debugbuffer[32768];
+int apic_itrace_debugbuffer_idx;
+struct simplelock apic_itrace_debuglock;
+#endif
+#endif
+
+#ifdef APIC_INTR_REORDER
+struct {
+ volatile int *location;
+ int bit;
+} apic_isrbit_location[32];
+#endif
+
+struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE];
+
+/*
+ * APIC ID logical/physical mapping structures.
+ * We oversize these to simplify boot-time config.
+ */
+int cpu_num_to_apic_id[NAPICID];
+int io_num_to_apic_id[NAPICID];
+int apic_id_to_logical[NAPICID];
+
+
+/* Bitmap of all available CPUs */
+u_int all_cpus;
+
+/* AP uses this PTD during bootstrap. Do not staticize. */
+pd_entry_t *bootPTD;
+
+/* Hotwire a 0->4MB V==P mapping */
+extern pt_entry_t *KPTphys;
+
+/* Virtual address of per-cpu common_tss */
+extern struct i386tss common_tss;
+#ifdef VM86
+extern struct segment_descriptor common_tssd;
+extern u_int private_tss; /* flag indicating private tss */
+extern u_int my_tr;
+#endif /* VM86 */
+
+/* IdlePTD per cpu */
+pd_entry_t *IdlePTDS[NCPU];
+
+/* "my" private page table page, for BSP init */
+extern pt_entry_t SMP_prvpt[];
+
+/* Private page pointer to curcpu's PTD, used during BSP init */
+extern pd_entry_t *my_idlePTD;
+
+struct pcb stoppcbs[NCPU];
+
+int smp_started; /* has the system started? */
+
+/*
+ * Local data and functions.
+ */
+
+static int mp_capable;
+static u_int boot_address;
+static u_int base_memory;
+
+static int picmode; /* 0: virtual wire mode, 1: PIC mode */
+static mpfps_t mpfps;
+static int search_for_sig(u_int32_t target, int count);
+static void mp_enable(u_int boot_addr);
+
+static int mptable_pass1(void);
+static int mptable_pass2(void);
+static void default_mp_table(int type);
+static void fix_mp_table(void);
+static void setup_apic_irq_mapping(void);
+static void init_locks(void);
+static int start_all_aps(u_int boot_addr);
+static void install_ap_tramp(u_int boot_addr);
+static int start_ap(int logicalCpu, u_int boot_addr);
+
+/*
+ * Calculate usable address in base memory for AP trampoline code.
+ */
+u_int
+mp_bootaddress(u_int basemem)
+{
+ POSTCODE(MP_BOOTADDRESS_POST);
+
+ base_memory = basemem * 1024; /* convert to bytes */
+
+ boot_address = base_memory & ~0xfff; /* round down to 4k boundary */
+ if ((base_memory - boot_address) < bootMP_size)
+ boot_address -= 4096; /* not enough, lower by 4k */
+
+ return boot_address;
+}
+
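+/*
+ * Illustrative sketch of the arithmetic above: with 639K of base memory
+ * and a trampoline larger than the sub-page remainder, the boot address
+ * drops one more page:
+ *
+ *	base_memory  = 639 * 1024       = 0x9fc00
+ *	boot_address = 0x9fc00 & ~0xfff = 0x9f000
+ *	remainder    = 0xc00 (3K), so if bootMP_size > 3K the
+ *	address becomes 0x9e000.
+ */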
+
+/*
+ * Look for an Intel MP spec table (ie, SMP capable hardware).
+ */
+int
+mp_probe(void)
+{
+ int x;
+ u_long segment;
+ u_int32_t target;
+
+ POSTCODE(MP_PROBE_POST);
+
+ /* see if EBDA exists */
+	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
+ /* search first 1K of EBDA */
+ target = (u_int32_t) (segment << 4);
+ if ((x = search_for_sig(target, 1024 / 4)) >= 0)
+ goto found;
+ } else {
+ /* last 1K of base memory, effective 'top of base' passed in */
+ target = (u_int32_t) (base_memory - 0x400);
+ if ((x = search_for_sig(target, 1024 / 4)) >= 0)
+ goto found;
+ }
+
+ /* search the BIOS */
+ target = (u_int32_t) BIOS_BASE;
+ if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
+ goto found;
+
+ /* nothing found */
+ mpfps = (mpfps_t)0;
+ mp_capable = 0;
+ return 0;
+
+found:
+ /* calculate needed resources */
+ mpfps = (mpfps_t)x;
+ if (mptable_pass1())
+ panic("you must reconfigure your kernel");
+
+ /* flag fact that we are running multiple processors */
+ mp_capable = 1;
+ return 1;
+}
+
+
+/*
+ * Startup the SMP processors.
+ */
+void
+mp_start(void)
+{
+ POSTCODE(MP_START_POST);
+
+ /* look for MP capable motherboard */
+ if (mp_capable)
+ mp_enable(boot_address);
+ else
+ panic("MP hardware not found!");
+}
+
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+mp_announce(void)
+{
+ int x;
+
+ POSTCODE(MP_ANNOUNCE_POST);
+
+ printf("FreeBSD/SMP: Multiprocessor motherboard\n");
+ printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
+ printf(", version: 0x%08x", cpu_apic_versions[0]);
+ printf(", at 0x%08x\n", cpu_apic_address);
+ for (x = 1; x <= mp_naps; ++x) {
+ printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
+ printf(", version: 0x%08x", cpu_apic_versions[x]);
+ printf(", at 0x%08x\n", cpu_apic_address);
+ }
+
+#if defined(APIC_IO)
+ for (x = 0; x < mp_napics; ++x) {
+ printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
+ printf(", version: 0x%08x", io_apic_versions[x]);
+ printf(", at 0x%08x\n", io_apic_address[x]);
+ }
+#else
+ printf(" Warning: APIC I/O disabled\n");
+#endif /* APIC_IO */
+}
+
+/*
+ * AP CPUs call this to sync up protected mode.
+ */
+void
+init_secondary(void)
+{
+ int gsel_tss;
+#ifndef VM86
+ u_int my_tr;
+#endif
+
+ r_gdt.rd_limit = sizeof(gdt[0]) * (NGDT + NCPU) - 1;
+ r_gdt.rd_base = (int) gdt;
+ lgdt(&r_gdt); /* does magic intra-segment return */
+ lidt(&r_idt);
+ lldt(_default_ldt);
+#ifdef USER_LDT
+ currentldt = _default_ldt;
+#endif
+
+ my_tr = NGDT + cpuid;
+ gsel_tss = GSEL(my_tr, SEL_KPL);
+ gdt[my_tr].sd.sd_type = SDT_SYS386TSS;
+ common_tss.tss_esp0 = 0; /* not used until after switch */
+ common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
+ common_tss.tss_ioopt = (sizeof common_tss) << 16;
+#ifdef VM86
+ common_tssd = gdt[my_tr].sd;
+ private_tss = 0;
+#endif /* VM86 */
+ ltr(gsel_tss);
+
+ load_cr0(0x8005003b); /* XXX! */
+
+ PTD[0] = 0;
+ pmap_set_opt((unsigned *)PTD);
+
+ putmtrr();
+ pmap_setvidram();
+
+ invltlb();
+}
+
+
+#if defined(APIC_IO)
+/*
+ * Final configuration of the BSP's local APIC:
+ * - disable 'pic mode'.
+ * - disable 'virtual wire mode'.
+ * - enable NMI.
+ */
+void
+bsp_apic_configure(void)
+{
+ u_char byte;
+ u_int32_t temp;
+
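+	/*
+	 * Note (per MP spec section 3.6.2.1): the IMCR lives behind ports
+	 * 0x22/0x23; writing 0x70 to port 0x22 selects it, and setting bit 0
+	 * of the value at port 0x23 reroutes INTR/NMI from the BSP's pins to
+	 * the APIC, taking the system out of 'PIC mode'.
+	 */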
+ /* leave 'pic mode' if necessary */
+ if (picmode) {
+ outb(0x22, 0x70); /* select IMCR */
+ byte = inb(0x23); /* current contents */
+ byte |= 0x01; /* mask external INTR */
+ outb(0x23, byte); /* disconnect 8259s/NMI */
+ }
+
+ /* mask lint0 (the 8259 'virtual wire' connection) */
+ temp = lapic.lvt_lint0;
+ temp |= APIC_LVT_M; /* set the mask */
+ lapic.lvt_lint0 = temp;
+
+ /* setup lint1 to handle NMI */
+ temp = lapic.lvt_lint1;
+ temp &= ~APIC_LVT_M; /* clear the mask */
+ lapic.lvt_lint1 = temp;
+
+ if (bootverbose)
+ apic_dump("bsp_apic_configure()");
+}
+#endif /* APIC_IO */
+
+
+/*******************************************************************
+ * local functions and data
+ */
+
+/*
+ * start the SMP system
+ */
+static void
+mp_enable(u_int boot_addr)
+{
+ int x;
+#if defined(APIC_IO)
+ int apic;
+ u_int ux;
+#endif /* APIC_IO */
+
+ getmtrr();
+ pmap_setvidram();
+
+ POSTCODE(MP_ENABLE_POST);
+
+ /* turn on 4MB of V == P addressing so we can get to MP table */
+ *(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
+ invltlb();
+
+ /* examine the MP table for needed info, uses physical addresses */
+ x = mptable_pass2();
+
+ *(int *)PTD = 0;
+ invltlb();
+
+ /* can't process default configs till the CPU APIC is pmapped */
+ if (x)
+ default_mp_table(x);
+
+ /* post scan cleanup */
+ fix_mp_table();
+ setup_apic_irq_mapping();
+
+#if defined(APIC_IO)
+
+ /* fill the LOGICAL io_apic_versions table */
+ for (apic = 0; apic < mp_napics; ++apic) {
+ ux = io_apic_read(apic, IOAPIC_VER);
+ io_apic_versions[apic] = ux;
+ }
+
+ /* program each IO APIC in the system */
+ for (apic = 0; apic < mp_napics; ++apic)
+ if (io_apic_setup(apic) < 0)
+ panic("IO APIC setup failure");
+
+ /* install a 'Spurious INTerrupt' vector */
+ setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
+ /* install an inter-CPU IPI for TLB invalidation */
+ setidt(XINVLTLB_OFFSET, Xinvltlb,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
+#ifdef BETTER_CLOCK
+ /* install an inter-CPU IPI for reading processor state */
+ setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif
+
+ /* install an inter-CPU IPI for forcing an additional software trap */
+ setidt(XCPUAST_OFFSET, Xcpuast,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
+ /* install an inter-CPU IPI for interrupt forwarding */
+ setidt(XFORWARD_IRQ_OFFSET, Xforward_irq,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
+ /* install an inter-CPU IPI for CPU stop/restart */
+ setidt(XCPUSTOP_OFFSET, Xcpustop,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
+#if defined(TEST_TEST1)
+ /* install a "fake hardware INTerrupt" vector */
+ setidt(XTEST1_OFFSET, Xtest1,
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif /** TEST_TEST1 */
+
+#endif /* APIC_IO */
+
+ /* initialize all SMP locks */
+ init_locks();
+
+ /* start each Application Processor */
+ start_all_aps(boot_addr);
+
+ /*
+ * The init process might be started on a different CPU now,
+ * and the boot CPU might not call prepare_usermode to get
+ * cr0 correctly configured. Thus we initialize cr0 here.
+ */
+ load_cr0(rcr0() | CR0_WP | CR0_AM);
+}
+
+
+/*
+ * look for the MP spec signature
+ */
+
+/* string defined by the Intel MP Spec as identifying the MP table */
+#define MP_SIG 0x5f504d5f /* _MP_ */
+#define NEXT(X) ((X) += 4)
+static int
+search_for_sig(u_int32_t target, int count)
+{
+ int x;
+ u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
+
+ for (x = 0; x < count; NEXT(x))
+ if (addr[x] == MP_SIG)
+ /* make array index a byte index */
+ return (target + (x * sizeof(u_int32_t)));
+
+ return -1;
+}
+
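+/*
+ * Illustrative note: MP_SIG is the four bytes '_', 'M', 'P', '_' read as
+ * one little-endian 32-bit word, so comparing a u_int32_t per 4-byte slot
+ * is equivalent to strncmp(p, "_MP_", 4) at each slot:
+ *
+ *	'_' 'M' 'P' '_'  ->  bytes 5f 4d 50 5f  ->  0x5f504d5f
+ */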
+
+static basetable_entry basetable_entry_types[] =
+{
+ {0, 20, "Processor"},
+ {1, 8, "Bus"},
+ {2, 8, "I/O APIC"},
+ {3, 8, "I/O INT"},
+ {4, 8, "Local INT"}
+};
+
+typedef struct BUSDATA {
+ u_char bus_id;
+ enum busTypes bus_type;
+} bus_datum;
+
+typedef struct INTDATA {
+ u_char int_type;
+ u_short int_flags;
+ u_char src_bus_id;
+ u_char src_bus_irq;
+ u_char dst_apic_id;
+ u_char dst_apic_int;
+ u_char int_vector;
+} io_int, local_int;
+
+typedef struct BUSTYPENAME {
+ u_char type;
+ char name[7];
+} bus_type_name;
+
+static bus_type_name bus_type_table[] =
+{
+ {CBUS, "CBUS"},
+ {CBUSII, "CBUSII"},
+ {EISA, "EISA"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {ISA, "ISA"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {PCI, "PCI"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {UNKNOWN_BUSTYPE, "---"},
+ {XPRESS, "XPRESS"},
+ {UNKNOWN_BUSTYPE, "---"}
+};
+/* from MP spec v1.4, table 5-1 */
+static int default_data[7][5] =
+{
+/* nbus, id0, type0, id1, type1 */
+ {1, 0, ISA, 255, 255},
+ {1, 0, EISA, 255, 255},
+ {1, 0, EISA, 255, 255},
+ {0, 255, 255, 255, 255},/* MCA not supported */
+ {2, 0, ISA, 1, PCI},
+ {2, 0, EISA, 1, PCI},
+ {0, 255, 255, 255, 255} /* MCA not supported */
+};
+
+
+/* the bus data */
+static bus_datum bus_data[NBUS];
+
+/* the IO INT data, one entry per possible APIC INTerrupt */
+static io_int io_apic_ints[NINTR];
+
+static int nintrs;
+
+static int processor_entry __P((proc_entry_ptr entry, int cpu));
+static int bus_entry __P((bus_entry_ptr entry, int bus));
+static int io_apic_entry __P((io_apic_entry_ptr entry, int apic));
+static int int_entry __P((int_entry_ptr entry, int intr));
+static int lookup_bus_type __P((char *name));
+
+
+/*
+ * 1st pass on motherboard's Intel MP specification table.
+ *
+ * initializes:
+ * mp_ncpus = 1
+ *
+ * determines:
+ * cpu_apic_address (common to all CPUs)
+ * io_apic_address[N]
+ * mp_naps
+ * mp_nbusses
+ * mp_napics
+ * nintrs
+ */
+static int
+mptable_pass1(void)
+{
+ int x;
+ mpcth_t cth;
+ int totalSize;
+ void* position;
+ int count;
+ int type;
+ int mustpanic;
+
+ POSTCODE(MPTABLE_PASS1_POST);
+
+ mustpanic = 0;
+
+ /* clear various tables */
+ for (x = 0; x < NAPICID; ++x) {
+ io_apic_address[x] = ~0; /* IO APIC address table */
+ }
+
+ /* init everything to empty */
+ mp_naps = 0;
+ mp_nbusses = 0;
+ mp_napics = 0;
+ nintrs = 0;
+
+ /* check for use of 'default' configuration */
+ if (MPFPS_MPFB1 != 0) {
+ /* use default addresses */
+ cpu_apic_address = DEFAULT_APIC_BASE;
+ io_apic_address[0] = DEFAULT_IO_APIC_BASE;
+
+ /* fill in with defaults */
+ mp_naps = 2; /* includes BSP */
+ mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
+#if defined(APIC_IO)
+ mp_napics = 1;
+ nintrs = 16;
+#endif /* APIC_IO */
+ }
+ else {
+ if ((cth = mpfps->pap) == 0)
+ panic("MP Configuration Table Header MISSING!");
+
+ cpu_apic_address = (vm_offset_t) cth->apic_address;
+
+ /* walk the table, recording info of interest */
+ totalSize = cth->base_table_length - sizeof(struct MPCTH);
+ position = (u_char *) cth + sizeof(struct MPCTH);
+ count = cth->entry_count;
+
+ while (count--) {
+ switch (type = *(u_char *) position) {
+ case 0: /* processor_entry */
+ if (((proc_entry_ptr)position)->cpu_flags
+ & PROCENTRY_FLAG_EN)
+ ++mp_naps;
+ break;
+ case 1: /* bus_entry */
+ ++mp_nbusses;
+ break;
+ case 2: /* io_apic_entry */
+ if (((io_apic_entry_ptr)position)->apic_flags
+ & IOAPICENTRY_FLAG_EN)
+ io_apic_address[mp_napics++] =
+ (vm_offset_t)((io_apic_entry_ptr)
+ position)->apic_address;
+ break;
+ case 3: /* int_entry */
+ ++nintrs;
+ break;
+ case 4: /* int_entry */
+ break;
+ default:
+ panic("mpfps Base Table HOSED!");
+ /* NOTREACHED */
+ }
+
+ totalSize -= basetable_entry_types[type].length;
+ (u_char*)position += basetable_entry_types[type].length;
+ }
+ }
+
+ /* qualify the numbers */
+ if (mp_naps > NCPU)
+#if 0 /* XXX FIXME: kern/4255 */
+ printf("Warning: only using %d of %d available CPUs!\n",
+ NCPU, mp_naps);
+#else
+ {
+		printf("NCPU cannot differ from the actual CPU count.\n");
+ printf(" add 'options NCPU=%d' to your kernel config file,\n",
+ mp_naps);
+ printf(" then rerun config & rebuild your SMP kernel\n");
+ mustpanic = 1;
+ }
+#endif /* XXX FIXME: kern/4255 */
+ if (mp_nbusses > NBUS) {
+ printf("found %d busses, increase NBUS\n", mp_nbusses);
+ mustpanic = 1;
+ }
+ if (mp_napics > NAPIC) {
+ printf("found %d apics, increase NAPIC\n", mp_napics);
+ mustpanic = 1;
+ }
+ if (nintrs > NINTR) {
+ printf("found %d intrs, increase NINTR\n", nintrs);
+ mustpanic = 1;
+ }
+
+ /*
+ * Count the BSP.
+ * This is also used as a counter while starting the APs.
+ */
+ mp_ncpus = 1;
+
+ --mp_naps; /* subtract the BSP */
+
+ return mustpanic;
+}
+
+
+/*
+ * 2nd pass on motherboard's Intel MP specification table.
+ *
+ * sets:
+ * boot_cpu_id
+ * ID_TO_IO(N), phy APIC ID to log CPU/IO table
+ * CPU_TO_ID(N), logical CPU to APIC ID table
+ * IO_TO_ID(N), logical IO to APIC ID table
+ * bus_data[N]
+ * io_apic_ints[N]
+ */
+static int
+mptable_pass2(void)
+{
+ int x;
+ mpcth_t cth;
+ int totalSize;
+ void* position;
+ int count;
+ int type;
+ int apic, bus, cpu, intr;
+
+ POSTCODE(MPTABLE_PASS2_POST);
+
+ /* clear various tables */
+ for (x = 0; x < NAPICID; ++x) {
+ ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */
+ CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */
+ IO_TO_ID(x) = -1; /* logical IO to APIC ID table */
+ }
+
+ /* clear bus data table */
+ for (x = 0; x < NBUS; ++x)
+ bus_data[x].bus_id = 0xff;
+
+ /* clear IO APIC INT table */
+ for (x = 0; x < NINTR; ++x) {
+ io_apic_ints[x].int_type = 0xff;
+ io_apic_ints[x].int_vector = 0xff;
+ }
+
+ /* setup the cpu/apic mapping arrays */
+ boot_cpu_id = -1;
+
+ /* record whether PIC or virtual-wire mode */
+ picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
+
+ /* check for use of 'default' configuration */
+ if (MPFPS_MPFB1 != 0)
+ return MPFPS_MPFB1; /* return default configuration type */
+
+ if ((cth = mpfps->pap) == 0)
+ panic("MP Configuration Table Header MISSING!");
+
+ /* walk the table, recording info of interest */
+ totalSize = cth->base_table_length - sizeof(struct MPCTH);
+ position = (u_char *) cth + sizeof(struct MPCTH);
+ count = cth->entry_count;
+ apic = bus = intr = 0;
+ cpu = 1; /* pre-count the BSP */
+
+ while (count--) {
+ switch (type = *(u_char *) position) {
+ case 0:
+ if (processor_entry(position, cpu))
+ ++cpu;
+ break;
+ case 1:
+ if (bus_entry(position, bus))
+ ++bus;
+ break;
+ case 2:
+ if (io_apic_entry(position, apic))
+ ++apic;
+ break;
+ case 3:
+ if (int_entry(position, intr))
+ ++intr;
+ break;
+ case 4:
+ /* int_entry(position); */
+ break;
+ default:
+ panic("mpfps Base Table HOSED!");
+ /* NOTREACHED */
+ }
+
+ totalSize -= basetable_entry_types[type].length;
+ (u_char *) position += basetable_entry_types[type].length;
+ }
+
+ if (boot_cpu_id == -1)
+ panic("NO BSP found!");
+
+	/* report the fact that it's NOT a default configuration */
+ return 0;
+}
+
+
+static void
+assign_apic_irq(int apic, int intpin, int irq)
+{
+ int x;
+
+ if (int_to_apicintpin[irq].ioapic != -1)
+ panic("assign_apic_irq: inconsistent table");
+
+ int_to_apicintpin[irq].ioapic = apic;
+ int_to_apicintpin[irq].int_pin = intpin;
+ int_to_apicintpin[irq].apic_address = ioapic[apic];
+ int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin;
+
+ for (x = 0; x < nintrs; x++) {
+ if ((io_apic_ints[x].int_type == 0 ||
+ io_apic_ints[x].int_type == 3) &&
+ io_apic_ints[x].int_vector == 0xff &&
+ io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) &&
+ io_apic_ints[x].dst_apic_int == intpin)
+ io_apic_ints[x].int_vector = irq;
+ }
+}
+
+/*
+ * fix up the parsed Intel MP specification table
+ */
+static void
+fix_mp_table(void)
+{
+ int x;
+ int id;
+ int bus_0 = 0; /* Stop GCC warning */
+ int bus_pci = 0; /* Stop GCC warning */
+ int num_pci_bus;
+
+ /*
+ * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
+ * did it wrong. The MP spec says that when more than 1 PCI bus
+ * exists the BIOS must begin with bus entries for the PCI bus and use
+ * actual PCI bus numbering. This implies that when only 1 PCI bus
+ * exists the BIOS can choose to ignore this ordering, and indeed many
+ * MP motherboards do ignore it. This causes a problem when the PCI
+ * sub-system makes requests of the MP sub-system based on PCI bus
+ * numbers. So here we look for the situation and renumber the
+ * busses and associated INTs in an effort to "make it right".
+ */
+
+ /* find bus 0, PCI bus, count the number of PCI busses */
+ for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
+ if (bus_data[x].bus_id == 0) {
+ bus_0 = x;
+ }
+ if (bus_data[x].bus_type == PCI) {
+ ++num_pci_bus;
+ bus_pci = x;
+ }
+ }
+ /*
+ * bus_0 == slot of bus with ID of 0
+ * bus_pci == slot of last PCI bus encountered
+ */
+
+ /* check the 1 PCI bus case for sanity */
+ if (num_pci_bus == 1) {
+
+ /* if it is number 0 all is well */
+ if (bus_data[bus_pci].bus_id == 0)
+ return;
+
+ /* mis-numbered, swap with whichever bus uses slot 0 */
+
+ /* swap the bus entry types */
+ bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
+ bus_data[bus_0].bus_type = PCI;
+
+		/* swap each relevant INTerrupt entry */
+ id = bus_data[bus_pci].bus_id;
+ for (x = 0; x < nintrs; ++x) {
+ if (io_apic_ints[x].src_bus_id == id) {
+ io_apic_ints[x].src_bus_id = 0;
+ }
+ else if (io_apic_ints[x].src_bus_id == 0) {
+ io_apic_ints[x].src_bus_id = id;
+ }
+ }
+ }
+ /* sanity check if more than 1 PCI bus */
+ else if (num_pci_bus > 1) {
+ for (x = 0; x < mp_nbusses; ++x) {
+ if (bus_data[x].bus_type != PCI)
+ continue;
+ if (bus_data[x].bus_id >= num_pci_bus)
+ panic("bad PCI bus numbering");
+ }
+ }
+}
+
+
+static void
+setup_apic_irq_mapping(void)
+{
+ int x;
+ int int_vector;
+
+ /* Assign low level interrupt handlers */
+ for (x = 0; x < APIC_INTMAPSIZE; x++) {
+ int_to_apicintpin[x].ioapic = -1;
+ int_to_apicintpin[x].int_pin = 0;
+ int_to_apicintpin[x].apic_address = NULL;
+ int_to_apicintpin[x].redirindex = 0;
+ }
+ for (x = 0; x < nintrs; x++) {
+		if (io_apic_ints[x].dst_apic_int < APIC_INTMAPSIZE &&
+ io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
+ io_apic_ints[x].int_vector == 0xff &&
+ (io_apic_ints[x].int_type == 0 ||
+ io_apic_ints[x].int_type == 3)) {
+ assign_apic_irq(0,
+ io_apic_ints[x].dst_apic_int,
+ io_apic_ints[x].dst_apic_int);
+ }
+ }
+ int_vector = 0;
+ while (int_vector < APIC_INTMAPSIZE &&
+ int_to_apicintpin[int_vector].ioapic != -1)
+ int_vector++;
+ for (x = 0; x < nintrs && int_vector < APIC_INTMAPSIZE; x++) {
+ if ((io_apic_ints[x].int_type == 0 ||
+ io_apic_ints[x].int_type == 3) &&
+ io_apic_ints[x].int_vector == 0xff) {
+ assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
+ io_apic_ints[x].dst_apic_int,
+ int_vector);
+ int_vector++;
+ while (int_vector < APIC_INTMAPSIZE &&
+ int_to_apicintpin[int_vector].ioapic != -1)
+ int_vector++;
+ }
+ }
+}
+
+
+static int
+processor_entry(proc_entry_ptr entry, int cpu)
+{
+ /* check for usability */
+ if ((cpu >= NCPU) || !(entry->cpu_flags & PROCENTRY_FLAG_EN))
+ return 0;
+
+ /* check for BSP flag */
+ if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
+ boot_cpu_id = entry->apic_id;
+ CPU_TO_ID(0) = entry->apic_id;
+ ID_TO_CPU(entry->apic_id) = 0;
+ return 0; /* its already been counted */
+ }
+
+ /* add another AP to list, if less than max number of CPUs */
+ else {
+ CPU_TO_ID(cpu) = entry->apic_id;
+ ID_TO_CPU(entry->apic_id) = cpu;
+ return 1;
+ }
+}
+
+
+static int
+bus_entry(bus_entry_ptr entry, int bus)
+{
+ int x;
+ char c, name[8];
+
+ /* encode the name into an index */
+ for (x = 0; x < 6; ++x) {
+ if ((c = entry->bus_type[x]) == ' ')
+ break;
+ name[x] = c;
+ }
+ name[x] = '\0';
+
+ if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
+ panic("unknown bus type: '%s'", name);
+
+ bus_data[bus].bus_id = entry->bus_id;
+ bus_data[bus].bus_type = x;
+
+ return 1;
+}
+
+
+static int
+io_apic_entry(io_apic_entry_ptr entry, int apic)
+{
+ if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
+ return 0;
+
+ IO_TO_ID(apic) = entry->apic_id;
+ ID_TO_IO(entry->apic_id) = apic;
+
+ return 1;
+}
+
+
+static int
+lookup_bus_type(char *name)
+{
+ int x;
+
+ for (x = 0; x < MAX_BUSTYPE; ++x)
+ if (strcmp(bus_type_table[x].name, name) == 0)
+ return bus_type_table[x].type;
+
+ return UNKNOWN_BUSTYPE;
+}
+
+
+static int
+int_entry(int_entry_ptr entry, int intr)
+{
+ int apic;
+
+ io_apic_ints[intr].int_type = entry->int_type;
+ io_apic_ints[intr].int_flags = entry->int_flags;
+ io_apic_ints[intr].src_bus_id = entry->src_bus_id;
+ io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
+ if (entry->dst_apic_id == 255) {
+		/* This signal goes to all IO APICS.  Select an IO APIC
+		   with a sufficient number of interrupt pins */
+ for (apic = 0; apic < mp_napics; apic++)
+ if (((io_apic_read(apic, IOAPIC_VER) &
+ IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
+ entry->dst_apic_int)
+ break;
+ if (apic < mp_napics)
+ io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
+ else
+ io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
+ } else
+ io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
+ io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
+
+ return 1;
+}
+
+
+static int
+apic_int_is_bus_type(int intr, int bus_type)
+{
+ int bus;
+
+ for (bus = 0; bus < mp_nbusses; ++bus)
+ if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
+ && ((int) bus_data[bus].bus_type == bus_type))
+ return 1;
+
+ return 0;
+}
+
+
+/*
+ * Given a traditional ISA INT mask, return an APIC mask.
+ */
+u_int
+isa_apic_mask(u_int isa_mask)
+{
+ int isa_irq;
+ int apic_pin;
+
+#if defined(SKIP_IRQ15_REDIRECT)
+ if (isa_mask == (1 << 15)) {
+ printf("skipping ISA IRQ15 redirect\n");
+ return isa_mask;
+ }
+#endif /* SKIP_IRQ15_REDIRECT */
+
+ isa_irq = ffs(isa_mask); /* find its bit position */
+ if (isa_irq == 0) /* doesn't exist */
+ return 0;
+ --isa_irq; /* make it zero based */
+
+ apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */
+ if (apic_pin == -1)
+ return 0;
+
+ return (1 << apic_pin); /* convert pin# to a mask */
+}
+
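+/*
+ * Illustrative sketch: a single-bit ISA mask goes in, the matching APIC
+ * IRQ mask comes out.  E.g. if the MP table maps ISA IRQ1 to APIC IRQ 1:
+ *
+ *	isa_apic_mask(1 << 1):	ffs() -> 2, so isa_irq = 1;
+ *				isa_apic_irq(1) -> 1;
+ *				result: 1 << 1
+ */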
+
+/*
+ * Determine which APIC pin an ISA/EISA INT is attached to.
+ */
+#define INTTYPE(I) (io_apic_ints[(I)].int_type)
+#define INTPIN(I) (io_apic_ints[(I)].dst_apic_int)
+#define INTIRQ(I) (io_apic_ints[(I)].int_vector)
+#define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id))
+
+#define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq)
+int
+isa_apic_irq(int isa_irq)
+{
+ int intr;
+
+ for (intr = 0; intr < nintrs; ++intr) { /* check each record */
+ if (INTTYPE(intr) == 0) { /* standard INT */
+ if (SRCBUSIRQ(intr) == isa_irq) {
+ if (apic_int_is_bus_type(intr, ISA) ||
+ apic_int_is_bus_type(intr, EISA))
+ return INTIRQ(intr); /* found */
+ }
+ }
+ }
+ return -1; /* NOT found */
+}
+
+
+/*
+ * Determine which APIC pin a PCI INT is attached to.
+ */
+#define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id)
+#define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
+#define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03)
+int
+pci_apic_irq(int pciBus, int pciDevice, int pciInt)
+{
+ int intr;
+
+ --pciInt; /* zero based */
+
+ for (intr = 0; intr < nintrs; ++intr) /* check each record */
+ if ((INTTYPE(intr) == 0) /* standard INT */
+ && (SRCBUSID(intr) == pciBus)
+ && (SRCBUSDEVICE(intr) == pciDevice)
+ && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */
+ if (apic_int_is_bus_type(intr, PCI))
+ return INTIRQ(intr); /* exact match */
+
+ return -1; /* NOT found */
+}
+
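+/*
+ * Illustrative note on the PCI encoding decoded by SRCBUSDEVICE and
+ * SRCBUSLINE above: the MP table packs a PCI interrupt source into
+ * src_bus_irq as (device << 2) | INT#, with INT# 0..3 meaning
+ * INTA#..INTD#.  For device 9, INTB#:
+ *
+ *	src_bus_irq  = (9 << 2) | 1 = 0x25
+ *	SRCBUSDEVICE = (0x25 >> 2) & 0x1f = 9
+ *	SRCBUSLINE   = 0x25 & 0x03 = 1 (INTB#)
+ *
+ * which is why pci_apic_irq() decrements its 1-based pciInt argument.
+ */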
+int
+next_apic_irq(int irq)
+{
+ int intr, ointr;
+ int bus, bustype;
+
+ bus = 0;
+ bustype = 0;
+ for (intr = 0; intr < nintrs; intr++) {
+ if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
+ continue;
+ bus = SRCBUSID(intr);
+ bustype = apic_bus_type(bus);
+ if (bustype != ISA &&
+ bustype != EISA &&
+ bustype != PCI)
+ continue;
+ break;
+ }
+ if (intr >= nintrs) {
+ return -1;
+ }
+ for (ointr = intr + 1; ointr < nintrs; ointr++) {
+ if (INTTYPE(ointr) != 0)
+ continue;
+ if (bus != SRCBUSID(ointr))
+ continue;
+ if (bustype == PCI) {
+ if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
+ continue;
+ if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
+ continue;
+ }
+ if (bustype == ISA || bustype == EISA) {
+ if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
+ continue;
+ }
+ if (INTPIN(intr) == INTPIN(ointr))
+ continue;
+ break;
+ }
+ if (ointr >= nintrs) {
+ return -1;
+ }
+ return INTIRQ(ointr);
+}
+#undef SRCBUSLINE
+#undef SRCBUSDEVICE
+#undef SRCBUSID
+#undef SRCBUSIRQ
+
+#undef INTPIN
+#undef INTIRQ
+#undef INTAPIC
+#undef INTTYPE
+
+
+/*
+ * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
+ *
+ * XXX FIXME:
+ * Exactly what this means is unclear at this point. It is a solution
+ * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard
+ * could route any of the ISA INTs to upper (>15) IRQ values. But most would
+ * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
+ * option.
+ */
+int
+undirect_isa_irq(int rirq)
+{
+#if defined(READY)
+ if (bootverbose)
+ printf("Freeing redirected ISA irq %d.\n", rirq);
+ /** FIXME: tickle the MB redirector chip */
+ return ???;
+#else
+ if (bootverbose)
+ printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
+ return 0;
+#endif /* READY */
+}
+
+
+/*
+ * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
+ */
+int
+undirect_pci_irq(int rirq)
+{
+#if defined(READY)
+ if (bootverbose)
+ printf("Freeing redirected PCI irq %d.\n", rirq);
+
+ /** FIXME: tickle the MB redirector chip */
+ return ???;
+#else
+ if (bootverbose)
+ printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
+ rirq);
+ return 0;
+#endif /* READY */
+}
+
+
+/*
+ * given a bus ID, return:
+ * the bus type if found
+ * -1 if NOT found
+ */
+int
+apic_bus_type(int id)
+{
+ int x;
+
+ for (x = 0; x < mp_nbusses; ++x)
+ if (bus_data[x].bus_id == id)
+ return bus_data[x].bus_type;
+
+ return -1;
+}
+
+
+/*
+ * given a LOGICAL APIC# and pin#, return:
+ * the associated src bus ID if found
+ * -1 if NOT found
+ */
+int
+apic_src_bus_id(int apic, int pin)
+{
+ int x;
+
+ /* search each of the possible INTerrupt sources */
+ for (x = 0; x < nintrs; ++x)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int))
+ return (io_apic_ints[x].src_bus_id);
+
+ return -1; /* NOT found */
+}
+
+
+/*
+ * given a LOGICAL APIC# and pin#, return:
+ * the associated src bus IRQ if found
+ * -1 if NOT found
+ */
+int
+apic_src_bus_irq(int apic, int pin)
+{
+ int x;
+
+ for (x = 0; x < nintrs; x++)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int))
+ return (io_apic_ints[x].src_bus_irq);
+
+ return -1; /* NOT found */
+}
+
+
+/*
+ * given a LOGICAL APIC# and pin#, return:
+ * the associated INTerrupt type if found
+ * -1 if NOT found
+ */
+int
+apic_int_type(int apic, int pin)
+{
+ int x;
+
+ /* search each of the possible INTerrupt sources */
+ for (x = 0; x < nintrs; ++x)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int))
+ return (io_apic_ints[x].int_type);
+
+ return -1; /* NOT found */
+}
+
+int
+apic_irq(int apic, int pin)
+{
+ int x;
+ int res;
+
+ for (x = 0; x < nintrs; ++x)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int)) {
+ res = io_apic_ints[x].int_vector;
+ if (res == 0xff)
+ return -1;
+ if (apic != int_to_apicintpin[res].ioapic)
+ panic("apic_irq: inconsistent table");
+ if (pin != int_to_apicintpin[res].int_pin)
+ panic("apic_irq inconsistent table (2)");
+ return res;
+ }
+ return -1;
+}
+
+
+/*
+ * given a LOGICAL APIC# and pin#, return:
+ * the associated trigger mode if found
+ * -1 if NOT found
+ */
+int
+apic_trigger(int apic, int pin)
+{
+ int x;
+
+ /* search each of the possible INTerrupt sources */
+ for (x = 0; x < nintrs; ++x)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int))
+ return ((io_apic_ints[x].int_flags >> 2) & 0x03);
+
+ return -1; /* NOT found */
+}
+
+
+/*
+ * given a LOGICAL APIC# and pin#, return:
+ * the associated 'active' level if found
+ * -1 if NOT found
+ */
+int
+apic_polarity(int apic, int pin)
+{
+ int x;
+
+ /* search each of the possible INTerrupt sources */
+ for (x = 0; x < nintrs; ++x)
+ if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
+ (pin == io_apic_ints[x].dst_apic_int))
+ return (io_apic_ints[x].int_flags & 0x03);
+
+ return -1; /* NOT found */
+}
+
+
+/*
+ * set data according to MP defaults
+ * FIXME: probably not complete yet...
+ */
+static void
+default_mp_table(int type)
+{
+ int ap_cpu_id;
+#if defined(APIC_IO)
+ u_int32_t ux;
+ int io_apic_id;
+ int pin;
+#endif /* APIC_IO */
+
+#if 0
+ printf(" MP default config type: %d\n", type);
+ switch (type) {
+ case 1:
+ printf(" bus: ISA, APIC: 82489DX\n");
+ break;
+ case 2:
+ printf(" bus: EISA, APIC: 82489DX\n");
+ break;
+ case 3:
+ printf(" bus: EISA, APIC: 82489DX\n");
+ break;
+ case 4:
+ printf(" bus: MCA, APIC: 82489DX\n");
+ break;
+ case 5:
+ printf(" bus: ISA+PCI, APIC: Integrated\n");
+ break;
+ case 6:
+ printf(" bus: EISA+PCI, APIC: Integrated\n");
+ break;
+ case 7:
+ printf(" bus: MCA+PCI, APIC: Integrated\n");
+ break;
+ default:
+ printf(" future type\n");
+ break;
+ /* NOTREACHED */
+ }
+#endif /* 0 */
+
+ boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
+ ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;
+
+ /* BSP */
+ CPU_TO_ID(0) = boot_cpu_id;
+ ID_TO_CPU(boot_cpu_id) = 0;
+
+ /* one and only AP */
+ CPU_TO_ID(1) = ap_cpu_id;
+ ID_TO_CPU(ap_cpu_id) = 1;
+
+#if defined(APIC_IO)
+ /* one and only IO APIC */
+ io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;
+
+ /*
+ * sanity check, refer to MP spec section 3.6.6, last paragraph
+ * necessary as some hardware isn't properly setting up the IO APIC
+ */
+#if defined(REALLY_ANAL_IOAPICID_VALUE)
+ if (io_apic_id != 2) {
+#else
+ if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
+#endif /* REALLY_ANAL_IOAPICID_VALUE */
+ ux = io_apic_read(0, IOAPIC_ID); /* get current contents */
+ ux &= ~APIC_ID_MASK; /* clear the ID field */
+ ux |= 0x02000000; /* set it to '2' */
+ io_apic_write(0, IOAPIC_ID, ux); /* write new value */
+ ux = io_apic_read(0, IOAPIC_ID); /* re-read && test */
+ if ((ux & APIC_ID_MASK) != 0x02000000)
+ panic("can't control IO APIC ID, reg: 0x%08x", ux);
+ io_apic_id = 2;
+ }
+ IO_TO_ID(0) = io_apic_id;
+ ID_TO_IO(io_apic_id) = 0;
+#endif /* APIC_IO */
+
+ /* fill out bus entries */
+ switch (type) {
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ case 6:
+ bus_data[0].bus_id = default_data[type - 1][1];
+ bus_data[0].bus_type = default_data[type - 1][2];
+ bus_data[1].bus_id = default_data[type - 1][3];
+ bus_data[1].bus_type = default_data[type - 1][4];
+ break;
+
+ /* case 4: case 7: MCA NOT supported */
+ default: /* illegal/reserved */
+ panic("BAD default MP config: %d", type);
+ /* NOTREACHED */
+ }
+
+#if defined(APIC_IO)
+ /* general cases from MP v1.4, table 5-2 */
+ for (pin = 0; pin < 16; ++pin) {
+ io_apic_ints[pin].int_type = 0;
+ io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */
+ io_apic_ints[pin].src_bus_id = 0;
+ io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */
+ io_apic_ints[pin].dst_apic_id = io_apic_id;
+ io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */
+ }
+
+ /* special cases from MP v1.4, table 5-2 */
+ if (type == 2) {
+ io_apic_ints[2].int_type = 0xff; /* N/C */
+ io_apic_ints[13].int_type = 0xff; /* N/C */
+#if !defined(APIC_MIXED_MODE)
+ /** FIXME: ??? */
+ panic("sorry, can't support type 2 default yet");
+#endif /* APIC_MIXED_MODE */
+ }
+ else
+ io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */
+
+ if (type == 7)
+ io_apic_ints[0].int_type = 0xff; /* N/C */
+ else
+ io_apic_ints[0].int_type = 3; /* vectored 8259 */
+#endif /* APIC_IO */
+}
+
+
+/*
+ * initialize all the SMP locks
+ */
+
+/* critical region around IO APIC, apic_imen */
+struct simplelock imen_lock;
+
+/* critical region around splxx(), cpl, cml, cil, ipending */
+struct simplelock cpl_lock;
+
+/* Make FAST_INTR() routines sequential */
+struct simplelock fast_intr_lock;
+
+/* critical region around INTR() routines */
+struct simplelock intr_lock;
+
+/* lock regions protected in UP kernel via cli/sti */
+struct simplelock mpintr_lock;
+
+/* lock region used by kernel profiling */
+struct simplelock mcount_lock;
+
+#ifdef USE_COMLOCK
+/* locks com (tty) data/hardware accesses: a FASTINTR() */
+struct simplelock com_lock;
+#endif /* USE_COMLOCK */
+
+#ifdef USE_CLOCKLOCK
+/* lock regions around the clock hardware */
+struct simplelock clock_lock;
+#endif /* USE_CLOCKLOCK */
+
+static void
+init_locks(void)
+{
+ /*
+ * Get the initial mp_lock with a count of 1 for the BSP.
+ * This uses a LOGICAL cpu ID, ie BSP == 0.
+ */
+ mp_lock = 0x00000001;
+
+ /* ISR uses its own "giant lock" */
+ isr_lock = FREE_LOCK;
+
+#if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ)
+ s_lock_init((struct simplelock*)&apic_itrace_debuglock);
+#endif
+
+ s_lock_init((struct simplelock*)&mpintr_lock);
+
+ s_lock_init((struct simplelock*)&mcount_lock);
+
+ s_lock_init((struct simplelock*)&fast_intr_lock);
+ s_lock_init((struct simplelock*)&intr_lock);
+ s_lock_init((struct simplelock*)&imen_lock);
+ s_lock_init((struct simplelock*)&cpl_lock);
+
+#ifdef USE_COMLOCK
+ s_lock_init((struct simplelock*)&com_lock);
+#endif /* USE_COMLOCK */
+#ifdef USE_CLOCKLOCK
+ s_lock_init((struct simplelock*)&clock_lock);
+#endif /* USE_CLOCKLOCK */
+}
+
+
+/*
+ * start each AP in our list
+ */
+static int
+start_all_aps(u_int boot_addr)
+{
+ int x, i;
+ u_char mpbiosreason;
+ u_long mpbioswarmvec;
+ pd_entry_t *newptd;
+ pt_entry_t *newpt;
+ struct globaldata *gd;
+ char *stack;
+ pd_entry_t *myPTD;
+
+ POSTCODE(START_ALL_APS_POST);
+
+ /* initialize BSP's local APIC */
+ apic_initialize();
+ bsp_apic_ready = 1;
+
+ /* install the AP 1st level boot code */
+ install_ap_tramp(boot_addr);
+
+
+ /* save the current value of the warm-start vector */
+ mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
+#ifndef PC98
+ outb(CMOS_REG, BIOS_RESET);
+ mpbiosreason = inb(CMOS_DATA);
+#endif
+
+ /* record BSP in CPU map */
+ all_cpus = 1;
+
+ /* start each AP */
+ for (x = 1; x <= mp_naps; ++x) {
+
+		/* This is a bit verbose; it will go away soon. */
+
+ /* alloc new page table directory */
+ newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
+
+ /* Store the virtual PTD address for this CPU */
+ IdlePTDS[x] = newptd;
+
+ /* clone currently active one (ie: IdlePTD) */
+ bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */
+
+ /* set up 0 -> 4MB P==V mapping for AP boot */
+ newptd[0] = (void *)(uintptr_t)(PG_V | PG_RW |
+ ((uintptr_t)(void *)KPTphys & PG_FRAME));
+
+ /* store PTD for this AP's boot sequence */
+ myPTD = (pd_entry_t *)vtophys(newptd);
+
+ /* alloc new page table page */
+ newpt = (pt_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
+
+ /* set the new PTD's private page to point there */
+ newptd[MPPTDI] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt));
+
+ /* install self referential entry */
+ newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd));
+
+ /* allocate a new private data page */
+ gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);
+
+ /* wire it into the private page table page */
+ newpt[0] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));
+
+ /* wire the ptp into itself for access */
+ newpt[1] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt));
+
+ /* copy in the pointer to the local apic */
+ newpt[2] = SMP_prvpt[2];
+
+ /* and the IO apic mapping[s] */
+ for (i = 16; i < 32; i++)
+ newpt[i] = SMP_prvpt[i];
+
+ /* allocate and set up an idle stack data page */
+ stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
+ for (i = 0; i < UPAGES; i++)
+ newpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
+
+ newpt[3 + UPAGES] = 0; /* *prv_CMAP1 */
+ newpt[4 + UPAGES] = 0; /* *prv_CMAP2 */
+ newpt[5 + UPAGES] = 0; /* *prv_CMAP3 */
+ newpt[6 + UPAGES] = 0; /* *prv_PMAP1 */
+
+ /* prime data page for it to use */
+ gd->cpuid = x;
+ gd->cpu_lockid = x << 24;
+ gd->my_idlePTD = myPTD;
+ gd->prv_CMAP1 = &newpt[3 + UPAGES];
+ gd->prv_CMAP2 = &newpt[4 + UPAGES];
+ gd->prv_CMAP3 = &newpt[5 + UPAGES];
+ gd->prv_PMAP1 = &newpt[6 + UPAGES];
+
+ /* setup a vector to our boot code */
+ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
+ *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
+#ifndef PC98
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
+#endif
+
+ bootPTD = myPTD;
+ /* attempt to start the Application Processor */
+ CHECK_INIT(99); /* setup checkpoints */
+ if (!start_ap(x, boot_addr)) {
+ printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
+ CHECK_PRINT("trace"); /* show checkpoints */
+ /* better panic as the AP may be running loose */
+ printf("panic y/n? [y] ");
+ if (cngetc() != 'n')
+ panic("bye-bye");
+ }
+ CHECK_PRINT("trace"); /* show checkpoints */
+
+ /* record its version info */
+ cpu_apic_versions[x] = cpu_apic_versions[0];
+
+ all_cpus |= (1 << x); /* record AP in CPU map */
+ }
+
+ /* build our map of 'other' CPUs */
+ other_cpus = all_cpus & ~(1 << cpuid);
+
+ /* fill in our (BSP) APIC version */
+ cpu_apic_versions[0] = lapic.version;
+
+ /* restore the warmstart vector */
+ *(u_long *) WARMBOOT_OFF = mpbioswarmvec;
+#ifndef PC98
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, mpbiosreason);
+#endif
+
+ /*
+ * Set up the idle context for the BSP. Similar to above except
+ * that some was done by locore, some by pmap.c and some is implicit
+ * because the BSP is cpu#0 and the page is initially zero, and also
+	 * because we can refer to variables by name on the BSP.
+ */
+ newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
+
+ bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */
+ IdlePTDS[0] = newptd;
+
+ /* Point PTD[] to this page instead of IdlePTD's physical page */
+ newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd));
+
+ my_idlePTD = (pd_entry_t *)vtophys(newptd);
+
+ /* Allocate and setup BSP idle stack */
+ stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
+ for (i = 0; i < UPAGES; i++)
+ SMP_prvpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
+
+ pmap_set_opt_bsp();
+
+ for (i = 0; i < mp_ncpus; i++) {
+ bcopy( (int *) PTD + KPTDI, (int *) IdlePTDS[i] + KPTDI, NKPDE * sizeof (int));
+ }
+
+ /* number of APs actually started */
+ return mp_ncpus - 1;
+}
+
+
+/*
+ * load the 1st level AP boot code into base memory.
+ */
+
+/* targets for relocation */
+extern void bigJump(void);
+extern void bootCodeSeg(void);
+extern void bootDataSeg(void);
+extern void MPentry(void);
+extern u_int MP_GDT;
+extern u_int mp_gdtbase;
+
+static void
+install_ap_tramp(u_int boot_addr)
+{
+ int x;
+ int size = *(int *) ((u_long) & bootMP_size);
+ u_char *src = (u_char *) ((u_long) bootMP);
+ u_char *dst = (u_char *) boot_addr + KERNBASE;
+ u_int boot_base = (u_int) bootMP;
+ u_int8_t *dst8;
+ u_int16_t *dst16;
+ u_int32_t *dst32;
+
+ POSTCODE(INSTALL_AP_TRAMP_POST);
+
+ for (x = 0; x < size; ++x)
+ *dst++ = *src++;
+
+ /*
+ * modify addresses in code we just moved to basemem. unfortunately we
+ * need fairly detailed info about mpboot.s for this to work. changes
+ * to mpboot.s might require changes here.
+ */
+
+ /* boot code is located in KERNEL space */
+ dst = (u_char *) boot_addr + KERNBASE;
+
+ /* modify the lgdt arg */
+ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
+ *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);
+
+ /* modify the ljmp target for MPentry() */
+ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
+ *dst32 = ((u_int) MPentry - KERNBASE);
+
+ /* modify the target for boot code segment */
+ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
+ dst8 = (u_int8_t *) (dst16 + 1);
+ *dst16 = (u_int) boot_addr & 0xffff;
+ *dst8 = ((u_int) boot_addr >> 16) & 0xff;
+
+ /* modify the target for boot data segment */
+ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
+ dst8 = (u_int8_t *) (dst16 + 1);
+ *dst16 = (u_int) boot_addr & 0xffff;
+ *dst8 = ((u_int) boot_addr >> 16) & 0xff;
+}
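
install_ap_tramp() pokes a 16-bit segment value plus an 8-bit high byte
directly into the copied boot image.  A self-contained sketch of that
patching idiom (an illustrative helper, not the kernel's code):

	#include <sys/types.h>
	#include <string.h>

	/*
	 * Patch a segment-style immediate into a copied 16-bit code
	 * image.  'image' is the copy in our address space, 'off' the
	 * offset of the 16-bit immediate within it.
	 */
	static void
	patch_seg(u_int8_t *image, size_t off, u_int32_t boot_addr)
	{
		u_int16_t lo = boot_addr & 0xffff;
		u_int8_t hi = (boot_addr >> 16) & 0xff;

		memcpy(image + off, &lo, sizeof(lo));	/* 16-bit immediate */
		image[off + sizeof(lo)] = hi;		/* 8-bit high byte */
	}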
+
+
+/*
+ * this function starts the AP (application processor) identified
+ * by the APIC ID 'physicalCpu'. It does quite a "song and dance"
+ * to accomplish this. This is necessary because of the nuances
+ * of the different hardware we might encounter. It ain't pretty,
+ * but it seems to work.
+ */
+static int
+start_ap(int logical_cpu, u_int boot_addr)
+{
+ int physical_cpu;
+ int vector;
+ int cpus;
+ u_long icr_lo, icr_hi;
+
+ POSTCODE(START_AP_POST);
+
+ /* get the PHYSICAL APIC ID# */
+ physical_cpu = CPU_TO_ID(logical_cpu);
+
+ /* calculate the vector */
+ vector = (boot_addr >> 12) & 0xff;
+
+ /* used as a watchpoint to signal AP startup */
+ cpus = mp_ncpus;
+
+	/*
+	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
+	 * resetting and running the target CPU.  OR this INIT IPI might
+	 * be latched (P5 bug), leaving the CPU waiting for a STARTUP
+	 * IPI.  OR this INIT IPI might be ignored.
+	 */
+
+ /* setup the address for the target AP */
+ icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
+ icr_hi |= (physical_cpu << 24);
+ lapic.icr_hi = icr_hi;
+
+ /* do an INIT IPI: assert RESET */
+ icr_lo = lapic.icr_lo & 0xfff00000;
+ lapic.icr_lo = icr_lo | 0x0000c500;
+
+ /* wait for pending status end */
+ while (lapic.icr_lo & APIC_DELSTAT_MASK)
+ /* spin */ ;
+
+ /* do an INIT IPI: deassert RESET */
+ lapic.icr_lo = icr_lo | 0x00008500;
+
+ /* wait for pending status end */
+ u_sleep(10000); /* wait ~10mS */
+ while (lapic.icr_lo & APIC_DELSTAT_MASK)
+ /* spin */ ;
+
+	/*
+	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
+	 * latched (P5 bug), in which case this 1st STARTUP IPI terminates
+	 * immediately and the previously started INIT IPI continues.  OR
+	 * the previous INIT IPI has already run, and this STARTUP IPI
+	 * will run.  OR the previous INIT IPI was ignored, and this
+	 * STARTUP IPI will run.
+	 */
+
+ /* do a STARTUP IPI */
+ lapic.icr_lo = icr_lo | 0x00000600 | vector;
+ while (lapic.icr_lo & APIC_DELSTAT_MASK)
+ /* spin */ ;
+ u_sleep(200); /* wait ~200uS */
+
+	/*
+	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should
+	 * run IF the previous STARTUP IPI was cancelled by a latched
+	 * INIT IPI.  Otherwise this STARTUP IPI will be ignored, as only
+	 * ONE STARTUP IPI is recognized after a hardware RESET or an
+	 * INIT IPI.
+	 */
+
+ lapic.icr_lo = icr_lo | 0x00000600 | vector;
+ while (lapic.icr_lo & APIC_DELSTAT_MASK)
+ /* spin */ ;
+ u_sleep(200); /* wait ~200uS */
+
+ /* wait for it to start */
+ set_apic_timer(5000000);/* == 5 seconds */
+ while (read_apic_timer())
+ if (mp_ncpus > cpus)
+ return 1; /* return SUCCESS */
+
+ return 0; /* return FAILURE */
+}
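
For reference, the magic ICR values used above decode as follows; the
symbolic names here are illustrative (FreeBSD's <machine/apic.h> spells
them a little differently):

	#define DELMODE_INIT	0x00000500	/* delivery mode 101: INIT */
	#define DELMODE_STARTUP	0x00000600	/* delivery mode 110: STARTUP */
	#define LEVEL_ASSERT	0x00004000	/* level: assert */
	#define TRIGMOD_LEVEL	0x00008000	/* trigger mode: level */

	/*
	 * 0x0000c500 = TRIGMOD_LEVEL | LEVEL_ASSERT | DELMODE_INIT
	 * 0x00008500 = TRIGMOD_LEVEL | DELMODE_INIT (RESET deasserted)
	 * 0x00000600 | vector = DELMODE_STARTUP plus the 8-bit vector,
	 * which names the 4K page the AP begins executing at, hence
	 * vector = (boot_addr >> 12) & 0xff above.
	 */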
+
+
+/*
+ * Flush the TLB on all other CPU's
+ *
+ * XXX: Needs to handshake and wait for completion before proceeding.
+ */
+void
+smp_invltlb(void)
+{
+#if defined(APIC_IO)
+ if (smp_started && invltlb_ok)
+ all_but_self_ipi(XINVLTLB_OFFSET);
+#endif /* APIC_IO */
+}
+
+void
+invlpg(u_int addr)
+{
+ __asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+
+ /* send a message to the other CPUs */
+ smp_invltlb();
+}
+
+void
+invltlb(void)
+{
+ u_long temp;
+
+ /*
+ * This should be implemented as load_cr3(rcr3()) when load_cr3() is
+ * inlined.
+ */
+ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+
+ /* send a message to the other CPUs */
+ smp_invltlb();
+}
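
A sketch of the inlined form the comment above asks for, assuming the
usual <machine/cpufunc.h>-style accessors (treat the exact definitions
as illustrative):

	static __inline u_long
	rcr3(void)
	{
		u_long data;

		__asm __volatile("movl %%cr3,%0" : "=r" (data));
		return (data);
	}

	static __inline void
	load_cr3(u_long data)
	{
		__asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory");
	}

	/* invltlb()'s flush then collapses to: */
	load_cr3(rcr3());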
+
+
+/*
+ * When called the executing CPU will send an IPI to all other CPUs
+ * requesting that they halt execution.
+ *
+ * Usually (but not necessarily) called with 'other_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to stop.
+ * - Waits for each to stop.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ *
+ * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs
+ * from executing at same time.
+ */
+int
+stop_cpus(u_int map)
+{
+ if (!smp_started)
+ return 0;
+
+ /* send the Xcpustop IPI to all CPUs in map */
+ selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED);
+
+ while ((stopped_cpus & map) != map)
+ /* spin */ ;
+
+ return 1;
+}
+
+
+/*
+ * Called by a CPU to restart stopped CPUs.
+ *
+ * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to restart.
+ * - Waits for each to restart.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ */
+int
+restart_cpus(u_int map)
+{
+ if (!smp_started)
+ return 0;
+
+ started_cpus = map; /* signal other cpus to restart */
+
+ while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */
+ /* spin */ ;
+
+ return 1;
+}
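
The intended pairing of the two calls, e.g. around debugger or panic
processing, looks roughly like this (a hedged sketch):

	if (smp_started)
		stop_cpus(other_cpus);		/* freeze everyone else */

	/* ... work that must run single-threaded: ddb, dumps, etc ... */

	if (smp_started)
		restart_cpus(stopped_cpus);	/* let them go again */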
+
+int smp_active = 0; /* are the APs allowed to run? */
+SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, "");
+
+/* XXX maybe should be hw.ncpu */
+static int smp_cpus = 1; /* how many cpu's running */
+SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, "");
+
+int invltlb_ok = 0; /* throttle smp_invltlb() till safe */
+SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");
+
+/* Warning: Do not staticize. Used from swtch.s */
+int do_page_zero_idle = 1; /* bzero pages for fun and profit in idleloop */
+SYSCTL_INT(_machdep, OID_AUTO, do_page_zero_idle, CTLFLAG_RW,
+ &do_page_zero_idle, 0, "");
+
+/* Is forwarding of an interrupt to the CPU holding the ISR lock enabled? */
+int forward_irq_enabled = 1;
+SYSCTL_INT(_machdep, OID_AUTO, forward_irq_enabled, CTLFLAG_RW,
+ &forward_irq_enabled, 0, "");
+
+/* Enable forwarding of a signal to a process running on a different CPU */
+static int forward_signal_enabled = 1;
+SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
+ &forward_signal_enabled, 0, "");
+
+/* Enable forwarding of roundrobin to all other cpus */
+static int forward_roundrobin_enabled = 1;
+SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
+ &forward_roundrobin_enabled, 0, "");
+
+/*
+ * This is called once the rest of the system is up and running and we're
+ * ready to let the AP's out of the pen.
+ */
+void ap_init(void);
+
+void
+ap_init()
+{
+ u_int apic_id;
+
+ smp_cpus++;
+
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+ lidt(&r_idt);
+#endif
+
+ /* Build our map of 'other' CPUs. */
+ other_cpus = all_cpus & ~(1 << cpuid);
+
+ printf("SMP: AP CPU #%d Launched!\n", cpuid);
+
+	/* XXX FIXME: i386 specific, and redundant: Set up the FPU. */
+ load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS);
+
+ /* A quick check from sanity claus */
+ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
+ if (cpuid != apic_id) {
+ printf("SMP: cpuid = %d\n", cpuid);
+ printf("SMP: apic_id = %d\n", apic_id);
+ printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
+ panic("cpuid mismatch! boom!!");
+ }
+
+ getmtrr();
+
+ /* Init local apic for irq's */
+ apic_initialize();
+
+ /*
+ * Activate smp_invltlb, although strictly speaking, this isn't
+ * quite correct yet. We should have a bitfield for cpus willing
+ * to accept TLB flush IPI's or something and sync them.
+ */
+ if (smp_cpus == mp_ncpus) {
+ invltlb_ok = 1;
+ smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */
+ smp_active = 1; /* historic */
+ }
+
+ curproc = NULL; /* make sure */
+}
+
+#ifdef BETTER_CLOCK
+
+#define CHECKSTATE_USER 0
+#define CHECKSTATE_SYS 1
+#define CHECKSTATE_INTR 2
+
+/* Do not staticize. Used from apic_vector.s */
+struct proc* checkstate_curproc[NCPU];
+int checkstate_cpustate[NCPU];
+u_long checkstate_pc[NCPU];
+
+extern long cp_time[CPUSTATES];
+
+#define PC_TO_INDEX(pc, prof) \
+ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
+
+static void
+addupc_intr_forwarded(struct proc *p, int id, int *astmap)
+{
+ int i;
+ struct uprof *prof;
+ u_long pc;
+
+ pc = checkstate_pc[id];
+ prof = &p->p_stats->p_prof;
+ if (pc >= prof->pr_off &&
+ (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) {
+ if ((p->p_flag & P_OWEUPC) == 0) {
+ prof->pr_addr = pc;
+ prof->pr_ticks = 1;
+ p->p_flag |= P_OWEUPC;
+ }
+ *astmap |= (1 << id);
+ }
+}
+
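PC_TO_INDEX converts a user pc into a profil(2) buffer slot: pr_scale
is a 16.16 binary fraction, so the multiply-and-shift is fixed-point
scaling, and the final '& ~1' keeps the byte offset aligned for the
u_short counters.  A worked example with made-up numbers:

	/*
	 * With pr_off = 0x1000 and pr_scale = 0x8000 (0.5 in 16.16
	 * fixed point, i.e. two text bytes per profile byte):
	 *
	 *	pc    = 0x1234
	 *	index = (((0x1234 - 0x1000) * 0x8000) >> 16) & ~1
	 *	      = ((0x234 * 0x8000) >> 16) & ~1
	 *	      = 0x11a
	 *
	 * so the tick is counted in the u_short at byte offset 0x11a
	 * of the profiling buffer.
	 */
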
+static void
+forwarded_statclock(int id, int pscnt, int *astmap)
+{
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+ int cpustate;
+ struct proc *p;
+#ifdef GPROF
+ register struct gmonparam *g;
+ int i;
+#endif
+
+ p = checkstate_curproc[id];
+ cpustate = checkstate_cpustate[id];
+
+ switch (cpustate) {
+ case CHECKSTATE_USER:
+ if (p->p_flag & P_PROFIL)
+ addupc_intr_forwarded(p, id, astmap);
+ if (pscnt > 1)
+ return;
+ p->p_uticks++;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ break;
+ case CHECKSTATE_SYS:
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = checkstate_pc[id] - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+ if (pscnt > 1)
+ return;
+
+ if (!p)
+ cp_time[CP_IDLE]++;
+ else {
+ p->p_sticks++;
+ cp_time[CP_SYS]++;
+ }
+ break;
+ case CHECKSTATE_INTR:
+ default:
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = checkstate_pc[id] - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+ if (pscnt > 1)
+ return;
+ if (p)
+ p->p_iticks++;
+ cp_time[CP_INTR]++;
+ }
+ if (p != NULL) {
+ p->p_cpticks++;
+ if (++p->p_estcpu == 0)
+ p->p_estcpu--;
+ if ((p->p_estcpu & 3) == 0) {
+ resetpriority(p);
+ if (p->p_priority >= PUSER)
+ p->p_priority = p->p_usrpri;
+ }
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+ }
+}
+
+void
+forward_statclock(int pscnt)
+{
+ int map;
+ int id;
+ int i;
+
+ /* Kludge. We don't yet have separate locks for the interrupts
+ * and the kernel. This means that we cannot let the other processors
+ * handle complex interrupts while inhibiting them from entering
+ * the kernel in a non-interrupt context.
+ *
+ * What we can do, without changing the locking mechanisms yet,
+	 * is to let the other processors handle a very simple interrupt
+	 * (which determines the processor states), and do the main
+	 * work ourselves.
+ */
+
+ if (!smp_started || !invltlb_ok || cold || panicstr)
+ return;
+
+	/* Step 1: Probe state (user, cpu, interrupt, spinlock, idle) */
+
+ map = other_cpus & ~stopped_cpus ;
+ checkstate_probed_cpus = 0;
+ if (map != 0)
+ selected_apic_ipi(map,
+ XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
+
+ i = 0;
+ while (checkstate_probed_cpus != map) {
+ /* spin */
+ i++;
+ if (i == 100000) {
+#ifdef BETTER_CLOCK_DIAGNOSTIC
+ printf("forward_statclock: checkstate %x\n",
+ checkstate_probed_cpus);
+#endif
+ break;
+ }
+ }
+
+ /*
+	 * Step 2: walk through other processors' processes, update ticks and
+ * profiling info.
+ */
+
+ map = 0;
+ for (id = 0; id < mp_ncpus; id++) {
+ if (id == cpuid)
+ continue;
+ if (((1 << id) & checkstate_probed_cpus) == 0)
+ continue;
+ forwarded_statclock(id, pscnt, &map);
+ }
+ if (map != 0) {
+ checkstate_need_ast |= map;
+ selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
+ i = 0;
+ while ((checkstate_need_ast & map) != 0) {
+ /* spin */
+ i++;
+ if (i > 100000) {
+#ifdef BETTER_CLOCK_DIAGNOSTIC
+ printf("forward_statclock: dropped ast 0x%x\n",
+ checkstate_need_ast & map);
+#endif
+ break;
+ }
+ }
+ }
+}
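
The bit-mask handshake used in both steps is symmetric: the initiator
sets one bit per target and spins until each target clears its own bit
from its IPI handler.  A minimal sketch of the protocol; the
handler-side fragment is hypothetical, the real work lives in
apic_vector.s:

	/* initiator (this CPU): */
	checkstate_need_ast |= map;
	selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
	while (checkstate_need_ast & map)
		/* spin, with the bounded-iteration escape used above */ ;

	/* each target, conceptually, in its XCPUAST handler: */
	atomic_clear_int(&checkstate_need_ast, 1 << cpuid);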
+
+void
+forward_hardclock(int pscnt)
+{
+ int map;
+ int id;
+ struct proc *p;
+ struct pstats *pstats;
+ int i;
+
+ /* Kludge. We don't yet have separate locks for the interrupts
+ * and the kernel. This means that we cannot let the other processors
+ * handle complex interrupts while inhibiting them from entering
+ * the kernel in a non-interrupt context.
+ *
+ * What we can do, without changing the locking mechanisms yet,
+	 * is to let the other processors handle a very simple interrupt
+	 * (which determines the processor states), and do the main
+	 * work ourselves.
+ */
+
+ if (!smp_started || !invltlb_ok || cold || panicstr)
+ return;
+
+ /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle) */
+
+ map = other_cpus & ~stopped_cpus ;
+ checkstate_probed_cpus = 0;
+ if (map != 0)
+ selected_apic_ipi(map,
+ XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
+
+ i = 0;
+ while (checkstate_probed_cpus != map) {
+ /* spin */
+ i++;
+ if (i == 100000) {
+#ifdef BETTER_CLOCK_DIAGNOSTIC
+ printf("forward_hardclock: checkstate %x\n",
+ checkstate_probed_cpus);
+#endif
+ break;
+ }
+ }
+
+ /*
+	 * Step 2: walk through other processors' processes, update virtual
+ * timer and profiling timer. If stathz == 0, also update ticks and
+ * profiling info.
+ */
+
+ map = 0;
+ for (id = 0; id < mp_ncpus; id++) {
+ if (id == cpuid)
+ continue;
+ if (((1 << id) & checkstate_probed_cpus) == 0)
+ continue;
+ p = checkstate_curproc[id];
+ if (p) {
+ pstats = p->p_stats;
+ if (checkstate_cpustate[id] == CHECKSTATE_USER &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
+ psignal(p, SIGVTALRM);
+ map |= (1 << id);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
+ psignal(p, SIGPROF);
+ map |= (1 << id);
+ }
+ }
+ if (stathz == 0) {
+			forwarded_statclock(id, pscnt, &map);
+ }
+ }
+ if (map != 0) {
+ checkstate_need_ast |= map;
+ selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
+ i = 0;
+ while ((checkstate_need_ast & map) != 0) {
+ /* spin */
+ i++;
+ if (i > 100000) {
+#ifdef BETTER_CLOCK_DIAGNOSTIC
+ printf("forward_hardclock: dropped ast 0x%x\n",
+ checkstate_need_ast & map);
+#endif
+ break;
+ }
+ }
+ }
+}
+
+#endif /* BETTER_CLOCK */
+
+void
+forward_signal(struct proc *p)
+{
+ int map;
+ int id;
+ int i;
+
+ /* Kludge. We don't yet have separate locks for the interrupts
+ * and the kernel. This means that we cannot let the other processors
+ * handle complex interrupts while inhibiting them from entering
+ * the kernel in a non-interrupt context.
+ *
+ * What we can do, without changing the locking mechanisms yet,
+	 * is to let the other processors handle a very simple interrupt
+	 * (which determines the processor states), and do the main
+	 * work ourselves.
+ */
+
+ if (!smp_started || !invltlb_ok || cold || panicstr)
+ return;
+ if (!forward_signal_enabled)
+ return;
+ while (1) {
+ if (p->p_stat != SRUN)
+ return;
+ id = (u_char) p->p_oncpu;
+ if (id == 0xff)
+ return;
+ map = (1<<id);
+ checkstate_need_ast |= map;
+ selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
+ i = 0;
+ while ((checkstate_need_ast & map) != 0) {
+ /* spin */
+ i++;
+ if (i > 100000) {
+#if 0
+ printf("forward_signal: dropped ast 0x%x\n",
+ checkstate_need_ast & map);
+#endif
+ break;
+ }
+ }
+ if (id == (u_char) p->p_oncpu)
+ return;
+ }
+}
+
+void
+forward_roundrobin(void)
+{
+ u_int map;
+ int i;
+
+ if (!smp_started || !invltlb_ok || cold || panicstr)
+ return;
+ if (!forward_roundrobin_enabled)
+ return;
+ resched_cpus |= other_cpus;
+ map = other_cpus & ~stopped_cpus ;
+#if 1
+ selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
+#else
+ (void) all_but_self_ipi(XCPUAST_OFFSET);
+#endif
+ i = 0;
+ while ((checkstate_need_ast & map) != 0) {
+ /* spin */
+ i++;
+ if (i > 100000) {
+#if 0
+ printf("forward_roundrobin: dropped ast 0x%x\n",
+ checkstate_need_ast & map);
+#endif
+ break;
+ }
+ }
+}
+
+
+#ifdef APIC_INTR_REORDER
+/*
+ * Maintain mapping from softintr vector to isr bit in local apic.
+ */
+void
+set_lapic_isrloc(int intr, int vector)
+{
+ if (intr < 0 || intr > 32)
+ panic("set_apic_isrloc: bad intr argument: %d",intr);
+ if (vector < ICU_OFFSET || vector > 255)
+ panic("set_apic_isrloc: bad vector argument: %d",vector);
+ apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2);
+ apic_isrbit_location[intr].bit = (1<<(vector & 31));
+}
+#endif
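
The pointer arithmetic above exploits the local APIC register layout:
one 32-bit ISR register per 32 vectors, with the registers sitting on
16-byte boundaries (hence four u_ints apart in struct lapic).  An
equivalent, more explicit decomposition:

	int reg = vector >> 5;		/* which ISR register, 0..7 */
	int bit = vector & 31;		/* which bit within it */

	/* stepping (reg << 2) u_ints from &lapic.isr0 is reg * 16 bytes: */
	apic_isrbit_location[intr].location = &lapic.isr0 + (reg << 2);
	apic_isrbit_location[intr].bit = 1 << bit;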
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 57195f3..42b0c85 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -1,6 +1,7 @@
/*-
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
@@ -33,515 +34,1145 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)trap.c 7.4 (Berkeley) 5/13/91
- *
- * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE
- * -------------------- ----- ----------------------
- * CURRENT PATCH LEVEL: 1 00137
- * -------------------- ----- ----------------------
- *
- * 08 Apr 93 Bruce Evans Several VM system fixes
- * Paul Kranenburg Add counter for vmstat
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ * $Id: trap.c,v 1.132 1998/12/28 23:02:56 msmith Exp $
*/
-static char rcsid[] = "$Header: /usr/bill/working/sys/i386/i386/RCS/trap.c,v 1.2 92/01/21 14:22:13 william Exp $";
/*
- * 386 Trap and System call handleing
+ * 386 Trap and System call handling
*/
-#include "machine/cpu.h"
-#include "machine/psl.h"
-#include "machine/reg.h"
+#include "opt_cpu.h"
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+#include "opt_trap.h"
+#include "opt_vm86.h"
-#include "param.h"
-#include "systm.h"
-#include "proc.h"
-#include "user.h"
-#include "acct.h"
-#include "kernel.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/kernel.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/uio.h>
+#include <sys/vmmeter.h>
#ifdef KTRACE
-#include "ktrace.h"
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include <machine/cpu.h>
+#include <machine/ipl.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#ifdef SMP
+#include <machine/smp.h>
#endif
+#include <machine/tss.h>
+
+#include <i386/isa/intr_machdep.h>
+
+#ifdef POWERFAIL_NMI
+#include <sys/syslog.h>
+#include <machine/clock.h>
+#endif
+
+#ifdef VM86
+#include <machine/vm86.h>
+#endif
+
+#ifdef DDB
+ extern int in_Debugger, debugger_on_panic;
+#endif
+
+#include "isa.h"
+#include "npx.h"
+
+extern struct i386tss common_tss;
+
+int (*pmath_emulate) __P((struct trapframe *));
+
+extern void trap __P((struct trapframe frame));
+extern int trapwrite __P((unsigned addr));
+extern void syscall __P((struct trapframe frame));
+
+static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
+static void trap_fatal __P((struct trapframe *, vm_offset_t));
+void dblfault_handler __P((void));
+
+extern inthand_t IDTVEC(syscall);
+
+#define MAX_TRAP_MSG 28
+static char *trap_msg[] = {
+ "", /* 0 unused */
+ "privileged instruction fault", /* 1 T_PRIVINFLT */
+ "", /* 2 unused */
+ "breakpoint instruction fault", /* 3 T_BPTFLT */
+ "", /* 4 unused */
+ "", /* 5 unused */
+ "arithmetic trap", /* 6 T_ARITHTRAP */
+ "system forced exception", /* 7 T_ASTFLT */
+ "", /* 8 unused */
+ "general protection fault", /* 9 T_PROTFLT */
+ "trace trap", /* 10 T_TRCTRAP */
+ "", /* 11 unused */
+ "page fault", /* 12 T_PAGEFLT */
+ "", /* 13 unused */
+ "alignment fault", /* 14 T_ALIGNFLT */
+ "", /* 15 unused */
+ "", /* 16 unused */
+ "", /* 17 unused */
+ "integer divide fault", /* 18 T_DIVIDE */
+ "non-maskable interrupt trap", /* 19 T_NMI */
+ "overflow trap", /* 20 T_OFLOW */
+ "FPU bounds check fault", /* 21 T_BOUND */
+ "FPU device not available", /* 22 T_DNA */
+ "double fault", /* 23 T_DOUBLEFLT */
+ "FPU operand fetch fault", /* 24 T_FPOPFLT */
+ "invalid TSS fault", /* 25 T_TSSFLT */
+ "segment not present fault", /* 26 T_SEGNPFLT */
+ "stack fault", /* 27 T_STKFLT */
+ "machine check trap", /* 28 T_MCHK */
+};
+
+static __inline void userret __P((struct proc *p, struct trapframe *frame,
+ u_quad_t oticks));
-#include "vm/vm_param.h"
-#include "vm/pmap.h"
-#include "vm/vm_map.h"
-#include "sys/vmmeter.h"
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+extern struct gate_descriptor *t_idt;
+extern int has_f00f_bug;
+#endif
-#include "machine/trap.h"
+static __inline void
+userret(p, frame, oticks)
+ struct proc *p;
+ struct trapframe *frame;
+ u_quad_t oticks;
+{
+ int sig, s;
+ while ((sig = CURSIG(p)) != 0)
+ postsig(sig);
-struct sysent sysent[];
-int nsysent;
-int dostacklimits;
-unsigned rcr2();
-extern short cpl;
+#if 0
+ if (!want_resched &&
+ (p->p_priority <= p->p_usrpri) &&
+ (p->p_rtprio.type == RTP_PRIO_NORMAL)) {
+ int newpriority;
+ p->p_estcpu += 1;
+ newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
+ newpriority = min(newpriority, MAXPRI);
+ p->p_usrpri = newpriority;
+ }
+#endif
+
+ p->p_priority = p->p_usrpri;
+ if (want_resched) {
+ /*
+ * Since we are curproc, clock will normally just change
+ * our priority without moving us from one queue to another
+ * (since the running process is not on a queue.)
+ * If that happened after we setrunqueue ourselves but before we
+ * mi_switch()'ed, we might not be on the queue indicated by
+ * our priority.
+ */
+ s = splhigh();
+ setrunqueue(p);
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ splx(s);
+ while ((sig = CURSIG(p)) != 0)
+ postsig(sig);
+ }
+ /*
+ * Charge system time if profiling.
+ */
+ if (p->p_flag & P_PROFIL)
+ addupc_task(p, frame->tf_eip,
+ (u_int)(p->p_sticks - oticks) * psratio);
+ curpriority = p->p_priority;
+}
/*
- * trap(frame):
- * Exception, fault, and trap interface to BSD kernel. This
- * common code is called from assembly language IDT gate entry
+ * Exception, fault, and trap interface to the FreeBSD kernel.
+ * This common code is called from assembly language IDT gate entry
* routines that prepare a suitable stack frame, and restore this
- * frame after the exception has been processed. Note that the
- * effect is as if the arguments were passed call by reference.
+ * frame after the exception has been processed.
*/
-/*ARGSUSED*/
+void
trap(frame)
struct trapframe frame;
{
- register int i;
- register struct proc *p = curproc;
- struct timeval syst;
- int ucode, type, code, eva;
+ struct proc *p = curproc;
+ u_quad_t sticks = 0;
+ int i = 0, ucode = 0, type, code;
+ vm_offset_t eva;
- frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */
- type = frame.tf_trapno;
-#include "ddb.h"
-#if NDDB > 0
- if (curpcb && curpcb->pcb_onfault) {
- if (frame.tf_trapno == T_BPTFLT
- || frame.tf_trapno == T_TRCTRAP)
- if (kdb_trap (type, 0, &frame))
- return;
- }
-#endif
-
-/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
- frame.tf_trapno, frame.tf_err, frame.tf_eip,
- frame.tf_cs, rcr2(), frame.tf_esp);*/
-if(curpcb == 0 || curproc == 0) goto we_re_toast;
- if (curpcb->pcb_onfault && frame.tf_trapno != 0xc) {
-copyfault:
- frame.tf_eip = (int)curpcb->pcb_onfault;
- return;
+ if (!(frame.tf_eflags & PSL_I)) {
+ /*
+ * Buggy application or kernel code has disabled interrupts
+ * and then trapped. Enabling interrupts now is wrong, but
+ * it is better than running with interrupts disabled until
+ * they are accidentally enabled later.
+ */
+ type = frame.tf_trapno;
+ if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
+ printf(
+ "pid %ld (%s): trap %d with interrupts disabled\n",
+ (long)curproc->p_pid, curproc->p_comm, type);
+ else if (type != T_BPTFLT && type != T_TRCTRAP)
+ /*
+ * XXX not quite right, since this may be for a
+ * multiple fault in user mode.
+ */
+ printf("kernel trap %d with interrupts disabled\n",
+ type);
+ enable_intr();
}
- syst = p->p_stime;
- if (ISPL(frame.tf_cs) == SEL_UPL) {
- type |= T_USER;
- p->p_regs = (int *)&frame;
- curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */
+ eva = 0;
+ if (frame.tf_trapno == T_PAGEFLT) {
+ /*
+ * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
+ * This problem is worked around by using an interrupt
+ * gate for the pagefault handler. We are finally ready
+ * to read %cr2 and then must reenable interrupts.
+ *
+ * XXX this should be in the switch statement, but the
+		 * NO_F00F_HACK and VM86 goto and ifdefs obfuscate the
+ * flow of control too much for this to be obviously
+ * correct.
+ */
+ eva = rcr2();
+ enable_intr();
}
- ucode=0;
- eva = rcr2();
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+restart:
+#endif
+ type = frame.tf_trapno;
code = frame.tf_err;
- switch (type) {
- default:
- we_re_toast:
-#ifdef KDB
- if (kdb_trap(&psl))
+#ifdef VM86
+ if (in_vm86call) {
+ if (frame.tf_eflags & PSL_VM &&
+ (type == T_PROTFLT || type == T_STKFLT)) {
+ i = vm86_emulate((struct vm86frame *)&frame);
+ if (i != 0)
+ /*
+ * returns to original process
+ */
+ vm86_trap((struct vm86frame *)&frame);
return;
-#endif
-#if NDDB > 0
- if (kdb_trap (type, 0, &frame))
+ }
+ switch (type) {
+ /*
+ * these traps want either a process context, or
+ * assume a normal userspace trap.
+ */
+ case T_PROTFLT:
+ case T_SEGNPFLT:
+ trap_fatal(&frame, eva);
return;
+ case T_TRCTRAP:
+ type = T_BPTFLT; /* kernel breakpoint */
+ /* FALL THROUGH */
+ }
+ goto kernel_trap; /* normal kernel trap handling */
+ }
#endif
- printf("trap type %d code = %x eip = %x cs = %x eflags = %x ",
- frame.tf_trapno, frame.tf_err, frame.tf_eip,
- frame.tf_cs, frame.tf_eflags);
- eva = rcr2();
- printf("cr2 %x cpl %x\n", eva, cpl);
- /* type &= ~T_USER; */ /* XXX what the hell is this */
- panic("trap");
- /*NOTREACHED*/
-
- case T_SEGNPFLT|T_USER:
- case T_STKFLT|T_USER:
- case T_PROTFLT|T_USER: /* protection fault */
- ucode = code + BUS_SEGM_FAULT ;
- i = SIGBUS;
- break;
+ if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
+ /* user trap */
- case T_PRIVINFLT|T_USER: /* privileged instruction fault */
- case T_RESADFLT|T_USER: /* reserved addressing fault */
- case T_RESOPFLT|T_USER: /* reserved operand fault */
- case T_FPOPFLT|T_USER: /* coprocessor operand fault */
- ucode = type &~ T_USER;
- i = SIGILL;
- break;
+ sticks = p->p_sticks;
+ p->p_md.md_regs = &frame;
+
+ switch (type) {
+ case T_PRIVINFLT: /* privileged instruction fault */
+ ucode = type;
+ i = SIGILL;
+ break;
+
+ case T_BPTFLT: /* bpt instruction fault */
+ case T_TRCTRAP: /* trace trap */
+ frame.tf_eflags &= ~PSL_T;
+ i = SIGTRAP;
+ break;
+
+ case T_ARITHTRAP: /* arithmetic trap */
+ ucode = code;
+ i = SIGFPE;
+ break;
+
+ case T_ASTFLT: /* Allow process switch */
+ astoff();
+ cnt.v_soft++;
+ if (p->p_flag & P_OWEUPC) {
+ p->p_flag &= ~P_OWEUPC;
+ addupc_task(p, p->p_stats->p_prof.pr_addr,
+ p->p_stats->p_prof.pr_ticks);
+ }
+ goto out;
+
+ /*
+ * The following two traps can happen in
+ * vm86 mode, and, if so, we want to handle
+ * them specially.
+ */
+ case T_PROTFLT: /* general protection fault */
+ case T_STKFLT: /* stack fault */
+#ifdef VM86
+ if (frame.tf_eflags & PSL_VM) {
+ i = vm86_emulate((struct vm86frame *)&frame);
+ if (i == 0)
+ goto out;
+ break;
+ }
+#endif /* VM86 */
+ /* FALL THROUGH */
+
+ case T_SEGNPFLT: /* segment not present fault */
+ case T_TSSFLT: /* invalid TSS fault */
+ case T_DOUBLEFLT: /* double fault */
+ default:
+ ucode = code + BUS_SEGM_FAULT ;
+ i = SIGBUS;
+ break;
+
+ case T_PAGEFLT: /* page fault */
+ i = trap_pfault(&frame, TRUE, eva);
+ if (i == -1)
+ return;
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+ if (i == -2)
+ goto restart;
+#endif
+ if (i == 0)
+ goto out;
+
+ ucode = T_PAGEFLT;
+ break;
+
+ case T_DIVIDE: /* integer divide fault */
+ ucode = FPE_INTDIV_TRAP;
+ i = SIGFPE;
+ break;
+
+#if NISA > 0
+ case T_NMI:
+#ifdef POWERFAIL_NMI
+ goto handle_powerfail;
+#else /* !POWERFAIL_NMI */
+#ifdef DDB
+ /* NMI can be hooked up to a pushbutton for debugging */
+ printf ("NMI ... going to debugger\n");
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif /* DDB */
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) return;
+ panic("NMI indicates hardware failure");
+#endif /* POWERFAIL_NMI */
+#endif /* NISA > 0 */
+
+ case T_OFLOW: /* integer overflow fault */
+ ucode = FPE_INTOVF_TRAP;
+ i = SIGFPE;
+ break;
- case T_ASTFLT|T_USER: /* Allow process switch */
- astoff();
- cnt.v_soft++;
- if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
- addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
- p->p_flag &= ~SOWEUPC;
+ case T_BOUND: /* bounds check fault */
+ ucode = FPE_SUBRNG_TRAP;
+ i = SIGFPE;
+ break;
+
+ case T_DNA:
+#if NNPX > 0
+ /* if a transparent fault (due to context switch "late") */
+ if (npxdna())
+ return;
+#endif
+ if (!pmath_emulate) {
+ i = SIGFPE;
+ ucode = FPE_FPU_NP_TRAP;
+ break;
+ }
+ i = (*pmath_emulate)(&frame);
+ if (i == 0) {
+ if (!(frame.tf_eflags & PSL_T))
+ return;
+ frame.tf_eflags &= ~PSL_T;
+ i = SIGTRAP;
+ }
+ /* else ucode = emulator_only_knows() XXX */
+ break;
+
+ case T_FPOPFLT: /* FPU operand fetch fault */
+ ucode = T_FPOPFLT;
+ i = SIGILL;
+ break;
}
- goto out;
+ } else {
+#ifdef VM86
+kernel_trap:
+#endif
+ /* kernel trap */
+
+ switch (type) {
+ case T_PAGEFLT: /* page fault */
+ (void) trap_pfault(&frame, FALSE, eva);
+ return;
- case T_DNA|T_USER:
-#ifdef NPX
- /* if a transparent fault (due to context switch "late") */
- if (npxdna()) return;
+ case T_DNA:
+#if NNPX > 0
+ /*
+ * The kernel is apparently using npx for copying.
+ * XXX this should be fatal unless the kernel has
+ * registered such use.
+ */
+ if (npxdna())
+ return;
#endif
- i = math_emulate(&frame);
- if (i == 0) return;
- ucode = FPE_FPU_NP_TRAP;
- break;
+ break;
- case T_BOUND|T_USER:
- ucode = FPE_SUBRNG_TRAP;
- i = SIGFPE;
- break;
+ case T_PROTFLT: /* general protection fault */
+ case T_SEGNPFLT: /* segment not present fault */
+ /*
+ * Invalid segment selectors and out of bounds
+ * %eip's and %esp's can be set up in user mode.
+ * This causes a fault in kernel mode when the
+ * kernel tries to return to user mode. We want
+ * to get this fault so that we can fix the
+ * problem here and not have to check all the
+ * selectors and pointers when the user changes
+ * them.
+ */
+#define MAYBE_DORETI_FAULT(where, whereto) \
+ do { \
+ if (frame.tf_eip == (int)where) { \
+ frame.tf_eip = (int)whereto; \
+ return; \
+ } \
+ } while (0)
- case T_OFLOW|T_USER:
- ucode = FPE_INTOVF_TRAP;
- i = SIGFPE;
- break;
+ if (intr_nesting_level == 0) {
+ /*
+ * Invalid %fs's and %gs's can be created using
+ * procfs or PT_SETREGS or by invalidating the
+ * underlying LDT entry. This causes a fault
+ * in kernel mode when the kernel attempts to
+ * switch contexts. Lose the bad context
+ * (XXX) so that we can continue, and generate
+ * a signal.
+ */
+ if (frame.tf_eip == (int)cpu_switch_load_fs) {
+ curpcb->pcb_fs = 0;
+ psignal(p, SIGBUS);
+ return;
+ }
+ if (frame.tf_eip == (int)cpu_switch_load_gs) {
+ curpcb->pcb_gs = 0;
+ psignal(p, SIGBUS);
+ return;
+ }
+ MAYBE_DORETI_FAULT(doreti_iret,
+ doreti_iret_fault);
+ MAYBE_DORETI_FAULT(doreti_popl_ds,
+ doreti_popl_ds_fault);
+ MAYBE_DORETI_FAULT(doreti_popl_es,
+ doreti_popl_es_fault);
+ if (curpcb && curpcb->pcb_onfault) {
+ frame.tf_eip = (int)curpcb->pcb_onfault;
+ return;
+ }
+ }
+ break;
- case T_DIVIDE|T_USER:
- ucode = FPE_INTDIV_TRAP;
- i = SIGFPE;
- break;
+ case T_TSSFLT:
+ /*
+ * PSL_NT can be set in user mode and isn't cleared
+ * automatically when the kernel is entered. This
+ * causes a TSS fault when the kernel attempts to
+ * `iret' because the TSS link is uninitialized. We
+ * want to get this fault so that we can fix the
+ * problem here and not every time the kernel is
+ * entered.
+ */
+ if (frame.tf_eflags & PSL_NT) {
+ frame.tf_eflags &= ~PSL_NT;
+ return;
+ }
+ break;
- case T_ARITHTRAP|T_USER:
- ucode = code;
- i = SIGFPE;
- break;
+ case T_TRCTRAP: /* trace trap */
+ if (frame.tf_eip == (int)IDTVEC(syscall)) {
+ /*
+ * We've just entered system mode via the
+ * syscall lcall. Continue single stepping
+ * silently until the syscall handler has
+ * saved the flags.
+ */
+ return;
+ }
+ if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
+ /*
+ * The syscall handler has now saved the
+ * flags. Stop single stepping it.
+ */
+ frame.tf_eflags &= ~PSL_T;
+ return;
+ }
+ /*
+ * Fall through.
+ */
+ case T_BPTFLT:
+ /*
+ * If DDB is enabled, let it handle the debugger trap.
+ * Otherwise, debugger traps "can't happen".
+ */
+#ifdef DDB
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif
+ break;
- case T_PAGEFLT: /* allow page faults in kernel mode */
-#if 0
- /* XXX - check only applies to 386's and 486's with WP off */
- if (code & PGEX_P) goto we_re_toast;
+#if NISA > 0
+ case T_NMI:
+#ifdef POWERFAIL_NMI
+#ifndef TIMER_FREQ
+# define TIMER_FREQ 1193182
#endif
+ handle_powerfail:
+ {
+ static unsigned lastalert = 0;
+
+ if(time_second - lastalert > 10)
+ {
+ log(LOG_WARNING, "NMI: power fail\n");
+ sysbeep(TIMER_FREQ/880, hz);
+ lastalert = time_second;
+ }
+ return;
+ }
+#else /* !POWERFAIL_NMI */
+#ifdef DDB
+ /* NMI can be hooked up to a pushbutton for debugging */
+ printf ("NMI ... going to debugger\n");
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif /* DDB */
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) return;
+ /* FALL THROUGH */
+#endif /* POWERFAIL_NMI */
+#endif /* NISA > 0 */
+ }
+
+ trap_fatal(&frame, eva);
+ return;
+ }
+
+ /* Translate fault for emulators (e.g. Linux) */
+ if (*p->p_sysent->sv_transtrap)
+ i = (*p->p_sysent->sv_transtrap)(i, type);
+
+ trapsignal(p, i, ucode);
+
+#ifdef DEBUG
+ if (type <= MAX_TRAP_MSG) {
+ uprintf("fatal process exception: %s",
+ trap_msg[type]);
+ if ((type == T_PAGEFLT) || (type == T_PROTFLT))
+ uprintf(", fault VA = 0x%lx", (u_long)eva);
+ uprintf("\n");
+ }
+#endif
+
+out:
+ userret(p, &frame, sticks);
+}
+
+#ifdef notyet
+/*
+ * This version doesn't allow a page fault to user space while
+ * in the kernel. The rest of the kernel needs to be made "safe"
+ * before this can be used. I think the only things remaining
+ * to be made safe are the iBCS2 code and the process tracing/
+ * debugging code.
+ */
+static int
+trap_pfault(frame, usermode, eva)
+ struct trapframe *frame;
+ int usermode;
+ vm_offset_t eva;
+{
+ vm_offset_t va;
+ struct vmspace *vm = NULL;
+ vm_map_t map = 0;
+ int rv = 0;
+ vm_prot_t ftype;
+ struct proc *p = curproc;
+
+ if (frame->tf_err & PGEX_W)
+ ftype = VM_PROT_READ | VM_PROT_WRITE;
+ else
+ ftype = VM_PROT_READ;
+
+ va = trunc_page(eva);
+ if (va < VM_MIN_KERNEL_ADDRESS) {
+ vm_offset_t v;
+ vm_page_t mpte;
+
+ if (p == NULL ||
+ (!usermode && va < VM_MAXUSER_ADDRESS &&
+ (intr_nesting_level != 0 || curpcb == NULL ||
+ curpcb->pcb_onfault == NULL))) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
- /* fall into */
- case T_PAGEFLT|T_USER: /* page fault */
- {
- register vm_offset_t va;
- register struct vmspace *vm = p->p_vmspace;
- register vm_map_t map;
- int rv;
- vm_prot_t ftype;
- extern vm_map_t kernel_map;
- unsigned nss,v;
-
- va = trunc_page((vm_offset_t)eva);
/*
- * Avoid even looking at pde_v(va) for high va's. va's
- * above VM_MAX_KERNEL_ADDRESS don't correspond to normal
- * PDE's (half of them correspond to APDEpde and half to
- * an unmapped kernel PDE). va's betweeen 0xFEC00000 and
- * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's
- * (XXX - why are only 3 initialized when 6 are required to
- * reach VM_MAX_KERNEL_ADDRESS?). Faulting in an unmapped
- * kernel page table would give inconsistent PTD's.
- *
- * XXX - faulting in unmapped page tables wastes a page if
- * va turns out to be invalid.
- *
- * XXX - should "kernel address space" cover the kernel page
- * tables? Might have same problem with PDEpde as with
- * APDEpde (or there may be no problem with APDEpde).
+ * This is a fault on non-kernel virtual memory.
+ * vm is initialized above to NULL. If curproc is NULL
+ * or curproc->p_vmspace is NULL the fault is fatal.
*/
- if (va > 0xFEBFF000) {
- rv = KERN_FAILURE; /* becomes SIGBUS */
+ vm = p->p_vmspace;
+ if (vm == NULL)
goto nogo;
- }
+
+ map = &vm->vm_map;
+
/*
- * It is only a kernel address space fault iff:
- * 1. (type & T_USER) == 0 and
- * 2. pcb_onfault not set or
- * 3. pcb_onfault set but supervisor space fault
- * The last can occur during an exec() copyin where the
- * argument space is lazy-allocated.
+ * Keep swapout from messing with us during this
+ * critical time.
*/
- if (type == T_PAGEFLT && va >= KERNBASE)
- map = kernel_map;
- else
- map = &vm->vm_map;
- if (code & PGEX_W)
- ftype = VM_PROT_READ | VM_PROT_WRITE;
- else
- ftype = VM_PROT_READ;
-
-#ifdef DEBUG
- if (map == kernel_map && va == 0) {
- printf("trap: bad kernel access at %x\n", va);
- goto we_re_toast;
- }
-#endif
+ ++p->p_lock;
/*
- * XXX: rude hack to make stack limits "work"
+ * Grow the stack if necessary
*/
- nss = 0;
- if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map
- && dostacklimits) {
- nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
- + MAXSSIZ - (unsigned)va));
- if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
-/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
+#ifndef VM_STACK
+ if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) {
+ if (!grow(p, va)) {
rv = KERN_FAILURE;
+ --p->p_lock;
goto nogo;
}
}
- /* check if page table is mapped, if not, fault it first */
-#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
- if (!pde_v(va)) {
- v = trunc_page(vtopte(va));
- rv = vm_fault(map, v, ftype, FALSE);
- if (rv != KERN_SUCCESS) goto nogo;
- /* check if page table fault, increment wiring */
- vm_map_pageable(map, v, round_page(v+1), FALSE);
- } else v=0;
- rv = vm_fault(map, va, ftype, FALSE);
- if (rv == KERN_SUCCESS) {
- /*
- * XXX: continuation of rude stack hack
- */
- if (nss > vm->vm_ssize)
- vm->vm_ssize = nss;
- va = trunc_page(vtopte(va));
- /* for page table, increment wiring
- as long as not a page table fault as well */
- if (!v && type != T_PAGEFLT)
- vm_map_pageable(map, va, round_page(va+1), FALSE);
- if (type == T_PAGEFLT)
- return;
- goto out;
+#else
+ /* grow_stack returns false only if va falls into
+ * a growable stack region and the stack growth
+ * fails. It returns true if va was not within
+ * a growable stack region, or if the stack
+ * growth succeeded.
+ */
+ if (!grow_stack (p, va)) {
+ rv = KERN_FAILURE;
+ --p->p_lock;
+ goto nogo;
}
+#endif
+
+ /* Fault in the user page: */
+ rv = vm_fault(map, va, ftype,
+ (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);
+
+ --p->p_lock;
+ } else {
+ /*
+ * Don't allow user-mode faults in kernel address space.
+ */
+ if (usermode)
+ goto nogo;
+
+ /*
+ * Since we know that kernel virtual address addresses
+ * always have pte pages mapped, we just have to fault
+ * the page.
+ */
+ rv = vm_fault(kernel_map, va, ftype, FALSE);
+ }
+
+ if (rv == KERN_SUCCESS)
+ return (0);
nogo:
- if (type == T_PAGEFLT) {
- if (curpcb->pcb_onfault)
- goto copyfault;
- printf("vm_fault(%x, %x, %x, 0) -> %x\n",
- map, va, ftype, rv);
- printf(" type %x, code %x\n",
- type, code);
- goto we_re_toast;
+ if (!usermode) {
+ if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
+ frame->tf_eip = (int)curpcb->pcb_onfault;
+ return (0);
}
- i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
- break;
- }
+ trap_fatal(frame, eva);
+ return (-1);
+ }
-#if NDDB == 0
- case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */
- frame.tf_eflags &= ~PSL_T;
+ /* kludge to pass faulting virtual address to sendsig */
+ frame->tf_err = eva;
- /* Q: how do we turn it on again? */
- return;
+ return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+}
#endif
-
- case T_BPTFLT|T_USER: /* bpt instruction fault */
- case T_TRCTRAP|T_USER: /* trace trap */
- frame.tf_eflags &= ~PSL_T;
- i = SIGTRAP;
- break;
-#include "isa.h"
-#if NISA > 0
- case T_NMI:
- case T_NMI|T_USER:
-#if NDDB > 0
- /* NMI can be hooked up to a pushbutton for debugging */
- printf ("NMI ... going to debugger\n");
- if (kdb_trap (type, 0, &frame))
- return;
-#endif
- /* machine/parity/power fail/"kitchen sink" faults */
- if(isa_nmi(code) == 0) return;
- else goto we_re_toast;
+int
+trap_pfault(frame, usermode, eva)
+ struct trapframe *frame;
+ int usermode;
+ vm_offset_t eva;
+{
+ vm_offset_t va;
+ struct vmspace *vm = NULL;
+ vm_map_t map = 0;
+ int rv = 0;
+ vm_prot_t ftype;
+ struct proc *p = curproc;
+
+ va = trunc_page(eva);
+ if (va >= KERNBASE) {
+ /*
+ * Don't allow user-mode faults in kernel address space.
+ * An exception: if the faulting address is the invalid
+ * instruction entry in the IDT, then the Intel Pentium
+ * F00F bug workaround was triggered, and we need to
+	 * treat it as an illegal instruction, and not a page
+ * fault.
+ */
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+ if ((eva == (unsigned int)&t_idt[6]) && has_f00f_bug) {
+ frame->tf_trapno = T_PRIVINFLT;
+ return -2;
+ }
#endif
- }
+ if (usermode)
+ goto nogo;
- trapsignal(p, i, ucode);
- if ((type & T_USER) == 0)
- return;
-out:
- while (i = CURSIG(p))
- psig(i);
- p->p_pri = p->p_usrpri;
- if (want_resched) {
+ map = kernel_map;
+ } else {
/*
- * Since we are curproc, clock will normally just change
- * our priority without moving us from one queue to another
- * (since the running process is not on a queue.)
- * If that happened after we setrq ourselves but before we
- * swtch()'ed, we might not be on the queue indicated by
- * our priority.
+ * This is a fault on non-kernel virtual memory.
+ * vm is initialized above to NULL. If curproc is NULL
+ * or curproc->p_vmspace is NULL the fault is fatal.
*/
- (void) splclock();
- setrq(p);
- p->p_stats->p_ru.ru_nivcsw++;
- swtch();
- (void) splnone();
- while (i = CURSIG(p))
- psig(i);
+ if (p != NULL)
+ vm = p->p_vmspace;
+
+ if (vm == NULL)
+ goto nogo;
+
+ map = &vm->vm_map;
}
- if (p->p_stats->p_prof.pr_scale) {
- int ticks;
- struct timeval *tv = &p->p_stime;
-
- ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
- (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
- if (ticks) {
-#ifdef PROFTIMER
- extern int profscale;
- addupc(frame.tf_eip, &p->p_stats->p_prof,
- ticks * profscale);
+
+ if (frame->tf_err & PGEX_W)
+ ftype = VM_PROT_READ | VM_PROT_WRITE;
+ else
+ ftype = VM_PROT_READ;
+
+ if (map != kernel_map) {
+ /*
+ * Keep swapout from messing with us during this
+ * critical time.
+ */
+ ++p->p_lock;
+
+ /*
+ * Grow the stack if necessary
+ */
+#ifndef VM_STACK
+ if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) {
+ if (!grow(p, va)) {
+ rv = KERN_FAILURE;
+ --p->p_lock;
+ goto nogo;
+ }
+ }
#else
- addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
+ /* grow_stack returns false only if va falls into
+ * a growable stack region and the stack growth
+ * fails. It returns true if va was not within
+ * a growable stack region, or if the stack
+ * growth succeeded.
+ */
+ if (!grow_stack (p, va)) {
+ rv = KERN_FAILURE;
+ --p->p_lock;
+ goto nogo;
+ }
#endif
+
+ /* Fault in the user page: */
+ rv = vm_fault(map, va, ftype,
+ (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);
+
+ --p->p_lock;
+ } else {
+ /*
+ * Don't have to worry about process locking or stacks in the kernel.
+ */
+ rv = vm_fault(map, va, ftype, FALSE);
+ }
+
+ if (rv == KERN_SUCCESS)
+ return (0);
+nogo:
+ if (!usermode) {
+ if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
+ frame->tf_eip = (int)curpcb->pcb_onfault;
+ return (0);
}
+ trap_fatal(frame, eva);
+ return (-1);
}
- curpri = p->p_pri;
- curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */
+
+ /* kludge to pass faulting virtual address to sendsig */
+ frame->tf_err = eva;
+
+ return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+}
+
+static void
+trap_fatal(frame, eva)
+ struct trapframe *frame;
+ vm_offset_t eva;
+{
+ int code, type, ss, esp;
+ struct soft_segment_descriptor softseg;
+
+ code = frame->tf_err;
+ type = frame->tf_trapno;
+ sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
+
+ if (type <= MAX_TRAP_MSG)
+ printf("\n\nFatal trap %d: %s while in %s mode\n",
+ type, trap_msg[type],
+ frame->tf_eflags & PSL_VM ? "vm86" :
+ ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
+#ifdef SMP
+	/* three separate prints in case of a trap on an unmapped page */
+ printf("mp_lock = %08x; ", mp_lock);
+ printf("cpuid = %d; ", cpuid);
+ printf("lapic.id = %08x\n", lapic.id);
+#endif
+ if (type == T_PAGEFLT) {
+ printf("fault virtual address = 0x%x\n", eva);
+ printf("fault code = %s %s, %s\n",
+ code & PGEX_U ? "user" : "supervisor",
+ code & PGEX_W ? "write" : "read",
+ code & PGEX_P ? "protection violation" : "page not present");
+ }
+ printf("instruction pointer = 0x%x:0x%x\n",
+ frame->tf_cs & 0xffff, frame->tf_eip);
+ if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
+ ss = frame->tf_ss & 0xffff;
+ esp = frame->tf_esp;
+ } else {
+ ss = GSEL(GDATA_SEL, SEL_KPL);
+ esp = (int)&frame->tf_esp;
+ }
+ printf("stack pointer = 0x%x:0x%x\n", ss, esp);
+ printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
+ printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
+ softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
+ printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
+ softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
+ softseg.ssd_gran);
+ printf("processor eflags = ");
+ if (frame->tf_eflags & PSL_T)
+ printf("trace trap, ");
+ if (frame->tf_eflags & PSL_I)
+ printf("interrupt enabled, ");
+ if (frame->tf_eflags & PSL_NT)
+ printf("nested task, ");
+ if (frame->tf_eflags & PSL_RF)
+ printf("resume, ");
+ if (frame->tf_eflags & PSL_VM)
+ printf("vm86, ");
+ printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
+ printf("current process = ");
+ if (curproc) {
+ printf("%lu (%s)\n",
+ (u_long)curproc->p_pid, curproc->p_comm ?
+ curproc->p_comm : "");
+ } else {
+ printf("Idle\n");
+ }
+ printf("interrupt mask = ");
+ if ((cpl & net_imask) == net_imask)
+ printf("net ");
+ if ((cpl & tty_imask) == tty_imask)
+ printf("tty ");
+ if ((cpl & bio_imask) == bio_imask)
+ printf("bio ");
+ if ((cpl & cam_imask) == cam_imask)
+ printf("cam ");
+ if (cpl == 0)
+ printf("none");
+#ifdef SMP
+/**
+ * XXX FIXME:
+ * we probably SHOULD have stopped the other CPUs before now!
+ * another CPU COULD have been touching cpl at this moment...
+ */
+ printf(" <- SMP: XXX");
+#endif
+ printf("\n");
+
+#ifdef KDB
+ if (kdb_trap(&psl))
+ return;
+#endif
+#ifdef DDB
+ if ((debugger_on_panic || in_Debugger) && kdb_trap(type, 0, frame))
+ return;
+#endif
+ printf("trap number = %d\n", type);
+ if (type <= MAX_TRAP_MSG)
+ panic(trap_msg[type]);
+ else
+ panic("unknown/reserved trap");
}
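
The fault-code strings printed for T_PAGEFLT above decode the i386
page-fault error code; for reference, the relevant bits (these match
the PGEX_* definitions in <machine/trap.h>):

	#define	PGEX_P	0x01	/* set: protection violation; clear: page not present */
	#define	PGEX_W	0x02	/* set: write access; clear: read */
	#define	PGEX_U	0x04	/* set: fault taken while in user mode */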
/*
- * Compensate for 386 brain damage (missing URKR)
+ * Double fault handler. Called when a fault occurs while writing
+ * a frame for a trap/exception onto the stack. This usually occurs
+ * when the stack overflows (such is the case with infinite recursion,
+ * for example).
+ *
+ * XXX Note that the current PTD gets replaced by IdlePTD when the
+ * task switch occurs. This means that the stack that was active at
+ * the time of the double fault is not available at <kstack> unless
+ * the machine was idle when the double fault occurred. The downside
+ * of this is that "trace <ebp>" in ddb won't work.
*/
-int trapwrite(unsigned addr) {
- int rv;
+void
+dblfault_handler()
+{
+ printf("\nFatal double fault:\n");
+ printf("eip = 0x%x\n", common_tss.tss_eip);
+ printf("esp = 0x%x\n", common_tss.tss_esp);
+ printf("ebp = 0x%x\n", common_tss.tss_ebp);
+#ifdef SMP
+	/* three separate prints in case of a trap on an unmapped page */
+ printf("mp_lock = %08x; ", mp_lock);
+ printf("cpuid = %d; ", cpuid);
+ printf("lapic.id = %08x\n", lapic.id);
+#endif
+ panic("double fault");
+}
+
+/*
+ * Compensate for 386 brain damage (missing URKR).
+ * This is a little simpler than the pagefault handler in trap() because
+ * the page tables have already been faulted in and high addresses
+ * are thrown out early for other reasons.
+ */
+int trapwrite(addr)
+ unsigned addr;
+{
+ struct proc *p;
vm_offset_t va;
+ struct vmspace *vm;
+ int rv;
va = trunc_page((vm_offset_t)addr);
- if (va > VM_MAXUSER_ADDRESS) return(1);
- rv = vm_fault(&curproc->p_vmspace->vm_map, va,
- VM_PROT_READ | VM_PROT_WRITE, FALSE);
- if (rv == KERN_SUCCESS) return(0);
- else return(1);
+ /*
+ * XXX - MAX is END. Changed > to >= for temp. fix.
+ */
+ if (va >= VM_MAXUSER_ADDRESS)
+ return (1);
+
+ p = curproc;
+ vm = p->p_vmspace;
+
+ ++p->p_lock;
+
+#ifndef VM_STACK
+ if ((caddr_t)va >= vm->vm_maxsaddr && va < USRSTACK) {
+ if (!grow(p, va)) {
+ --p->p_lock;
+ return (1);
+ }
+ }
+#else
+ if (!grow_stack (p, va)) {
+ --p->p_lock;
+ return (1);
+ }
+#endif
+
+ /*
+ * fault the data page
+ */
+ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY);
+
+ --p->p_lock;
+
+ if (rv != KERN_SUCCESS)
+ return 1;
+
+ return (0);
}
/*
- * syscall(frame):
- * System call request from POSIX system call gate interface to kernel.
+ * System call request from POSIX system call gate interface to kernel.
* Like trap(), argument is call by reference.
*/
-/*ARGSUSED*/
+void
syscall(frame)
- volatile struct syscframe frame;
+ struct trapframe frame;
{
- register int *locr0 = ((int *)&frame);
- register caddr_t params;
- register int i;
- register struct sysent *callp;
- register struct proc *p = curproc;
- struct timeval syst;
- int error, opc;
- int args[8], rval[2];
- int code;
-
-#ifdef lint
- r0 = 0; r0 = r0; r1 = 0; r1 = r1;
-#endif
- syst = p->p_stime;
- if (ISPL(frame.sf_cs) != SEL_UPL)
+ caddr_t params;
+ int i;
+ struct sysent *callp;
+ struct proc *p = curproc;
+ u_quad_t sticks;
+ int error;
+ int args[8];
+ u_int code;
+
+#ifdef DIAGNOSTIC
+ if (ISPL(frame.tf_cs) != SEL_UPL)
panic("syscall");
+#endif
+ sticks = p->p_sticks;
+ p->p_md.md_regs = &frame;
+ params = (caddr_t)frame.tf_esp + sizeof(int);
+ code = frame.tf_eax;
+ if (p->p_sysent->sv_prepsyscall) {
+ (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
+ } else {
+ /*
+ * Need to check if this is a 32 bit or 64 bit syscall.
+ */
+ if (code == SYS_syscall) {
+ /*
+ * Code is first argument, followed by actual args.
+ */
+ code = fuword(params);
+ params += sizeof(int);
+ } else if (code == SYS___syscall) {
+ /*
+ * Like syscall, but code is a quad, so as to maintain
+ * quad alignment for the rest of the arguments.
+ */
+ code = fuword(params);
+ params += sizeof(quad_t);
+ }
+ }
- code = frame.sf_eax;
- curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */
- p->p_regs = (int *)&frame;
- params = (caddr_t)frame.sf_esp + sizeof (int) ;
+ if (p->p_sysent->sv_mask)
+ code &= p->p_sysent->sv_mask;
- /*
- * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
- */
- opc = frame.sf_eip - 7;
- callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
- if (callp == sysent) {
- i = fuword(params);
- params += sizeof (int);
- callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
- }
+ if (code >= p->p_sysent->sv_size)
+ callp = &p->p_sysent->sv_table[0];
+ else
+ callp = &p->p_sysent->sv_table[code];
- if ((i = callp->sy_narg * sizeof (int)) &&
+ if (params && (i = callp->sy_narg * sizeof(int)) &&
(error = copyin(params, (caddr_t)args, (u_int)i))) {
- frame.sf_eax = error;
- frame.sf_eflags |= PSL_C; /* carry bit */
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSCALL))
- ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
+ ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
- goto done;
+ goto bad;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSCALL))
- ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
+ ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
- rval[0] = 0;
- rval[1] = frame.sf_edx;
-/*pg("%d. s %d\n", p->p_pid, code);*/
- error = (*callp->sy_call)(p, args, rval);
- if (error == ERESTART)
- frame.sf_eip = opc;
- else if (error != EJUSTRETURN) {
- if (error) {
-/*pg("error %d", error);*/
- frame.sf_eax = error;
- frame.sf_eflags |= PSL_C; /* carry bit */
- } else {
- frame.sf_eax = rval[0];
- frame.sf_edx = rval[1];
- frame.sf_eflags &= ~PSL_C; /* carry bit */
- }
- }
- /* else if (error == EJUSTRETURN) */
- /* nothing to do */
-done:
- /*
- * Reinitialize proc pointer `p' as it may be different
- * if this is a child returning from fork syscall.
- */
- p = curproc;
- while (i = CURSIG(p))
- psig(i);
- p->p_pri = p->p_usrpri;
- if (want_resched) {
+ p->p_retval[0] = 0;
+ p->p_retval[1] = frame.tf_edx;
+
+ STOPEVENT(p, S_SCE, callp->sy_narg);
+
+ error = (*callp->sy_call)(p, args);
+
+ switch (error) {
+
+ case 0:
/*
- * Since we are curproc, clock will normally just change
- * our priority without moving us from one queue to another
- * (since the running process is not on a queue.)
- * If that happened after we setrq ourselves but before we
- * swtch()'ed, we might not be on the queue indicated by
- * our priority.
+ * Reinitialize proc pointer `p' as it may be different
+ * if this is a child returning from fork syscall.
*/
- (void) splclock();
- setrq(p);
- p->p_stats->p_ru.ru_nivcsw++;
- swtch();
- (void) splnone();
- while (i = CURSIG(p))
- psig(i);
+ p = curproc;
+ frame.tf_eax = p->p_retval[0];
+ frame.tf_edx = p->p_retval[1];
+ frame.tf_eflags &= ~PSL_C;
+ break;
+
+ case ERESTART:
+ /*
+ * Reconstruct pc, assuming lcall $X,y is 7 bytes,
+ * int 0x80 is 2 bytes. We saved this in tf_err.
+ */
+ frame.tf_eip -= frame.tf_err;
+ break;
+
+ case EJUSTRETURN:
+ break;
+
+ default:
+bad:
+ if (p->p_sysent->sv_errsize)
+ if (error >= p->p_sysent->sv_errsize)
+ error = -1; /* XXX */
+ else
+ error = p->p_sysent->sv_errtbl[error];
+ frame.tf_eax = error;
+ frame.tf_eflags |= PSL_C;
+ break;
}
- if (p->p_stats->p_prof.pr_scale) {
- int ticks;
- struct timeval *tv = &p->p_stime;
-
- ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
- (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
- if (ticks) {
-#ifdef PROFTIMER
- extern int profscale;
- addupc(frame.sf_eip, &p->p_stats->p_prof,
- ticks * profscale);
-#else
- addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
-#endif
- }
+
+ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
+ /* Traced syscall. */
+ frame.tf_eflags &= ~PSL_T;
+ trapsignal(p, SIGTRAP, 0);
}
- curpri = p->p_pri;
+
+ userret(p, &frame, sticks);
+
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSRET))
- ktrsysret(p->p_tracep, code, error, rval[0]);
+ ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
#endif
-#ifdef DIAGNOSTICx
-{ extern int _udatasel, _ucodesel;
- if (frame.sf_ss != _udatasel)
- printf("ss %x call %d\n", frame.sf_ss, code);
- if ((frame.sf_cs&0xffff) != _ucodesel)
- printf("cs %x call %d\n", frame.sf_cs, code);
- if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
- printf("eip %x call %d\n", frame.sf_eip, code);
- frame.sf_eip = 0;
- }
+
+ /*
+ * This works because errno is findable through the
+ * register set. If we ever support an emulation where this
+ * is not the case, this code will need to be revisited.
+ */
+ STOPEVENT(p, S_SCX, code);
+
}
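
The SYS_syscall/SYS___syscall indirection handled at the top of
syscall() is what makes the libc syscall(2) and __syscall(2) wrappers
work: the real syscall number rides in as the first argument.  A small
userland illustration (assumes a BSD libc):

	#include <sys/syscall.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Both forms reach the kernel with code == SYS_getpid. */
		printf("getpid():            %ld\n", (long)getpid());
		printf("syscall(SYS_getpid): %ld\n", (long)syscall(SYS_getpid));
		return (0);
	}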
+
+/*
+ * Simplified back end of syscall(), used when returning from fork()
+ * directly into user mode.
+ */
+void
+fork_return(p, frame)
+ struct proc *p;
+ struct trapframe frame;
+{
+ frame.tf_eax = 0; /* Child returns zero */
+ frame.tf_eflags &= ~PSL_C; /* success */
+ frame.tf_edx = 1;
+
+ userret(p, &frame, 0);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_SYSRET))
+ ktrsysret(p->p_tracep, SYS_fork, 0, 0);
#endif
}
diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c
new file mode 100644
index 0000000..7ff3366
--- /dev/null
+++ b/sys/kern/subr_xxx.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93
+ * $Id: subr_xxx.c,v 1.11 1998/08/20 06:10:40 bde Exp $
+ */
+
+/*
+ * Miscellaneous trivial functions.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * Return error for operation not supported
+ * on a specific object or file type.
+ */
+int
+eopnotsupp()
+{
+
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Return error for an invalid operation
+ * on a specific object or file type.
+ */
+int
+einval()
+{
+
+ return (EINVAL);
+}
+
+/*
+ * Generic null operation, always returns success.
+ */
+int
+nullop()
+{
+
+ return (0);
+}
+
+#include <sys/conf.h>
+
+/*
+ * Unsupported devswitch functions (e.g. for writing to read-only device).
+ * XXX may belong elsewhere.
+ */
+
+int
+noopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+int
+noclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+int
+noread(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+nowrite(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+noioctl(dev, cmd, data, flags, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flags;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+void
+nostop(tp, rw)
+ struct tty *tp;
+ int rw;
+{
+
+}
+
+int
+noreset(dev)
+ dev_t dev;
+{
+
+ printf("noreset(0x%x) called\n", dev);
+ return (ENODEV);
+}
+
+struct tty *
+nodevtotty(dev)
+ dev_t dev;
+{
+
+ return (NULL);
+}
+
+int
+nommap(dev, offset, nprot)
+ dev_t dev;
+ vm_offset_t offset;
+ int nprot;
+{
+
+ /* Don't return ENODEV. That would allow mapping address ENODEV! */
+ return (-1);
+}
+
+int
+nodump(dev)
+ dev_t dev;
+{
+
+ return (ENODEV);
+}
+
+/*
+ * Null devswitch functions (for when the operation always succeeds).
+ * XXX may belong elsewhere.
+ * XXX not all are here (e.g., seltrue() isn't).
+ */
+
+/*
+ * XXX this is probably bogus. Any device that uses it isn't checking the
+ * minor number.
+ */
+int
+nullopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (0);
+}
+
+int
+nullclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (0);
+}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 0000000..8d90ee9
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ * $Id: sys_generic.c,v 1.42 1998/11/11 10:03:55 truckman Exp $
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/sysent.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/limits.h>
+
+static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
+static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
+MALLOC_DEFINE(M_IOV, "iov", "large iov's");
+
+static int pollscan __P((struct proc *, struct pollfd *, int));
+static int selscan __P((struct proc *, fd_mask **, fd_mask **, int));
+
+/*
+ * Read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+};
+#endif
+/* ARGSUSED */
+int
+read(p, uap)
+ struct proc *p;
+ register struct read_args *uap;
+{
+ register struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+#endif
+
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ aiov.iov_base = (caddr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = -1;
+ if (uap->nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = uap->nbyte;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(p, KTR_GENIO))
+ ktriov = aiov;
+#endif
+ cnt = uap->nbyte;
+ if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_GENIO) && error == 0)
+ ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error);
+#endif
+ p->p_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Scatter read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+readv(p, uap)
+ struct proc *p;
+ register struct readv_args *uap;
+{
+ register struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ struct uio auio;
+ register struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt, error = 0;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+#endif
+
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV)
+ return (EINVAL);
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(p, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ }
+#endif
+ cnt = auio.uio_resid;
+ if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0)
+ ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
+ cnt, error);
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ p->p_retval[0] = cnt;
+done:
+ if (needfree)
+ FREE(needfree, M_IOV);
+ return (error);
+}
+
+/*
+ * Write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+};
+#endif
+int
+write(p, uap)
+ struct proc *p;
+ register struct write_args *uap;
+{
+ register struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+#endif
+
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FWRITE) == 0)
+ return (EBADF);
+ aiov.iov_base = (caddr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = -1;
+ if (uap->nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = uap->nbyte;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(p, KTR_GENIO))
+ ktriov = aiov;
+#endif
+ cnt = uap->nbyte;
+ if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (error == EPIPE)
+ psignal(p, SIGPIPE);
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_GENIO) && error == 0)
+ ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
+ &ktriov, cnt, error);
+#endif
+ p->p_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Gather write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+writev(p, uap)
+ struct proc *p;
+ register struct writev_args *uap;
+{
+ register struct file *fp;
+ register struct filedesc *fdp = p->p_fd;
+ struct uio auio;
+ register struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt, error = 0;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+#endif
+
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FWRITE) == 0)
+ return (EBADF);
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV)
+ return (EINVAL);
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(p, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ }
+#endif
+ cnt = auio.uio_resid;
+ if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (error == EPIPE)
+ psignal(p, SIGPIPE);
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0)
+ ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
+ ktriov, cnt, error);
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ p->p_retval[0] = cnt;
+done:
+ if (needfree)
+ FREE(needfree, M_IOV);
+ return (error);
+}
+
+/*
+ * Ioctl system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ u_long com;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+ioctl(p, uap)
+ struct proc *p;
+ register struct ioctl_args *uap;
+{
+ register struct file *fp;
+ register struct filedesc *fdp;
+ register u_long com;
+ int error;
+ register u_int size;
+ caddr_t data, memp;
+ int tmp;
+#define STK_PARAMS 128
+ char stkbuf[STK_PARAMS];
+
+ fdp = p->p_fd;
+ if ((u_int)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
+ return (EBADF);
+
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0)
+ return (EBADF);
+
+ switch (com = uap->com) {
+ case FIONCLEX:
+ fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
+ return (0);
+ case FIOCLEX:
+ fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
+ return (0);
+ }
+
+ /*
+ * Interpret high order word to find amount of data to be
+ * copied to/from the user's address space.
+ */
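+	/*
+	 * Decoding sketch, assuming the usual <sys/ioccom.h> layout:
+	 * for com == FIONREAD (an _IOR('f', 127, int) command),
+	 * IOCPARM_LEN(com) yields sizeof(int) from the length field
+	 * in bits 16-28, and com & IOC_OUT selects the copyout below.
+	 */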
+ size = IOCPARM_LEN(com);
+ if (size > IOCPARM_MAX)
+ return (ENOTTY);
+ memp = NULL;
+ if (size > sizeof (stkbuf)) {
+ memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ data = memp;
+ } else
+ data = stkbuf;
+ if (com&IOC_IN) {
+ if (size) {
+ error = copyin(uap->data, data, (u_int)size);
+ if (error) {
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ return (error);
+ }
+ } else
+ *(caddr_t *)data = uap->data;
+ } else if ((com&IOC_OUT) && size)
+ /*
+ * Zero the buffer so the user always
+ * gets back something deterministic.
+ */
+ bzero(data, size);
+ else if (com&IOC_VOID)
+ *(caddr_t *)data = uap->data;
+
+ switch (com) {
+
+ case FIONBIO:
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FNONBLOCK;
+ else
+ fp->f_flag &= ~FNONBLOCK;
+ error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
+ break;
+
+ case FIOASYNC:
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FASYNC;
+ else
+ fp->f_flag &= ~FASYNC;
+ error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
+ break;
+
+ default:
+ error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
+ /*
+ * Copy any data to user, size was
+ * already set and checked above.
+ */
+ if (error == 0 && (com&IOC_OUT) && size)
+ error = copyout(data, uap->data, (u_int)size);
+ break;
+ }
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ return (error);
+}
+
+static int nselcoll;
+int selwait;
+
+/*
+ * Select system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
+int
+select(p, uap)
+ register struct proc *p;
+ register struct select_args *uap;
+{
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
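+	/*
+	 * The arithmetic: each non-null descriptor set needs an input
+	 * and an output copy, so one set at FD_SETSIZE == 1024 costs
+	 * 2 * 1024 = 2048 bits, while all three sets at the old
+	 * FD_SETSIZE of 256 cost 3 * 2 * 256 = 1536 bits.
+	 */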
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
+ struct timeval atv, rtv, ttv;
+ int s, ncoll, error, timo;
+ u_int nbufbytes, ncpbytes, nfdbits;
+
+ if (uap->nd < 0)
+ return (EINVAL);
+ if (uap->nd > p->p_fd->fd_nfiles)
+ uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(uap->nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ nbufbytes = 0;
+ if (uap->in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
+
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
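+	/*
+	 * Resulting layout (sketch, all three sets non-null):
+	 *
+	 *	selbits: | out0 | out1 | out2 | in0 | in1 | in2 |
+	 *
+	 * obits[] points into the first half and ibits[] into the
+	 * second, so a single bzero() of the first half clears every
+	 * output set at once.
+	 */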
+ sbp = selbits;
+#define getbits(name, x) \
+ do { \
+ if (uap->name == NULL) \
+ ibits[x] = NULL; \
+ else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(uap->name, ibits[x], ncpbytes); \
+ if (error != 0) \
+ goto done; \
+ } \
+ } while (0)
+ getbits(in, 0);
+ getbits(ou, 1);
+ getbits(ex, 2);
+#undef getbits
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
+
+ if (uap->tv) {
+ error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
+ sizeof (atv));
+ if (error)
+ goto done;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else
+ atv.tv_sec = 0;
+ timo = 0;
+retry:
+ ncoll = nselcoll;
+ p->p_flag |= P_SELECT;
+ error = selscan(p, ibits, obits, uap->nd);
+ if (error || p->p_retval[0])
+ goto done;
+ if (atv.tv_sec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+ s = splhigh();
+ if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
+ splx(s);
+ goto retry;
+ }
+ p->p_flag &= ~P_SELECT;
+ error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
+ splx(s);
+ if (error == 0)
+ goto retry;
+done:
+ p->p_flag &= ~P_SELECT;
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+#define putbits(name, x) \
+	do { \
+		if (uap->name && \
+		    (error2 = copyout(obits[x], uap->name, ncpbytes))) \
+			error = error2; \
+	} while (0)
+ if (error == 0) {
+ int error2;
+
+ putbits(in, 0);
+ putbits(ou, 1);
+ putbits(ex, 2);
+#undef putbits
+ }
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
+ return (error);
+}
+
+static int
+selscan(p, ibits, obits, nfd)
+ struct proc *p;
+ fd_mask **ibits, **obits;
+ int nfd;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register int msk, i, j, fd;
+ register fd_mask bits;
+ struct file *fp;
+ int n = 0;
+ /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
+ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
+
+ for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
+ for (i = 0; i < nfd; i += NFDBITS) {
+ bits = ibits[msk][i/NFDBITS];
+ while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
+ bits &= ~(1 << j);
+ fp = fdp->fd_ofiles[fd];
+ if (fp == NULL)
+ return (EBADF);
+ if ((*fp->f_ops->fo_poll)(fp, flag[msk],
+ fp->f_cred, p)) {
+ obits[msk][(fd)/NFDBITS] |=
+ (1 << ((fd) % NFDBITS));
+ n++;
+ }
+ }
+ }
+ }
+ p->p_retval[0] = n;
+ return (0);
+}
+
+/*
+ * Poll system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+poll(p, uap)
+ register struct proc *p;
+ register struct poll_args *uap;
+{
+ caddr_t bits;
+ char smallbits[32 * sizeof(struct pollfd)];
+ struct timeval atv, rtv, ttv;
+ int s, ncoll, error = 0, timo;
+ size_t ni;
+
+ if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
+ /* forgiving; slightly wrong */
+ SCARG(uap, nfds) = p->p_fd->fd_nfiles;
+ }
+ ni = SCARG(uap, nfds) * sizeof(struct pollfd);
+ if (ni > sizeof(smallbits))
+ bits = malloc(ni, M_TEMP, M_WAITOK);
+ else
+ bits = smallbits;
+ error = copyin(SCARG(uap, fds), bits, ni);
+ if (error)
+ goto done;
+ if (SCARG(uap, timeout) != INFTIM) {
+ atv.tv_sec = SCARG(uap, timeout) / 1000;
+ atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else
+ atv.tv_sec = 0;
+ timo = 0;
+retry:
+ ncoll = nselcoll;
+ p->p_flag |= P_SELECT;
+ error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
+ if (error || p->p_retval[0])
+ goto done;
+ if (atv.tv_sec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+ s = splhigh();
+ if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
+ splx(s);
+ goto retry;
+ }
+ p->p_flag &= ~P_SELECT;
+ error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
+ splx(s);
+ if (error == 0)
+ goto retry;
+done:
+ p->p_flag &= ~P_SELECT;
+ /* poll is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ if (error == 0) {
+ error = copyout(bits, SCARG(uap, fds), ni);
+ if (error)
+ goto out;
+ }
+out:
+ if (ni > sizeof(smallbits))
+ free(bits, M_TEMP);
+ return (error);
+}
+
+static int
+pollscan(p, fds, nfd)
+ struct proc *p;
+ struct pollfd *fds;
+ int nfd;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int i;
+ struct file *fp;
+ int n = 0;
+
+ for (i = 0; i < nfd; i++, fds++) {
+ if (fds->fd >= fdp->fd_nfiles) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else if (fds->fd < 0) {
+ fds->revents = 0;
+ } else {
+ fp = fdp->fd_ofiles[fds->fd];
+ if (fp == 0) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else {
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ fds->revents = (*fp->f_ops->fo_poll)(fp,
+ fds->events, fp->f_cred, p);
+ if (fds->revents != 0)
+ n++;
+ }
+ }
+ }
+ p->p_retval[0] = n;
+ return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ * XXX this isn't quite a true representation; OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+openbsd_poll(p, uap)
+ register struct proc *p;
+ register struct openbsd_poll_args *uap;
+{
+ return (poll(p, (struct poll_args *)uap));
+}
+
+/*ARGSUSED*/
+int
+seltrue(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+
+ return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Record a select request.
+ */
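+/*
+ * Only one selecting pid fits in a selinfo; if a second process
+ * selects on the same object, SI_COLL is set and selwakeup() falls
+ * back to waking everyone sleeping on selwait (a collision).
+ */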
+void
+selrecord(selector, sip)
+ struct proc *selector;
+ struct selinfo *sip;
+{
+ struct proc *p;
+ pid_t mypid;
+
+ mypid = selector->p_pid;
+ if (sip->si_pid == mypid)
+ return;
+ if (sip->si_pid && (p = pfind(sip->si_pid)) &&
+ p->p_wchan == (caddr_t)&selwait)
+ sip->si_flags |= SI_COLL;
+ else
+ sip->si_pid = mypid;
+}
+
+/*
+ * Do a wakeup when a selectable event occurs.
+ */
+void
+selwakeup(sip)
+ register struct selinfo *sip;
+{
+ register struct proc *p;
+ int s;
+
+ if (sip->si_pid == 0)
+ return;
+ if (sip->si_flags & SI_COLL) {
+ nselcoll++;
+ sip->si_flags &= ~SI_COLL;
+ wakeup((caddr_t)&selwait);
+ }
+ p = pfind(sip->si_pid);
+ sip->si_pid = 0;
+ if (p != NULL) {
+ s = splhigh();
+ if (p->p_wchan == (caddr_t)&selwait) {
+ if (p->p_stat == SSLEEP)
+ setrunnable(p);
+ else
+ unsleep(p);
+ } else if (p->p_flag & P_SELECT)
+ p->p_flag &= ~P_SELECT;
+ splx(s);
+ }
+}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
new file mode 100644
index 0000000..29e1e97
--- /dev/null
+++ b/sys/kern/sys_pipe.c
@@ -0,0 +1,1102 @@
+/*
+ * Copyright (c) 1996 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ *
+ * $Id: sys_pipe.c,v 1.45 1998/11/11 10:03:55 truckman Exp $
+ */
+
+/*
+ * This file contains a high-performance replacement for the socket-based
+ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
+ * all features of sockets, but does do everything that pipes normally
+ * do.
+ */
+
+/*
+ * This code has two modes of operation, a small write mode and a large
+ * write mode. The small write mode acts like conventional pipes with
+ * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
+ * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
+ * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
+ * the receiving process can copy it directly from the pages in the sending
+ * process.
+ *
+ * If the sending process receives a signal, it is possible that it will
+ * go away, and certainly its address space can change, because control
+ * is returned back to the user-mode side. In that case, the pipe code
+ * arranges to copy the buffer supplied by the user process, to a pageable
+ * kernel buffer, and the receiving process will grab the data from the
+ * pageable kernel buffer. Since signals don't happen all that often,
+ * the copy operation is normally eliminated.
+ *
+ * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
+ * happen for small transfers so that the system will not spend all of
+ * its time context switching. PIPE_SIZE is constrained by the
+ * amount of kernel virtual memory.
+ */
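+/*
+ * Summary of the write paths (sketch; constants from <sys/pipe.h>):
+ *
+ *	len < PIPE_MINDIRECT	buffered through the kernel ring buffer
+ *	len >= PIPE_MINDIRECT	writer's pages wired and mapped so the
+ *				reader copies straight from them
+ *
+ * Non-blocking (FNONBLOCK) writes always take the buffered path.
+ */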
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/stat.h>
+#include <sys/poll.h>
+#include <sys/signalvar.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_zone.h>
+
+/*
+ * Use this define if you want to disable *fancy* VM things. Expect an
+ * approx 30% decrease in transfer rate. This could be useful for
+ * NetBSD or OpenBSD.
+ */
+/* #define PIPE_NODIRECT */
+
+/*
+ * interfaces to the outside world
+ */
+static int pipe_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int pipe_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int pipe_close __P((struct file *fp, struct proc *p));
+static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
+ struct proc *p));
+static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
+
+static struct fileops pipeops =
+ { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_close };
+
+/*
+ * Default pipe buffer size(s); this can be kind-of large now because pipe
+ * space is pageable. The pipe code will try to maintain locality of
+ * reference for performance reasons, so small amounts of outstanding I/O
+ * will not wipe the cache.
+ */
+#define MINPIPESIZE (PIPE_SIZE/3)
+#define MAXPIPESIZE (2*PIPE_SIZE/3)
+
+/*
+ * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
+ * it is there so that on large systems we don't exhaust it.
+ */
+#define MAXPIPEKVA (8*1024*1024)
+
+/*
+ * Limit for direct transfers, we cannot, of course limit
+ * the amount of kva for pipes in general though.
+ */
+#define LIMITPIPEKVA (16*1024*1024)
+
+/*
+ * Limit the number of "big" pipes
+ */
+#define LIMITBIGPIPES 32
+static int nbigpipe;
+
+static int amountpipekva;
+
+static void pipeclose __P((struct pipe *cpipe));
+static void pipeinit __P((struct pipe *cpipe));
+static __inline int pipelock __P((struct pipe *cpipe, int catch));
+static __inline void pipeunlock __P((struct pipe *cpipe));
+static __inline void pipeselwakeup __P((struct pipe *cpipe));
+#ifndef PIPE_NODIRECT
+static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
+static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
+static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
+static void pipe_clone_write_buffer __P((struct pipe *wpipe));
+#endif
+static void pipespace __P((struct pipe *cpipe));
+
+static vm_zone_t pipe_zone;
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes
+ */
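+/*
+ * Both descriptors are created FREAD|FWRITE, so these pipes are
+ * full-duplex; by convention p_retval[0] serves as the read end and
+ * p_retval[1] as the write end.
+ */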
+
+/* ARGSUSED */
+int
+pipe(p, uap)
+ struct proc *p;
+ struct pipe_args /* {
+ int dummy;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct file *rf, *wf;
+ struct pipe *rpipe, *wpipe;
+ int fd, error;
+
+ if (pipe_zone == NULL)
+ pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);
+
+ rpipe = zalloc( pipe_zone);
+ pipeinit(rpipe);
+ rpipe->pipe_state |= PIPE_DIRECTOK;
+ wpipe = zalloc( pipe_zone);
+ pipeinit(wpipe);
+ wpipe->pipe_state |= PIPE_DIRECTOK;
+
+ error = falloc(p, &rf, &fd);
+ if (error)
+ goto free2;
+ p->p_retval[0] = fd;
+ rf->f_flag = FREAD | FWRITE;
+ rf->f_type = DTYPE_PIPE;
+ rf->f_ops = &pipeops;
+ rf->f_data = (caddr_t)rpipe;
+ error = falloc(p, &wf, &fd);
+ if (error)
+ goto free3;
+ wf->f_flag = FREAD | FWRITE;
+ wf->f_type = DTYPE_PIPE;
+ wf->f_ops = &pipeops;
+ wf->f_data = (caddr_t)wpipe;
+ p->p_retval[1] = fd;
+
+ rpipe->pipe_peer = wpipe;
+ wpipe->pipe_peer = rpipe;
+
+ return (0);
+free3:
+ ffree(rf);
+ fdp->fd_ofiles[p->p_retval[0]] = 0;
+free2:
+ (void)pipeclose(wpipe);
+ (void)pipeclose(rpipe);
+ return (error);
+}
+
+/*
+ * Allocate kva for the pipe circular buffer; the space is pageable.
+ */
+static void
+pipespace(cpipe)
+ struct pipe *cpipe;
+{
+ int npages, error;
+
+ npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
+ /*
+	 * Create an object; I don't like the idea of paging to/from
+ * kernel_object.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
+ cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);
+
+ /*
+ * Insert the object into the kernel map, and allocate kva for it.
+ * The map entry is, by default, pageable.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
+ (vm_offset_t *) &cpipe->pipe_buffer.buffer,
+ cpipe->pipe_buffer.size, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+
+ if (error != KERN_SUCCESS)
+ panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
+ amountpipekva += cpipe->pipe_buffer.size;
+}
+
+/*
+ * initialize and allocate VM and memory for pipe
+ */
+static void
+pipeinit(cpipe)
+ struct pipe *cpipe;
+{
+
+ cpipe->pipe_buffer.in = 0;
+ cpipe->pipe_buffer.out = 0;
+ cpipe->pipe_buffer.cnt = 0;
+ cpipe->pipe_buffer.size = PIPE_SIZE;
+
+ /* Buffer kva gets dynamically allocated */
+ cpipe->pipe_buffer.buffer = NULL;
+ /* cpipe->pipe_buffer.object = invalid */
+
+ cpipe->pipe_state = 0;
+ cpipe->pipe_peer = NULL;
+ cpipe->pipe_busy = 0;
+ getnanotime(&cpipe->pipe_ctime);
+ cpipe->pipe_atime = cpipe->pipe_ctime;
+ cpipe->pipe_mtime = cpipe->pipe_ctime;
+ bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
+
+#ifndef PIPE_NODIRECT
+ /*
+ * pipe data structure initializations to support direct pipe I/O
+ */
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.kva = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ /* cpipe->pipe_map.ms[] = invalid */
+#endif
+}
+
+
+/*
+ * lock a pipe for I/O, blocking other access
+ */
+static __inline int
+pipelock(cpipe, catch)
+ struct pipe *cpipe;
+ int catch;
+{
+ int error;
+ while (cpipe->pipe_state & PIPE_LOCK) {
+ cpipe->pipe_state |= PIPE_LWANT;
+		if ((error = tsleep(cpipe,
+			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) != 0) {
+			return error;
+		}
+ }
+ cpipe->pipe_state |= PIPE_LOCK;
+ return 0;
+}
+
+/*
+ * unlock a pipe I/O lock
+ */
+static __inline void
+pipeunlock(cpipe)
+ struct pipe *cpipe;
+{
+ cpipe->pipe_state &= ~PIPE_LOCK;
+ if (cpipe->pipe_state & PIPE_LWANT) {
+ cpipe->pipe_state &= ~PIPE_LWANT;
+ wakeup(cpipe);
+ }
+}
+
+static __inline void
+pipeselwakeup(cpipe)
+ struct pipe *cpipe;
+{
+ if (cpipe->pipe_state & PIPE_SEL) {
+ cpipe->pipe_state &= ~PIPE_SEL;
+ selwakeup(&cpipe->pipe_sel);
+ }
+ if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
+ pgsigio(cpipe->pipe_sigio, SIGIO, 0);
+}
+
+/* ARGSUSED */
+static int
+pipe_read(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+
+ struct pipe *rpipe = (struct pipe *) fp->f_data;
+ int error = 0;
+ int nread = 0;
+ u_int size;
+
+ ++rpipe->pipe_busy;
+ while (uio->uio_resid) {
+ /*
+ * normal pipe buffer receive
+ */
+ if (rpipe->pipe_buffer.cnt > 0) {
+ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
+ if (size > rpipe->pipe_buffer.cnt)
+ size = rpipe->pipe_buffer.cnt;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+ if ((error = pipelock(rpipe,1)) == 0) {
+ error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
+ size, uio);
+ pipeunlock(rpipe);
+ }
+ if (error) {
+ break;
+ }
+ rpipe->pipe_buffer.out += size;
+ if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
+ rpipe->pipe_buffer.out = 0;
+
+ rpipe->pipe_buffer.cnt -= size;
+ nread += size;
+#ifndef PIPE_NODIRECT
+ /*
+ * Direct copy, bypassing a kernel buffer.
+ */
+ } else if ((size = rpipe->pipe_map.cnt) &&
+ (rpipe->pipe_state & PIPE_DIRECTW)) {
+ caddr_t va;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+ if ((error = pipelock(rpipe,1)) == 0) {
+ va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
+ error = uiomove(va, size, uio);
+ pipeunlock(rpipe);
+ }
+ if (error)
+ break;
+ nread += size;
+ rpipe->pipe_map.pos += size;
+ rpipe->pipe_map.cnt -= size;
+ if (rpipe->pipe_map.cnt == 0) {
+ rpipe->pipe_state &= ~PIPE_DIRECTW;
+ wakeup(rpipe);
+ }
+#endif
+ } else {
+ /*
+ * detect EOF condition
+ */
+ if (rpipe->pipe_state & PIPE_EOF) {
+ /* XXX error = ? */
+ break;
+ }
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ if (nread > 0)
+ break;
+
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ break;
+ }
+
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+
+ if ((error = pipelock(rpipe,1)) == 0) {
+ if (rpipe->pipe_buffer.cnt == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ }
+ pipeunlock(rpipe);
+ } else {
+ break;
+ }
+
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+
+ rpipe->pipe_state |= PIPE_WANTR;
+			if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) != 0) {
+ break;
+ }
+ }
+ }
+
+ if (error == 0)
+ getnanotime(&rpipe->pipe_atime);
+
+ --rpipe->pipe_busy;
+ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
+ rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
+ wakeup(rpipe);
+ } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+ if (rpipe->pipe_buffer.cnt == 0) {
+ if ((error == 0) && (error = pipelock(rpipe,1)) == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ pipeunlock(rpipe);
+ }
+ }
+
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ }
+
+ if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
+ pipeselwakeup(rpipe);
+
+ return error;
+}
+
+#ifndef PIPE_NODIRECT
+/*
+ * Map the sending process's buffer into kernel space and wire it.
+ * This is similar to a physical write operation.
+ */
+static int
+pipe_build_write_buffer(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ u_int size;
+ int i;
+ vm_offset_t addr, endaddr, paddr;
+
+ size = (u_int) uio->uio_iov->iov_len;
+ if (size > wpipe->pipe_buffer.size)
+ size = wpipe->pipe_buffer.size;
+
+ endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
+	for (i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
+	    addr < endaddr;
+	    addr += PAGE_SIZE, i++) {
+
+ vm_page_t m;
+
+ vm_fault_quick( (caddr_t) addr, VM_PROT_READ);
+ paddr = pmap_kextract(addr);
+		if (!paddr) {
+			int j;
+
+			for (j = 0; j < i; j++)
+				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
+			return EFAULT;
+		}
+
+ m = PHYS_TO_VM_PAGE(paddr);
+ vm_page_wire(m);
+ wpipe->pipe_map.ms[i] = m;
+ }
+
+/*
+ * set up the control block
+ */
+ wpipe->pipe_map.npages = i;
+ wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
+ wpipe->pipe_map.cnt = size;
+
+/*
+ * and map the buffer
+ */
+ if (wpipe->pipe_map.kva == 0) {
+ /*
+ * We need to allocate space for an extra page because the
+ * address range might (will) span pages at times.
+ */
+ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
+ wpipe->pipe_map.npages);
+
+/*
+ * and update the uio data
+ */
+
+ uio->uio_iov->iov_len -= size;
+ uio->uio_iov->iov_base += size;
+ if (uio->uio_iov->iov_len == 0)
+ uio->uio_iov++;
+ uio->uio_resid -= size;
+ uio->uio_offset += size;
+ return 0;
+}
+
+/*
+ * unmap and unwire the process buffer
+ */
+static void
+pipe_destroy_write_buffer(wpipe)
+	struct pipe *wpipe;
+{
+ int i;
+ if (wpipe->pipe_map.kva) {
+ pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
+
+ if (amountpipekva > MAXPIPEKVA) {
+ vm_offset_t kva = wpipe->pipe_map.kva;
+ wpipe->pipe_map.kva = 0;
+ kmem_free(kernel_map, kva,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ }
+	for (i = 0; i < wpipe->pipe_map.npages; i++)
+ vm_page_unwire(wpipe->pipe_map.ms[i], 1);
+}
+
+/*
+ * In the case of a signal, the writing process might go away. This
+ * code copies the data into the circular buffer so that the source
+ * pages can be freed without loss of data.
+ */
+static void
+pipe_clone_write_buffer(wpipe)
+	struct pipe *wpipe;
+{
+ int size;
+ int pos;
+
+ size = wpipe->pipe_map.cnt;
+ pos = wpipe->pipe_map.pos;
+ bcopy((caddr_t) wpipe->pipe_map.kva+pos,
+ (caddr_t) wpipe->pipe_buffer.buffer,
+ size);
+
+ wpipe->pipe_buffer.in = size;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = size;
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+
+ pipe_destroy_write_buffer(wpipe);
+}
+
+/*
+ * This implements the pipe buffer write mechanism. Note that only
+ * a direct write OR a normal pipe write can be pending at any given time.
+ * If there are any characters in the pipe buffer, the direct write will
+ * be deferred until the receiving process grabs all of the bytes from
+ * the pipe buffer. Then the direct mapping write is set-up.
+ */
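+/*
+ * Handshake sketch (bits are pipe_state flags):
+ *
+ *	writer: wait until PIPE_DIRECTW clears and the ring buffer drains
+ *	writer: set PIPE_DIRECTW, wire the pages, sleep in "pipdwt"
+ *	reader: copy from pipe_map, clear PIPE_DIRECTW, wakeup the writer
+ */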
+static int
+pipe_direct_write(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ int error;
+retry:
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if ( wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipdww", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ }
+ wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
+ if (wpipe->pipe_buffer.cnt > 0) {
+ if ( wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipdwc", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ goto retry;
+ }
+
+ wpipe->pipe_state |= PIPE_DIRECTW;
+
+ error = pipe_build_write_buffer(wpipe, uio);
+ if (error) {
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+ goto error1;
+ }
+
+ error = 0;
+ while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipelock(wpipe, 0);
+ pipe_destroy_write_buffer(wpipe);
+ pipeunlock(wpipe);
+ pipeselwakeup(wpipe);
+ error = EPIPE;
+ goto error1;
+ }
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
+ }
+
+ pipelock(wpipe,0);
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ /*
+ * this bit of trickery substitutes a kernel buffer for
+ * the process that might be going away.
+ */
+ pipe_clone_write_buffer(wpipe);
+ } else {
+ pipe_destroy_write_buffer(wpipe);
+ }
+ pipeunlock(wpipe);
+ return error;
+
+error1:
+ wakeup(wpipe);
+ return error;
+}
+#endif
+
+static int
+pipe_write(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ int error = 0;
+ int orig_resid;
+
+ struct pipe *wpipe, *rpipe;
+
+ rpipe = (struct pipe *) fp->f_data;
+ wpipe = rpipe->pipe_peer;
+
+	/*
+	 * Detect loss of the pipe read side; return EPIPE and let
+	 * the caller turn it into SIGPIPE.
+	 */
+ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ return EPIPE;
+ }
+
+ /*
+ * If it is advantageous to resize the pipe buffer, do
+ * so.
+ */
+ if ((uio->uio_resid > PIPE_SIZE) &&
+ (nbigpipe < LIMITBIGPIPES) &&
+ (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
+ (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
+ (wpipe->pipe_buffer.cnt == 0)) {
+
+ if (wpipe->pipe_buffer.buffer) {
+ amountpipekva -= wpipe->pipe_buffer.size;
+ kmem_free(kernel_map,
+ (vm_offset_t)wpipe->pipe_buffer.buffer,
+ wpipe->pipe_buffer.size);
+ }
+
+#ifndef PIPE_NODIRECT
+ if (wpipe->pipe_map.kva) {
+ amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
+ kmem_free(kernel_map,
+ wpipe->pipe_map.kva,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ }
+#endif
+
+ wpipe->pipe_buffer.in = 0;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = 0;
+ wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
+ wpipe->pipe_buffer.buffer = NULL;
+ ++nbigpipe;
+
+#ifndef PIPE_NODIRECT
+ wpipe->pipe_map.cnt = 0;
+ wpipe->pipe_map.kva = 0;
+ wpipe->pipe_map.pos = 0;
+ wpipe->pipe_map.npages = 0;
+#endif
+
+ }
+
+
+ if( wpipe->pipe_buffer.buffer == NULL) {
+ if ((error = pipelock(wpipe,1)) == 0) {
+ pipespace(wpipe);
+ pipeunlock(wpipe);
+ } else {
+ return error;
+ }
+ }
+
+ ++wpipe->pipe_busy;
+ orig_resid = uio->uio_resid;
+ while (uio->uio_resid) {
+ int space;
+#ifndef PIPE_NODIRECT
+ /*
+ * If the transfer is large, we can gain performance if
+ * we do process-to-process copies directly.
+ * If the write is non-blocking, we don't use the
+ * direct write mechanism.
+ */
+		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
+		    (fp->f_flag & FNONBLOCK) == 0 &&
+		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
+ error = pipe_direct_write( wpipe, uio);
+ if (error) {
+ break;
+ }
+ continue;
+ }
+#endif
+
+ /*
+		 * Pipe buffered writes cannot coincide with
+ * direct writes. We wait until the currently executing
+ * direct write is completed before we start filling the
+ * pipe buffer.
+ */
+ retrywrite:
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipbww", 0);
+ if (error)
+ break;
+ }
+
+ space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+
+ /* Writes of size <= PIPE_BUF must be atomic. */
+ /* XXX perhaps they need to be contiguous to be atomic? */
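+		/*
+		 * Sketch: rather than split a small write across two
+		 * wakeups, pretend the buffer is full (space = 0) and
+		 * sleep below until the whole request fits at once.
+		 */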
+ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
+ space = 0;
+
+ if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
+ /*
+			 * This sets the maximum transfer to one segment of
+ * the buffer.
+ */
+ int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in;
+ /*
+ * space is the size left in the buffer
+ */
+ if (size > space)
+ size = space;
+ /*
+ * now limit it to the size of the uio transfer
+ */
+ if (size > uio->uio_resid)
+ size = uio->uio_resid;
+ if ((error = pipelock(wpipe,1)) == 0) {
+ /*
+ * It is possible for a direct write to
+ * slip in on us... handle it here...
+ */
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ pipeunlock(wpipe);
+ goto retrywrite;
+ }
+ error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
+ size, uio);
+ pipeunlock(wpipe);
+ }
+ if (error)
+ break;
+
+ wpipe->pipe_buffer.in += size;
+ if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size)
+ wpipe->pipe_buffer.in = 0;
+
+ wpipe->pipe_buffer.cnt += size;
+ } else {
+ /*
+ * If the "read-side" has been blocked, wake it up now.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ /*
+ * don't block on non-blocking I/O
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ break;
+ }
+
+ /*
+ * We have no more space and have something to offer,
+ * wake up select/poll.
+ */
+ pipeselwakeup(wpipe);
+
+ wpipe->pipe_state |= PIPE_WANTW;
+			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
+ break;
+ }
+ /*
+ * If read side wants to go away, we just issue a signal
+ * to ourselves.
+ */
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ break;
+ }
+ }
+ }
+
+ --wpipe->pipe_busy;
+ if ((wpipe->pipe_busy == 0) &&
+ (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
+ wakeup(wpipe);
+ } else if (wpipe->pipe_buffer.cnt > 0) {
+ /*
+ * If we have put any characters in the buffer, we wake up
+ * the reader.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ }
+
+ /*
+ * Don't return EPIPE if I/O was successful
+ */
+ if ((wpipe->pipe_buffer.cnt == 0) &&
+ (uio->uio_resid == 0) &&
+ (error == EPIPE))
+ error = 0;
+
+ if (error == 0)
+ getnanotime(&wpipe->pipe_mtime);
+
+ /*
+ * We have something to offer,
+ * wake up select/poll.
+ */
+ if (wpipe->pipe_buffer.cnt)
+ pipeselwakeup(wpipe);
+
+ return error;
+}
+
+/*
+ * we implement a very minimal set of ioctls for compatibility with sockets.
+ */
+int
+pipe_ioctl(fp, cmd, data, p)
+ struct file *fp;
+ u_long cmd;
+ register caddr_t data;
+ struct proc *p;
+{
+ register struct pipe *mpipe = (struct pipe *)fp->f_data;
+
+ switch (cmd) {
+
+ case FIONBIO:
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ mpipe->pipe_state |= PIPE_ASYNC;
+ } else {
+ mpipe->pipe_state &= ~PIPE_ASYNC;
+ }
+ return (0);
+
+ case FIONREAD:
+ if (mpipe->pipe_state & PIPE_DIRECTW)
+ *(int *)data = mpipe->pipe_map.cnt;
+ else
+ *(int *)data = mpipe->pipe_buffer.cnt;
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &mpipe->pipe_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(mpipe->pipe_sigio);
+ return (0);
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead. */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(mpipe->pipe_sigio);
+ return (0);
+
+ }
+ return (ENOTTY);
+}
+
+int
+pipe_poll(fp, events, cred, p)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct proc *p;
+{
+ register struct pipe *rpipe = (struct pipe *)fp->f_data;
+ struct pipe *wpipe;
+ int revents = 0;
+
+ wpipe = rpipe->pipe_peer;
+ if (events & (POLLIN | POLLRDNORM))
+ if ((rpipe->pipe_state & PIPE_DIRECTW) ||
+ (rpipe->pipe_buffer.cnt > 0) ||
+ (rpipe->pipe_state & PIPE_EOF))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
+		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
+		    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ (wpipe == NULL) ||
+ (wpipe->pipe_state & PIPE_EOF))
+ revents |= POLLHUP;
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLRDNORM)) {
+ selrecord(p, &rpipe->pipe_sel);
+ rpipe->pipe_state |= PIPE_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(p, &wpipe->pipe_sel);
+ wpipe->pipe_state |= PIPE_SEL;
+ }
+ }
+
+ return (revents);
+}
+
+int
+pipe_stat(pipe, ub)
+ register struct pipe *pipe;
+ register struct stat *ub;
+{
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFIFO;
+ ub->st_blksize = pipe->pipe_buffer.size;
+ ub->st_size = pipe->pipe_buffer.cnt;
+ ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
+ ub->st_atimespec = pipe->pipe_atime;
+ ub->st_mtimespec = pipe->pipe_mtime;
+ ub->st_ctimespec = pipe->pipe_ctime;
+ /*
+ * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
+ * st_flags, st_gen.
+ * XXX (st_dev, st_ino) should be unique.
+ */
+ return 0;
+}
+
+/* ARGSUSED */
+static int
+pipe_close(fp, p)
+ struct file *fp;
+ struct proc *p;
+{
+ struct pipe *cpipe = (struct pipe *)fp->f_data;
+
+ funsetown(cpipe->pipe_sigio);
+ pipeclose(cpipe);
+ fp->f_data = NULL;
+ return 0;
+}
+
+/*
+ * shutdown the pipe
+ */
+static void
+pipeclose(cpipe)
+ struct pipe *cpipe;
+{
+ struct pipe *ppipe;
+ if (cpipe) {
+
+ pipeselwakeup(cpipe);
+
+ /*
+ * If the other side is blocked, wake it up saying that
+ * we want to close it down.
+ */
+ while (cpipe->pipe_busy) {
+ wakeup(cpipe);
+ cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
+ tsleep(cpipe, PRIBIO, "pipecl", 0);
+ }
+
+ /*
+ * Disconnect from peer
+ */
+		if ((ppipe = cpipe->pipe_peer) != NULL) {
+ pipeselwakeup(ppipe);
+
+ ppipe->pipe_state |= PIPE_EOF;
+ wakeup(ppipe);
+ ppipe->pipe_peer = NULL;
+ }
+
+ /*
+ * free resources
+ */
+ if (cpipe->pipe_buffer.buffer) {
+ if (cpipe->pipe_buffer.size > PIPE_SIZE)
+ --nbigpipe;
+ amountpipekva -= cpipe->pipe_buffer.size;
+ kmem_free(kernel_map,
+ (vm_offset_t)cpipe->pipe_buffer.buffer,
+ cpipe->pipe_buffer.size);
+ }
+#ifndef PIPE_NODIRECT
+ if (cpipe->pipe_map.kva) {
+ amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
+ kmem_free(kernel_map,
+ cpipe->pipe_map.kva,
+ cpipe->pipe_buffer.size + PAGE_SIZE);
+ }
+#endif
+ zfree(pipe_zone, cpipe);
+ }
+}
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
new file mode 100644
index 0000000..4756127
--- /dev/null
+++ b/sys/kern/sys_process.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 1994, Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Sean Eric Fagan.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: sys_process.c,v 1.40 1998/07/29 18:41:30 dfr Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/ptrace.h>
+
+#include <machine/reg.h>
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include <sys/user.h>
+#include <miscfs/procfs/procfs.h>
+
+/* use the equivalent procfs code */
+#if 0
+static int
+pread (struct proc *procp, unsigned int addr, unsigned int *retval) {
+ int rv;
+ vm_map_t map, tmap;
+ vm_object_t object;
+ vm_offset_t kva = 0;
+ int page_offset; /* offset into page */
+ vm_offset_t pageno; /* page number */
+ vm_map_entry_t out_entry;
+ vm_prot_t out_prot;
+ boolean_t wired;
+ vm_pindex_t pindex;
+
+ /* Map page into kernel space */
+
+ map = &procp->p_vmspace->vm_map;
+
+ page_offset = addr - trunc_page(addr);
+ pageno = trunc_page(addr);
+
+ tmap = map;
+ rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry,
+ &object, &pindex, &out_prot, &wired);
+
+ if (rv != KERN_SUCCESS)
+ return EINVAL;
+
+ vm_map_lookup_done (tmap, out_entry);
+
+ /* Find space in kernel_map for the page we're interested in */
+ rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex),
+ &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
+
+ if (!rv) {
+ vm_object_reference (object);
+
+ rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0);
+ if (!rv) {
+ *retval = 0;
+ bcopy ((caddr_t)kva + page_offset,
+ retval, sizeof *retval);
+ }
+ vm_map_remove (kernel_map, kva, kva + PAGE_SIZE);
+ }
+
+ return rv;
+}
+
+static int
+pwrite (struct proc *procp, unsigned int addr, unsigned int datum) {
+ int rv;
+ vm_map_t map, tmap;
+ vm_object_t object;
+ vm_offset_t kva = 0;
+ int page_offset; /* offset into page */
+ vm_offset_t pageno; /* page number */
+ vm_map_entry_t out_entry;
+ vm_prot_t out_prot;
+ boolean_t wired;
+ vm_pindex_t pindex;
+ boolean_t fix_prot = 0;
+
+ /* Map page into kernel space */
+
+ map = &procp->p_vmspace->vm_map;
+
+ page_offset = addr - trunc_page(addr);
+ pageno = trunc_page(addr);
+
+ /*
+ * Check the permissions for the area we're interested in.
+ */
+
+ if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_WRITE) == FALSE) {
+ /*
+ * If the page was not writable, we make it so.
+ * XXX It is possible a page may *not* be read/executable,
+ * if a process changes that!
+ */
+ fix_prot = 1;
+ /* The page isn't writable, so let's try making it so... */
+ if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_ALL, 0)) != KERN_SUCCESS)
+ return EFAULT; /* I guess... */
+ }
+
+ /*
+ * Now we need to get the page. out_entry, out_prot, wired, and
+ * single_use aren't used. One would think the vm code would be
+ * a *bit* nicer... We use tmap because vm_map_lookup() can
+ * change the map argument.
+ */
+
+ tmap = map;
+ rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry,
+ &object, &pindex, &out_prot, &wired);
+ if (rv != KERN_SUCCESS) {
+ return EINVAL;
+ }
+
+ /*
+ * Okay, we've got the page. Let's release tmap.
+ */
+
+ vm_map_lookup_done (tmap, out_entry);
+
+ /*
+ * Fault the page in...
+ */
+
+ rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE);
+ if (rv != KERN_SUCCESS)
+ return EFAULT;
+
+ /* Find space in kernel_map for the page we're interested in */
+ rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex),
+ &kva, PAGE_SIZE, 0,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (!rv) {
+ vm_object_reference (object);
+
+ rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0);
+ if (!rv) {
+ bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum);
+ }
+ vm_map_remove (kernel_map, kva, kva + PAGE_SIZE);
+ }
+
+ if (fix_prot)
+ vm_map_protect (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_READ|VM_PROT_EXECUTE, 0);
+ return rv;
+}
+#endif
+
+/*
+ * Process debugging system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ptrace_args {
+ int req;
+ pid_t pid;
+ caddr_t addr;
+ int data;
+};
+#endif
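+
+/*
+ * Usage sketch (editorial, not part of the original source): how a
+ * userland debugger might drive this syscall.  attach_and_read() is a
+ * hypothetical helper; error handling and the <sys/ptrace.h> and
+ * <sys/wait.h> includes are omitted for brevity.
+ */
+#if 0
+static int
+attach_and_read(pid_t pid, caddr_t addr, int *word)
+{
+	if (ptrace(PT_ATTACH, pid, 0, 0) == -1)		/* target gets SIGSTOP */
+		return (-1);
+	waitpid(pid, NULL, 0);				/* wait until it stops */
+	*word = ptrace(PT_READ_D, pid, addr, 0);	/* fetch one int of data */
+	return (ptrace(PT_DETACH, pid, 0, 0));		/* resume the target */
+}
+#endif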
+
+int
+ptrace(curp, uap)
+ struct proc *curp;
+ struct ptrace_args *uap;
+{
+ struct proc *p;
+ struct iovec iov;
+ struct uio uio;
+ int error = 0;
+ int write;
+ int s;
+
+ if (uap->req == PT_TRACE_ME)
+ p = curp;
+ else {
+ if ((p = pfind(uap->pid)) == NULL)
+ return ESRCH;
+ }
+
+ /*
+ * Permissions check
+ */
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* Always legal. */
+ break;
+
+ case PT_ATTACH:
+ /* Self */
+ if (p->p_pid == curp->p_pid)
+ return EINVAL;
+
+ /* Already traced */
+ if (p->p_flag & P_TRACED)
+ return EBUSY;
+
+ /* not owned by you, has done setuid (unless you're root) */
+ if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) ||
+ (p->p_flag & P_SUGID)) {
+			if ((error = suser(curp->p_ucred, &curp->p_acflag)) != 0)
+ return error;
+ }
+
+ /* can't trace init when securelevel > 0 */
+ if (securelevel > 0 && p->p_pid == 1)
+ return EPERM;
+
+ /* OK */
+ break;
+
+ case PT_READ_I:
+ case PT_READ_D:
+ case PT_READ_U:
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ case PT_WRITE_U:
+ case PT_CONTINUE:
+ case PT_KILL:
+ case PT_STEP:
+ case PT_DETACH:
+#ifdef PT_GETREGS
+ case PT_GETREGS:
+#endif
+#ifdef PT_SETREGS
+ case PT_SETREGS:
+#endif
+#ifdef PT_GETFPREGS
+ case PT_GETFPREGS:
+#endif
+#ifdef PT_SETFPREGS
+ case PT_SETFPREGS:
+#endif
+ /* not being traced... */
+ if ((p->p_flag & P_TRACED) == 0)
+ return EPERM;
+
+ /* not being traced by YOU */
+ if (p->p_pptr != curp)
+ return EBUSY;
+
+ /* not currently stopped */
+ if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0)
+ return EBUSY;
+
+ /* OK */
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+#ifdef FIX_SSTEP
+ /*
+ * Single step fixup ala procfs
+ */
+ FIX_SSTEP(p);
+#endif
+
+ /*
+ * Actually do the requests
+ */
+
+ write = 0;
+ curp->p_retval[0] = 0;
+
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* set my trace flag and "owner" so it can read/write me */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ return 0;
+
+ case PT_ATTACH:
+ /* security check done above */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ if (p->p_pptr != curp)
+ proc_reparent(p, curp);
+ uap->data = SIGSTOP;
+ goto sendsig; /* in PT_CONTINUE below */
+
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_DETACH:
+ if ((unsigned)uap->data >= NSIG)
+ return EINVAL;
+
+ PHOLD(p);
+
+ if (uap->req == PT_STEP) {
+ if ((error = ptrace_single_step (p))) {
+ PRELE(p);
+ return error;
+ }
+ }
+
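+		/* An addr of (caddr_t)1 means "resume where execution stopped". */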
+ if (uap->addr != (caddr_t)1) {
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ if ((error = ptrace_set_pc (p,
+ (u_long)(uintfptr_t)uap->addr))) {
+ PRELE(p);
+ return error;
+ }
+ }
+ PRELE(p);
+
+ if (uap->req == PT_DETACH) {
+ /* reset process parent */
+ if (p->p_oppid != p->p_pptr->p_pid) {
+ struct proc *pp;
+
+ pp = pfind(p->p_oppid);
+ proc_reparent(p, pp ? pp : initproc);
+ }
+
+ p->p_flag &= ~(P_TRACED | P_WAITED);
+ p->p_oppid = 0;
+
+ /* should we send SIGCHLD? */
+
+ }
+
+ sendsig:
+ /* deliver or queue signal */
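+		/*
+		 * If the target is already stopped, pass the signal
+		 * number through p_xstat and let setrunnable() resume
+		 * it; otherwise post the signal with psignal().  The
+		 * splhigh() keeps p_stat stable across the check.
+		 */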
+ s = splhigh();
+ if (p->p_stat == SSTOP) {
+ p->p_xstat = uap->data;
+ setrunnable(p);
+ } else if (uap->data) {
+ psignal(p, uap->data);
+ }
+ splx(s);
+ return 0;
+
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ write = 1;
+ /* fallthrough */
+ case PT_READ_I:
+ case PT_READ_D:
+ /* write = 0 set above */
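+		/*
+		 * Stage the one-word transfer through a kernel iovec:
+		 * a write sources the word from uap->data, a read lands
+		 * it in curp->p_retval so it becomes the value returned
+		 * to the tracing process.
+		 */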
+ iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)curp->p_retval;
+ iov.iov_len = sizeof(int);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(uintptr_t)uap->addr;
+ uio.uio_resid = sizeof(int);
+ uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = p;
+ error = procfs_domem(curp, p, NULL, &uio);
+ if (uio.uio_resid != 0) {
+ /*
+ * XXX procfs_domem() doesn't currently return ENOSPC,
+ * so I think write() can bogusly return 0.
+ * XXX what happens for short writes? We don't want
+ * to write partial data.
+ * XXX procfs_domem() returns EPERM for other invalid
+ * addresses. Convert this to EINVAL. Does this
+ * clobber returns of EPERM for other reasons?
+ */
+ if (error == 0 || error == ENOSPC || error == EPERM)
+ error = EINVAL; /* EOF */
+ }
+ return (error);
+
+ case PT_READ_U:
+ if ((uintptr_t)uap->addr > UPAGES * PAGE_SIZE - sizeof(int)) {
+ return EFAULT;
+ }
+ if ((uintptr_t)uap->addr & (sizeof(int) - 1)) {
+ return EFAULT;
+ }
+ if (ptrace_read_u_check(p,(vm_offset_t) uap->addr,
+ sizeof(long)) &&
+ !procfs_kmemaccess(curp)) {
+ return EFAULT;
+ }
+ error = 0;
+ PHOLD(p); /* user had damn well better be incore! */
+ if (p->p_flag & P_INMEM) {
+ p->p_addr->u_kproc.kp_proc = *p;
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ curp->p_retval[0] = *(int *)
+ ((uintptr_t)p->p_addr + (uintptr_t)uap->addr);
+ } else {
+ curp->p_retval[0] = 0;
+ error = EFAULT;
+ }
+ PRELE(p);
+ return error;
+
+ case PT_WRITE_U:
+ PHOLD(p); /* user had damn well better be incore! */
+ if (p->p_flag & P_INMEM) {
+ p->p_addr->u_kproc.kp_proc = *p;
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data);
+ } else {
+ error = EFAULT;
+ }
+ PRELE(p);
+ return error;
+
+ case PT_KILL:
+ uap->data = SIGKILL;
+ goto sendsig; /* in PT_CONTINUE above */
+
+#ifdef PT_SETREGS
+ case PT_SETREGS:
+ write = 1;
+ /* fallthrough */
+#endif /* PT_SETREGS */
+#ifdef PT_GETREGS
+ case PT_GETREGS:
+ /* write = 0 above */
+#endif /* PT_GETREGS */
+#if defined(PT_SETREGS) || defined(PT_GETREGS)
+ if (!procfs_validregs(p)) /* no P_SYSTEM procs please */
+ return EINVAL;
+ else {
+ iov.iov_base = uap->addr;
+ iov.iov_len = sizeof(struct reg);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = sizeof(struct reg);
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = curp;
+ return (procfs_doregs(curp, p, NULL, &uio));
+ }
+#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */
+
+#ifdef PT_SETFPREGS
+ case PT_SETFPREGS:
+ write = 1;
+ /* fallthrough */
+#endif /* PT_SETFPREGS */
+#ifdef PT_GETFPREGS
+ case PT_GETFPREGS:
+ /* write = 0 above */
+#endif /* PT_GETFPREGS */
+#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS)
+ if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */
+ return EINVAL;
+ else {
+ iov.iov_base = uap->addr;
+ iov.iov_len = sizeof(struct fpreg);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = sizeof(struct fpreg);
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = curp;
+ return (procfs_dofpregs(curp, p, NULL, &uio));
+ }
+#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
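+/*
+ * trace_req() answers whether the process may be traced; this
+ * implementation unconditionally allows it.
+ */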
+int
+trace_req(p)
+ struct proc *p;
+{
+ return 1;
+}
+
+/*
+ * stopevent()
+ * Stop a process because of a procfs event;
+ * stay stopped until p->p_step is cleared
+ * (cleared by PIOCCONT in procfs).
+ */
+
+void
+stopevent(struct proc *p, unsigned int event, unsigned int val)
+{
+ p->p_step = 1;
+
+ do {
+ p->p_xstat = val;
+ p->p_stype = event; /* Which event caused the stop? */
+ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
+ tsleep(&p->p_step, PWAIT, "stopevent", 0);
+ } while (p->p_step);
+}
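+
+/*
+ * Sketch of the waking side (editorial illustration; the real code is
+ * in procfs): a PIOCWAIT handler sleeps on p->p_stype until the
+ * wakeup() above fires, and PIOCCONT releases the loop with:
+ *
+ *	p->p_step = 0;
+ *	wakeup(&p->p_step);
+ */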
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
new file mode 100644
index 0000000..8cf30cd
--- /dev/null
+++ b/sys/kern/sys_socket.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
+ * $Id: sys_socket.c,v 1.18 1998/06/07 17:11:40 dfr Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/filio.h> /* XXX */
+#include <sys/sockio.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/filedesc.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+static int soo_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int soo_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int soo_close __P((struct file *fp, struct proc *p));
+
+struct fileops socketops =
+ { soo_read, soo_write, soo_ioctl, soo_poll, soo_close };
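+
+/*
+ * socketops wires socket descriptors into the generic file layer:
+ * read(2) and write(2) on a socket arrive at soo_read()/soo_write(),
+ * which simply delegate to the protocol's pru_soreceive and pru_sosend
+ * hooks.
+ */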
+
+/* ARGSUSED */
+static int
+soo_read(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ return so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0);
+}
+
+/* ARGSUSED */
+static int
+soo_write(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ return so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0,
+ uio->uio_procp);
+}
+
+int
+soo_ioctl(fp, cmd, data, p)
+ struct file *fp;
+ u_long cmd;
+ register caddr_t data;
+ struct proc *p;
+{
+ register struct socket *so = (struct socket *)fp->f_data;
+
+ switch (cmd) {
+
+ case FIONBIO:
+ if (*(int *)data)
+ so->so_state |= SS_NBIO;
+ else
+ so->so_state &= ~SS_NBIO;
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ so->so_state |= SS_ASYNC;
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ so->so_snd.sb_flags |= SB_ASYNC;
+ } else {
+ so->so_state &= ~SS_ASYNC;
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ }
+ return (0);
+
+ case FIONREAD:
+ *(int *)data = so->so_rcv.sb_cc;
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &so->so_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(so->so_sigio);
+ return (0);
+
+ case SIOCSPGRP:
+ return (fsetown(-(*(int *)data), &so->so_sigio));
+
+ case SIOCGPGRP:
+ *(int *)data = -fgetown(so->so_sigio);
+ return (0);
+
+ case SIOCATMARK:
+ *(int *)data = (so->so_state&SS_RCVATMARK) != 0;
+ return (0);
+ }
+ /*
+ * Interface/routing/protocol specific ioctls:
+ * interface and routing ioctls should have a
+	 * different entry point since a socket is unnecessary.
+ */
+ if (IOCGROUP(cmd) == 'i')
+ return (ifioctl(so, cmd, data, p));
+ if (IOCGROUP(cmd) == 'r')
+ return (rtioctl(cmd, data, p));
+ return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, p));
+}
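+
+/*
+ * Usage sketch (editorial): the FIONBIO/FIOASYNC cases above service
+ * plain ioctl(2) calls from userland, e.g.:
+ *
+ *	int on = 1;
+ *	ioctl(s, FIONBIO, &on);		-- make the socket non-blocking
+ *	ioctl(s, FIOASYNC, &on);	-- request SIGIO on socket activity
+ */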
+
+int
+soo_poll(fp, events, cred, p)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct proc *p;
+{
+ struct socket *so = (struct socket *)fp->f_data;
+ return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, p);
+}
+
+int
+soo_stat(so, ub)
+ register struct socket *so;
+ register struct stat *ub;
+{
+
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFSOCK;
+ return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub));
+}
+
+/* ARGSUSED */
+static int
+soo_close(fp, p)
+ struct file *fp;
+ struct proc *p;
+{
+ int error = 0;
+
+ if (fp->f_data)
+ error = soclose((struct socket *)fp->f_data);
+ fp->f_data = 0;
+ return (error);
+}
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
new file mode 100644
index 0000000..22e9e8e
--- /dev/null
+++ b/sys/kern/syscalls.c
@@ -0,0 +1,347 @@
+/*
+ * System call names.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp
+ */
+
+char *syscallnames[] = {
+ "syscall", /* 0 = syscall */
+ "exit", /* 1 = exit */
+ "fork", /* 2 = fork */
+ "read", /* 3 = read */
+ "write", /* 4 = write */
+ "open", /* 5 = open */
+ "close", /* 6 = close */
+ "wait4", /* 7 = wait4 */
+ "old.creat", /* 8 = old creat */
+ "link", /* 9 = link */
+ "unlink", /* 10 = unlink */
+ "obs_execv", /* 11 = obsolete execv */
+ "chdir", /* 12 = chdir */
+ "fchdir", /* 13 = fchdir */
+ "mknod", /* 14 = mknod */
+ "chmod", /* 15 = chmod */
+ "chown", /* 16 = chown */
+ "break", /* 17 = break */
+ "getfsstat", /* 18 = getfsstat */
+ "old.lseek", /* 19 = old lseek */
+ "getpid", /* 20 = getpid */
+ "mount", /* 21 = mount */
+ "unmount", /* 22 = unmount */
+ "setuid", /* 23 = setuid */
+ "getuid", /* 24 = getuid */
+ "geteuid", /* 25 = geteuid */
+ "ptrace", /* 26 = ptrace */
+ "recvmsg", /* 27 = recvmsg */
+ "sendmsg", /* 28 = sendmsg */
+ "recvfrom", /* 29 = recvfrom */
+ "accept", /* 30 = accept */
+ "getpeername", /* 31 = getpeername */
+ "getsockname", /* 32 = getsockname */
+ "access", /* 33 = access */
+ "chflags", /* 34 = chflags */
+ "fchflags", /* 35 = fchflags */
+ "sync", /* 36 = sync */
+ "kill", /* 37 = kill */
+ "old.stat", /* 38 = old stat */
+ "getppid", /* 39 = getppid */
+ "old.lstat", /* 40 = old lstat */
+ "dup", /* 41 = dup */
+ "pipe", /* 42 = pipe */
+ "getegid", /* 43 = getegid */
+ "profil", /* 44 = profil */
+ "ktrace", /* 45 = ktrace */
+ "sigaction", /* 46 = sigaction */
+ "getgid", /* 47 = getgid */
+ "sigprocmask", /* 48 = sigprocmask */
+ "getlogin", /* 49 = getlogin */
+ "setlogin", /* 50 = setlogin */
+ "acct", /* 51 = acct */
+ "sigpending", /* 52 = sigpending */
+ "sigaltstack", /* 53 = sigaltstack */
+ "ioctl", /* 54 = ioctl */
+ "reboot", /* 55 = reboot */
+ "revoke", /* 56 = revoke */
+ "symlink", /* 57 = symlink */
+ "readlink", /* 58 = readlink */
+ "execve", /* 59 = execve */
+ "umask", /* 60 = umask */
+ "chroot", /* 61 = chroot */
+ "old.fstat", /* 62 = old fstat */
+ "old.getkerninfo", /* 63 = old getkerninfo */
+ "old.getpagesize", /* 64 = old getpagesize */
+ "msync", /* 65 = msync */
+ "vfork", /* 66 = vfork */
+ "obs_vread", /* 67 = obsolete vread */
+ "obs_vwrite", /* 68 = obsolete vwrite */
+ "sbrk", /* 69 = sbrk */
+ "sstk", /* 70 = sstk */
+ "old.mmap", /* 71 = old mmap */
+ "vadvise", /* 72 = vadvise */
+ "munmap", /* 73 = munmap */
+ "mprotect", /* 74 = mprotect */
+ "madvise", /* 75 = madvise */
+ "obs_vhangup", /* 76 = obsolete vhangup */
+ "obs_vlimit", /* 77 = obsolete vlimit */
+ "mincore", /* 78 = mincore */
+ "getgroups", /* 79 = getgroups */
+ "setgroups", /* 80 = setgroups */
+ "getpgrp", /* 81 = getpgrp */
+ "setpgid", /* 82 = setpgid */
+ "setitimer", /* 83 = setitimer */
+ "old.wait", /* 84 = old wait */
+ "swapon", /* 85 = swapon */
+ "getitimer", /* 86 = getitimer */
+ "old.gethostname", /* 87 = old gethostname */
+ "old.sethostname", /* 88 = old sethostname */
+ "getdtablesize", /* 89 = getdtablesize */
+ "dup2", /* 90 = dup2 */
+ "#91", /* 91 = getdopt */
+ "fcntl", /* 92 = fcntl */
+ "select", /* 93 = select */
+ "#94", /* 94 = setdopt */
+ "fsync", /* 95 = fsync */
+ "setpriority", /* 96 = setpriority */
+ "socket", /* 97 = socket */
+ "connect", /* 98 = connect */
+ "old.accept", /* 99 = old accept */
+ "getpriority", /* 100 = getpriority */
+ "old.send", /* 101 = old send */
+ "old.recv", /* 102 = old recv */
+ "sigreturn", /* 103 = sigreturn */
+ "bind", /* 104 = bind */
+ "setsockopt", /* 105 = setsockopt */
+ "listen", /* 106 = listen */
+ "obs_vtimes", /* 107 = obsolete vtimes */
+ "old.sigvec", /* 108 = old sigvec */
+ "old.sigblock", /* 109 = old sigblock */
+ "old.sigsetmask", /* 110 = old sigsetmask */
+ "sigsuspend", /* 111 = sigsuspend */
+ "old.sigstack", /* 112 = old sigstack */
+ "old.recvmsg", /* 113 = old recvmsg */
+ "old.sendmsg", /* 114 = old sendmsg */
+ "obs_vtrace", /* 115 = obsolete vtrace */
+ "gettimeofday", /* 116 = gettimeofday */
+ "getrusage", /* 117 = getrusage */
+ "getsockopt", /* 118 = getsockopt */
+ "#119", /* 119 = resuba */
+ "readv", /* 120 = readv */
+ "writev", /* 121 = writev */
+ "settimeofday", /* 122 = settimeofday */
+ "fchown", /* 123 = fchown */
+ "fchmod", /* 124 = fchmod */
+ "old.recvfrom", /* 125 = old recvfrom */
+ "setreuid", /* 126 = setreuid */
+ "setregid", /* 127 = setregid */
+ "rename", /* 128 = rename */
+ "old.truncate", /* 129 = old truncate */
+ "old.ftruncate", /* 130 = old ftruncate */
+ "flock", /* 131 = flock */
+ "mkfifo", /* 132 = mkfifo */
+ "sendto", /* 133 = sendto */
+ "shutdown", /* 134 = shutdown */
+ "socketpair", /* 135 = socketpair */
+ "mkdir", /* 136 = mkdir */
+ "rmdir", /* 137 = rmdir */
+ "utimes", /* 138 = utimes */
+ "obs_4.2", /* 139 = obsolete 4.2 sigreturn */
+ "adjtime", /* 140 = adjtime */
+ "old.getpeername", /* 141 = old getpeername */
+ "old.gethostid", /* 142 = old gethostid */
+ "old.sethostid", /* 143 = old sethostid */
+ "old.getrlimit", /* 144 = old getrlimit */
+ "old.setrlimit", /* 145 = old setrlimit */
+ "old.killpg", /* 146 = old killpg */
+ "setsid", /* 147 = setsid */
+ "quotactl", /* 148 = quotactl */
+ "old.quota", /* 149 = old quota */
+ "old.getsockname", /* 150 = old getsockname */
+ "#151", /* 151 = sem_lock */
+ "#152", /* 152 = sem_wakeup */
+ "#153", /* 153 = asyncdaemon */
+ "#154", /* 154 = nosys */
+ "nfssvc", /* 155 = nfssvc */
+ "old.getdirentries", /* 156 = old getdirentries */
+ "statfs", /* 157 = statfs */
+ "fstatfs", /* 158 = fstatfs */
+ "#159", /* 159 = nosys */
+ "#160", /* 160 = nosys */
+ "getfh", /* 161 = getfh */
+ "getdomainname", /* 162 = getdomainname */
+ "setdomainname", /* 163 = setdomainname */
+ "uname", /* 164 = uname */
+ "sysarch", /* 165 = sysarch */
+ "rtprio", /* 166 = rtprio */
+ "#167", /* 167 = nosys */
+ "#168", /* 168 = nosys */
+ "semsys", /* 169 = semsys */
+ "msgsys", /* 170 = msgsys */
+ "shmsys", /* 171 = shmsys */
+ "#172", /* 172 = nosys */
+ "#173", /* 173 = nosys */
+ "#174", /* 174 = nosys */
+ "#175", /* 175 = nosys */
+ "ntp_adjtime", /* 176 = ntp_adjtime */
+ "#177", /* 177 = sfork */
+ "#178", /* 178 = getdescriptor */
+ "#179", /* 179 = setdescriptor */
+ "#180", /* 180 = nosys */
+ "setgid", /* 181 = setgid */
+ "setegid", /* 182 = setegid */
+ "seteuid", /* 183 = seteuid */
+ "#184", /* 184 = lfs_bmapv */
+ "#185", /* 185 = lfs_markv */
+ "#186", /* 186 = lfs_segclean */
+ "#187", /* 187 = lfs_segwait */
+ "stat", /* 188 = stat */
+ "fstat", /* 189 = fstat */
+ "lstat", /* 190 = lstat */
+ "pathconf", /* 191 = pathconf */
+ "fpathconf", /* 192 = fpathconf */
+ "#193", /* 193 = nosys */
+ "getrlimit", /* 194 = getrlimit */
+ "setrlimit", /* 195 = setrlimit */
+ "getdirentries", /* 196 = getdirentries */
+ "mmap", /* 197 = mmap */
+ "__syscall", /* 198 = __syscall */
+ "lseek", /* 199 = lseek */
+ "truncate", /* 200 = truncate */
+ "ftruncate", /* 201 = ftruncate */
+ "__sysctl", /* 202 = __sysctl */
+ "mlock", /* 203 = mlock */
+ "munlock", /* 204 = munlock */
+ "undelete", /* 205 = undelete */
+ "futimes", /* 206 = futimes */
+ "getpgid", /* 207 = getpgid */
+ "#208", /* 208 = newreboot */
+ "poll", /* 209 = poll */
+ "lkmnosys", /* 210 = lkmnosys */
+ "lkmnosys", /* 211 = lkmnosys */
+ "lkmnosys", /* 212 = lkmnosys */
+ "lkmnosys", /* 213 = lkmnosys */
+ "lkmnosys", /* 214 = lkmnosys */
+ "lkmnosys", /* 215 = lkmnosys */
+ "lkmnosys", /* 216 = lkmnosys */
+ "lkmnosys", /* 217 = lkmnosys */
+ "lkmnosys", /* 218 = lkmnosys */
+ "lkmnosys", /* 219 = lkmnosys */
+ "__semctl", /* 220 = __semctl */
+ "semget", /* 221 = semget */
+ "semop", /* 222 = semop */
+ "semconfig", /* 223 = semconfig */
+ "msgctl", /* 224 = msgctl */
+ "msgget", /* 225 = msgget */
+ "msgsnd", /* 226 = msgsnd */
+ "msgrcv", /* 227 = msgrcv */
+ "shmat", /* 228 = shmat */
+ "shmctl", /* 229 = shmctl */
+ "shmdt", /* 230 = shmdt */
+ "shmget", /* 231 = shmget */
+ "clock_gettime", /* 232 = clock_gettime */
+ "clock_settime", /* 233 = clock_settime */
+ "clock_getres", /* 234 = clock_getres */
+ "#235", /* 235 = timer_create */
+ "#236", /* 236 = timer_delete */
+ "#237", /* 237 = timer_settime */
+ "#238", /* 238 = timer_gettime */
+ "#239", /* 239 = timer_getoverrun */
+ "nanosleep", /* 240 = nanosleep */
+ "#241", /* 241 = nosys */
+ "#242", /* 242 = nosys */
+ "#243", /* 243 = nosys */
+ "#244", /* 244 = nosys */
+ "#245", /* 245 = nosys */
+ "#246", /* 246 = nosys */
+ "#247", /* 247 = nosys */
+ "#248", /* 248 = nosys */
+ "#249", /* 249 = nosys */
+ "minherit", /* 250 = minherit */
+ "rfork", /* 251 = rfork */
+ "openbsd_poll", /* 252 = openbsd_poll */
+ "issetugid", /* 253 = issetugid */
+ "lchown", /* 254 = lchown */
+ "#255", /* 255 = nosys */
+ "#256", /* 256 = nosys */
+ "#257", /* 257 = nosys */
+ "#258", /* 258 = nosys */
+ "#259", /* 259 = nosys */
+ "#260", /* 260 = nosys */
+ "#261", /* 261 = nosys */
+ "#262", /* 262 = nosys */
+ "#263", /* 263 = nosys */
+ "#264", /* 264 = nosys */
+ "#265", /* 265 = nosys */
+ "#266", /* 266 = nosys */
+ "#267", /* 267 = nosys */
+ "#268", /* 268 = nosys */
+ "#269", /* 269 = nosys */
+ "#270", /* 270 = nosys */
+ "#271", /* 271 = nosys */
+ "getdents", /* 272 = getdents */
+ "#273", /* 273 = nosys */
+ "lchmod", /* 274 = lchmod */
+ "netbsd_lchown", /* 275 = netbsd_lchown */
+ "lutimes", /* 276 = lutimes */
+ "netbsd_msync", /* 277 = netbsd_msync */
+ "nstat", /* 278 = nstat */
+ "nfstat", /* 279 = nfstat */
+ "nlstat", /* 280 = nlstat */
+ "#281", /* 281 = nosys */
+ "#282", /* 282 = nosys */
+ "#283", /* 283 = nosys */
+ "#284", /* 284 = nosys */
+ "#285", /* 285 = nosys */
+ "#286", /* 286 = nosys */
+ "#287", /* 287 = nosys */
+ "#288", /* 288 = nosys */
+ "#289", /* 289 = nosys */
+ "#290", /* 290 = nosys */
+ "#291", /* 291 = nosys */
+ "#292", /* 292 = nosys */
+ "#293", /* 293 = nosys */
+ "#294", /* 294 = nosys */
+ "#295", /* 295 = nosys */
+ "#296", /* 296 = nosys */
+ "#297", /* 297 = nosys */
+ "#298", /* 298 = nosys */
+ "#299", /* 299 = nosys */
+ "modnext", /* 300 = modnext */
+ "modstat", /* 301 = modstat */
+ "modfnext", /* 302 = modfnext */
+ "modfind", /* 303 = modfind */
+ "kldload", /* 304 = kldload */
+ "kldunload", /* 305 = kldunload */
+ "kldfind", /* 306 = kldfind */
+ "kldnext", /* 307 = kldnext */
+ "kldstat", /* 308 = kldstat */
+ "kldfirstmod", /* 309 = kldfirstmod */
+ "getsid", /* 310 = getsid */
+ "#311", /* 311 = setresuid */
+ "#312", /* 312 = setresgid */
+ "obs_signanosleep", /* 313 = obsolete signanosleep */
+ "aio_return", /* 314 = aio_return */
+ "aio_suspend", /* 315 = aio_suspend */
+ "aio_cancel", /* 316 = aio_cancel */
+ "aio_error", /* 317 = aio_error */
+ "aio_read", /* 318 = aio_read */
+ "aio_write", /* 319 = aio_write */
+ "lio_listio", /* 320 = lio_listio */
+ "yield", /* 321 = yield */
+ "thr_sleep", /* 322 = thr_sleep */
+ "thr_wakeup", /* 323 = thr_wakeup */
+ "mlockall", /* 324 = mlockall */
+ "munlockall", /* 325 = munlockall */
+ "__getcwd", /* 326 = __getcwd */
+ "sched_setparam", /* 327 = sched_setparam */
+ "sched_getparam", /* 328 = sched_getparam */
+ "sched_setscheduler", /* 329 = sched_setscheduler */
+ "sched_getscheduler", /* 330 = sched_getscheduler */
+ "sched_yield", /* 331 = sched_yield */
+ "sched_get_priority_max", /* 332 = sched_get_priority_max */
+ "sched_get_priority_min", /* 333 = sched_get_priority_min */
+ "sched_rr_get_interval", /* 334 = sched_rr_get_interval */
+ "utrace", /* 335 = utrace */
+ "sendfile", /* 336 = sendfile */
+ "kldsym", /* 337 = kldsym */
+};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
new file mode 100644
index 0000000..6772363
--- /dev/null
+++ b/sys/kern/syscalls.master
@@ -0,0 +1,473 @@
+ $Id: syscalls.master,v 1.54 1998/11/05 14:28:24 dg Exp $
+; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
+;
+; System call name/number master file.
+; Processed to create init_sysent.c, syscalls.c and syscall.h.
+
+; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments
+; number system call number, must be in order
+;	type	one of STD, OBSOL, UNIMPL, COMPAT, or one of the other
+;		types used below (LIBCOMPAT, CPT_NOA, NODEF, NOIMPL, NOPROTO)
+; namespc one of POSIX, BSD, NOHIDE
+;	name	pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
+; types:
+; STD always included
+; COMPAT included on COMPAT #ifdef
+; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+
+; #ifdef's, etc. may be included, and are copied to the output files.
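+;
+; Example (editorial illustration): an entry such as
+;	3	STD	POSIX	{ ssize_t read(int fd, void *buf, size_t nbyte); }
+; produces a `read_args' argument structure and a sysent[3] slot in
+; init_sysent.c, the "read" string in syscalls.c, and the SYS_read
+; number in syscall.h.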
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+; Reserved/unimplemented system calls in the range 0-150 inclusive
+; are reserved for use in future Berkeley releases.
+; Additional system calls implemented in vendor and other
+; redistributions should be placed in the reserved range at the end
+; of the current calls.
+
+0 STD NOHIDE { int nosys(void); } syscall nosys_args int
+1 STD NOHIDE { void exit(int rval); } exit rexit_args void
+2 STD POSIX { int fork(void); }
+3 STD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); }
+4 STD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); }
+5 STD POSIX { int open(char *path, int flags, int mode); }
+; XXX should be { int open(const char *path, int flags, ...); }
+; but we're not ready for `const' or varargs.
+; XXX man page says `mode_t mode'.
+6 STD POSIX { int close(int fd); }
+7 STD BSD { int wait4(int pid, int *status, int options, \
+ struct rusage *rusage); } wait4 wait_args int
+8 COMPAT BSD { int creat(char *path, int mode); }
+9 STD POSIX { int link(char *path, char *link); }
+10 STD POSIX { int unlink(char *path); }
+11 OBSOL NOHIDE execv
+12 STD POSIX { int chdir(char *path); }
+13 STD BSD { int fchdir(int fd); }
+14 STD POSIX { int mknod(char *path, int mode, int dev); }
+15 STD POSIX { int chmod(char *path, int mode); }
+16 STD POSIX { int chown(char *path, int uid, int gid); }
+17 STD BSD { int obreak(char *nsize); } break obreak_args int
+18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \
+ int flags); }
+19 COMPAT POSIX { long lseek(int fd, long offset, int whence); }
+20 STD POSIX { pid_t getpid(void); }
+21 STD BSD { int mount(char *type, char *path, int flags, \
+ caddr_t data); }
+; XXX 4.4lite2 uses `char *type' but we're not ready for that.
+; XXX `path' should have type `const char *' but we're not ready for that.
+22 STD BSD { int unmount(char *path, int flags); }
+23 STD POSIX { int setuid(uid_t uid); }
+24 STD POSIX { uid_t getuid(void); }
+25 STD POSIX { uid_t geteuid(void); }
+26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \
+ int data); }
+27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); }
+28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); }
+29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t from, int *fromlenaddr); }
+30 STD BSD { int accept(int s, caddr_t name, int *anamelen); }
+31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); }
+33 STD POSIX { int access(char *path, int flags); }
+34 STD BSD { int chflags(char *path, int flags); }
+35 STD BSD { int fchflags(int fd, int flags); }
+36 STD BSD { int sync(void); }
+37 STD POSIX { int kill(int pid, int signum); }
+38 COMPAT POSIX { int stat(char *path, struct ostat *ub); }
+39 STD POSIX { pid_t getppid(void); }
+40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); }
+41 STD POSIX { int dup(u_int fd); }
+42 STD POSIX { int pipe(void); }
+43 STD POSIX { gid_t getegid(void); }
+44 STD BSD { int profil(caddr_t samples, size_t size, \
+ size_t offset, u_int scale); }
+45 STD BSD { int ktrace(char *fname, int ops, int facs, \
+ int pid); }
+46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \
+ struct sigaction *osa); }
+47 STD POSIX { gid_t getgid(void); }
+48 STD POSIX { int sigprocmask(int how, sigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it, and we return the old mask as the
+; (int) return value.
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); }
+50 STD BSD { int setlogin(char *namebuf); }
+51 STD BSD { int acct(char *path); }
+52 STD POSIX { int sigpending(void); }
+53 STD BSD { int sigaltstack(struct sigaltstack *nss, \
+ struct sigaltstack *oss); }
+54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); }
+55 STD BSD { int reboot(int opt); }
+56 STD POSIX { int revoke(char *path); }
+57 STD POSIX { int symlink(char *path, char *link); }
+58 STD POSIX { int readlink(char *path, char *buf, int count); }
+59 STD POSIX { int execve(char *fname, char **argv, char **envv); }
+60 STD POSIX { int umask(int newmask); } umask umask_args int
+61 STD BSD { int chroot(char *path); }
+62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); }
+63 COMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \
+ int arg); } getkerninfo getkerninfo_args int
+64 COMPAT BSD { int getpagesize(void); } \
+ getpagesize getpagesize_args int
+65 STD BSD { int msync(void *addr, size_t len, int flags); }
+66 STD BSD { int vfork(void); }
+67 OBSOL NOHIDE vread
+68 OBSOL NOHIDE vwrite
+69 STD BSD { int sbrk(int incr); }
+70 STD BSD { int sstk(int incr); }
+71 COMPAT BSD { int mmap(void *addr, int len, int prot, \
+ int flags, int fd, long pos); }
+72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int
+73 STD BSD { int munmap(void *addr, size_t len); }
+74 STD BSD { int mprotect(const void *addr, size_t len, int prot); }
+75 STD BSD { int madvise(void *addr, size_t len, int behav); }
+76 OBSOL NOHIDE vhangup
+77 OBSOL NOHIDE vlimit
+78 STD BSD { int mincore(const void *addr, size_t len, \
+ char *vec); }
+79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); }
+80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); }
+81 STD POSIX { int getpgrp(void); }
+82 STD POSIX { int setpgid(int pid, int pgid); }
+83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \
+ struct itimerval *oitv); }
+84 COMPAT BSD { int wait(void); }
+85 STD BSD { int swapon(char *name); }
+86 STD BSD { int getitimer(u_int which, struct itimerval *itv); }
+87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \
+ gethostname gethostname_args int
+88 COMPAT BSD { int sethostname(char *hostname, u_int len); } \
+ sethostname sethostname_args int
+89 STD BSD { int getdtablesize(void); }
+90 STD POSIX { int dup2(u_int from, u_int to); }
+91 UNIMPL BSD getdopt
+92 STD POSIX { int fcntl(int fd, int cmd, long arg); }
+; XXX should be { int fcntl(int fd, int cmd, ...); }
+; but we're not ready for varargs.
+; XXX man page says `int arg' too.
+93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \
+ fd_set *ex, struct timeval *tv); }
+94 UNIMPL BSD setdopt
+95 STD POSIX { int fsync(int fd); }
+96 STD BSD { int setpriority(int which, int who, int prio); }
+97 STD BSD { int socket(int domain, int type, int protocol); }
+98 STD BSD { int connect(int s, caddr_t name, int namelen); }
+99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \
+ accept accept_args int
+100 STD BSD { int getpriority(int which, int who); }
+101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); }
+102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); }
+103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); }
+104 STD BSD { int bind(int s, caddr_t name, int namelen); }
+105 STD BSD { int setsockopt(int s, int level, int name, \
+ caddr_t val, int valsize); }
+106 STD BSD { int listen(int s, int backlog); }
+107 OBSOL NOHIDE vtimes
+108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \
+ struct sigvec *osv); }
+109 COMPAT BSD { int sigblock(int mask); }
+110 COMPAT BSD { int sigsetmask(int mask); }
+111 STD POSIX { int sigsuspend(sigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it.
+112 COMPAT BSD { int sigstack(struct sigstack *nss, \
+ struct sigstack *oss); }
+113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); }
+114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); }
+115 OBSOL NOHIDE vtrace
+116 STD BSD { int gettimeofday(struct timeval *tp, \
+ struct timezone *tzp); }
+117 STD BSD { int getrusage(int who, struct rusage *rusage); }
+118 STD BSD { int getsockopt(int s, int level, int name, \
+ caddr_t val, int *avalsize); }
+119 UNIMPL NOHIDE resuba (BSD/OS 2.x)
+120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); }
+121 STD BSD { int writev(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+122 STD BSD { int settimeofday(struct timeval *tv, \
+ struct timezone *tzp); }
+123 STD BSD { int fchown(int fd, int uid, int gid); }
+124 STD BSD { int fchmod(int fd, int mode); }
+125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t from, int *fromlenaddr); } \
+ recvfrom recvfrom_args int
+126 STD BSD { int setreuid(int ruid, int euid); }
+127 STD BSD { int setregid(int rgid, int egid); }
+128 STD POSIX { int rename(char *from, char *to); }
+129 COMPAT BSD { int truncate(char *path, long length); }
+130 COMPAT BSD { int ftruncate(int fd, long length); }
+131 STD BSD { int flock(int fd, int how); }
+132 STD POSIX { int mkfifo(char *path, int mode); }
+133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t to, int tolen); }
+134 STD BSD { int shutdown(int s, int how); }
+135 STD BSD { int socketpair(int domain, int type, int protocol, \
+ int *rsv); }
+136 STD POSIX { int mkdir(char *path, int mode); }
+137 STD POSIX { int rmdir(char *path); }
+138 STD BSD { int utimes(char *path, struct timeval *tptr); }
+139 OBSOL NOHIDE 4.2 sigreturn
+140 STD BSD { int adjtime(struct timeval *delta, \
+ struct timeval *olddelta); }
+141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+142 COMPAT BSD { long gethostid(void); }
+143 COMPAT BSD { int sethostid(long hostid); }
+144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); }
+145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); }
+146 COMPAT BSD { int killpg(int pgid, int signum); }
+147 STD POSIX { int setsid(void); }
+148 STD BSD { int quotactl(char *path, int cmd, int uid, \
+ caddr_t arg); }
+149 COMPAT BSD { int quota(void); }
+150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\
+ getsockname getsockname_args int
+
+; Syscalls 151-180 inclusive are reserved for vendor-specific
+; system calls. (This includes various calls added for compatibility
+; with other Unix variants.)
+; Some of these calls are now supported by BSD...
+151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x)
+152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x)
+153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x)
+154 UNIMPL NOHIDE nosys
+; 155 is initialized by the NFS code, if present.
+155 NOIMPL BSD { int nfssvc(int flag, caddr_t argp); }
+156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \
+ long *basep); }
+157 STD BSD { int statfs(char *path, struct statfs *buf); }
+158 STD BSD { int fstatfs(int fd, struct statfs *buf); }
+159 UNIMPL NOHIDE nosys
+160 UNIMPL NOHIDE nosys
+; 161 is initialized by the NFS code, if present.
+161 NOIMPL BSD { int getfh(char *fname, struct fhandle *fhp); }
+162 STD BSD { int getdomainname(char *domainname, int len); }
+163 STD BSD { int setdomainname(char *domainname, int len); }
+164 STD BSD { int uname(struct utsname *name); }
+165 STD BSD { int sysarch(int op, char *parms); }
+166 STD BSD { int rtprio(int function, pid_t pid, \
+ struct rtprio *rtp); }
+167 UNIMPL NOHIDE nosys
+168 UNIMPL NOHIDE nosys
+169 STD BSD { int semsys(int which, int a2, int a3, int a4, \
+ int a5); }
+; XXX should be { int semsys(int which, ...); }
+170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \
+ int a5, int a6); }
+; XXX should be { int msgsys(int which, ...); }
+171 STD BSD { int shmsys(int which, int a2, int a3, int a4); }
+; XXX should be { int shmsys(int which, ...); }
+172 UNIMPL NOHIDE nosys
+173 UNIMPL NOHIDE nosys
+174 UNIMPL NOHIDE nosys
+175 UNIMPL NOHIDE nosys
+176 STD BSD { int ntp_adjtime(struct timex *tp); }
+177 UNIMPL NOHIDE sfork (BSD/OS 2.x)
+178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x)
+179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x)
+180 UNIMPL NOHIDE nosys
+
+; Syscalls 180-199 are used by/reserved for BSD
+181 STD POSIX { int setgid(gid_t gid); }
+182 STD BSD { int setegid(gid_t egid); }
+183 STD BSD { int seteuid(uid_t euid); }
+184 UNIMPL BSD lfs_bmapv
+185 UNIMPL BSD lfs_markv
+186 UNIMPL BSD lfs_segclean
+187 UNIMPL BSD lfs_segwait
+188 STD POSIX { int stat(char *path, struct stat *ub); }
+189 STD POSIX { int fstat(int fd, struct stat *sb); }
+190 STD POSIX { int lstat(char *path, struct stat *ub); }
+191 STD POSIX { int pathconf(char *path, int name); }
+192 STD POSIX { int fpathconf(int fd, int name); }
+193 UNIMPL NOHIDE nosys
+194 STD BSD { int getrlimit(u_int which, \
+ struct orlimit *rlp); } \
+ getrlimit __getrlimit_args int
+195 STD BSD { int setrlimit(u_int which, \
+ struct orlimit *rlp); } \
+ setrlimit __setrlimit_args int
+196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \
+ long *basep); }
+197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \
+ int flags, int fd, long pad, off_t pos); }
+198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int
+199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \
+ int whence); }
+200 STD BSD { int truncate(char *path, int pad, off_t length); }
+201 STD BSD { int ftruncate(int fd, int pad, off_t length); }
+202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \
+ size_t *oldlenp, void *new, size_t newlen); } \
+ __sysctl sysctl_args int
+; properly, __sysctl should be a NOHIDE, but making an exception
+; here allows us to avoid one in libc/sys/Makefile.inc.
+203 STD BSD { int mlock(const void *addr, size_t len); }
+204 STD BSD { int munlock(const void *addr, size_t len); }
+205 STD BSD { int undelete(char *path); }
+206 STD BSD { int futimes(int fd, struct timeval *tptr); }
+207 STD BSD { int getpgid(pid_t pid); }
+208 UNIMPL NOHIDE newreboot (NetBSD)
+209 STD BSD { int poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+
+;
+; The following are reserved for loadable syscalls
+;
+210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+
+;
+; The following were introduced with NetBSD/4.4Lite-2
+;
+220 STD BSD { int __semctl(int semid, int semnum, int cmd, \
+ union semun *arg); }
+221 STD BSD { int semget(key_t key, int nsems, int semflg); }
+222 STD BSD { int semop(int semid, struct sembuf *sops, \
+ u_int nsops); }
+223 STD BSD { int semconfig(int flag); }
+224 STD BSD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds *buf); }
+225 STD BSD { int msgget(key_t key, int msgflg); }
+226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \
+ int msgflg); }
+227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \
+ long msgtyp, int msgflg); }
+228 STD BSD { int shmat(int shmid, void *shmaddr, int shmflg); }
+229 STD BSD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds *buf); }
+230 STD BSD { int shmdt(void *shmaddr); }
+231 STD BSD { int shmget(key_t key, int size, int shmflg); }
+;
+232 STD POSIX { int clock_gettime(clockid_t clock_id, \
+ struct timespec *tp); }
+233 STD POSIX { int clock_settime(clockid_t clock_id, \
+ const struct timespec *tp); }
+234 STD POSIX { int clock_getres(clockid_t clock_id, \
+ struct timespec *tp); }
+235 UNIMPL NOHIDE timer_create
+236 UNIMPL NOHIDE timer_delete
+237 UNIMPL NOHIDE timer_settime
+238 UNIMPL NOHIDE timer_gettime
+239 UNIMPL NOHIDE timer_getoverrun
+240 STD POSIX { int nanosleep(const struct timespec *rqtp, \
+ struct timespec *rmtp); }
+241 UNIMPL NOHIDE nosys
+242 UNIMPL NOHIDE nosys
+243 UNIMPL NOHIDE nosys
+244 UNIMPL NOHIDE nosys
+245 UNIMPL NOHIDE nosys
+246 UNIMPL NOHIDE nosys
+247 UNIMPL NOHIDE nosys
+248 UNIMPL NOHIDE nosys
+249 UNIMPL NOHIDE nosys
+; syscall numbers initially used in OpenBSD
+250 STD BSD { int minherit(void *addr, size_t len, int inherit); }
+251 STD BSD { int rfork(int flags); }
+252 STD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+253 STD BSD { int issetugid(void); }
+254 STD BSD { int lchown(char *path, int uid, int gid); }
+255 UNIMPL NOHIDE nosys
+256 UNIMPL NOHIDE nosys
+257 UNIMPL NOHIDE nosys
+258 UNIMPL NOHIDE nosys
+259 UNIMPL NOHIDE nosys
+260 UNIMPL NOHIDE nosys
+261 UNIMPL NOHIDE nosys
+262 UNIMPL NOHIDE nosys
+263 UNIMPL NOHIDE nosys
+264 UNIMPL NOHIDE nosys
+265 UNIMPL NOHIDE nosys
+266 UNIMPL NOHIDE nosys
+267 UNIMPL NOHIDE nosys
+268 UNIMPL NOHIDE nosys
+269 UNIMPL NOHIDE nosys
+270 UNIMPL NOHIDE nosys
+271 UNIMPL NOHIDE nosys
+272 STD BSD { int getdents(int fd, char *buf, size_t count); }
+273 UNIMPL NOHIDE nosys
+274 STD BSD { int lchmod(char *path, mode_t mode); }
+275 NOPROTO BSD { int lchown(char *path, uid_t uid, gid_t gid); } netbsd_lchown netbsd_lchown int
+276 STD BSD { int lutimes(char *path, struct timeval *tptr); }
+277 NOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync netbsd_msync int
+278 STD BSD { int nstat(char *path, struct nstat *ub); }
+279 STD BSD { int nfstat(int fd, struct nstat *sb); }
+280 STD BSD { int nlstat(char *path, struct nstat *ub); }
+281 UNIMPL NOHIDE nosys
+282 UNIMPL NOHIDE nosys
+283 UNIMPL NOHIDE nosys
+284 UNIMPL NOHIDE nosys
+285 UNIMPL NOHIDE nosys
+286 UNIMPL NOHIDE nosys
+287 UNIMPL NOHIDE nosys
+288 UNIMPL NOHIDE nosys
+289 UNIMPL NOHIDE nosys
+290 UNIMPL NOHIDE nosys
+291 UNIMPL NOHIDE nosys
+292 UNIMPL NOHIDE nosys
+293 UNIMPL NOHIDE nosys
+294 UNIMPL NOHIDE nosys
+295 UNIMPL NOHIDE nosys
+296 UNIMPL NOHIDE nosys
+297 UNIMPL NOHIDE nosys
+298 UNIMPL NOHIDE nosys
+299 UNIMPL NOHIDE nosys
+; syscall numbers for FreeBSD
+300 STD BSD { int modnext(int modid); }
+301 STD BSD { int modstat(int modid, struct module_stat* stat); }
+302 STD BSD { int modfnext(int modid); }
+303 STD BSD { int modfind(char *name); }
+304 STD BSD { int kldload(const char *file); }
+305 STD BSD { int kldunload(int fileid); }
+306 STD BSD { int kldfind(const char *file); }
+307 STD BSD { int kldnext(int fileid); }
+308 STD BSD { int kldstat(int fileid, struct kld_file_stat* stat); }
+309 STD BSD { int kldfirstmod(int fileid); }
+310 STD BSD { int getsid(pid_t pid); }
+311 UNIMPL NOHIDE setresuid
+312 UNIMPL NOHIDE setresgid
+313 OBSOL NOHIDE signanosleep
+314 STD BSD { int aio_return(struct aiocb *aiocbp); }
+315 STD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); }
+316 STD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); }
+317 STD BSD { int aio_error(struct aiocb *aiocbp); }
+318 STD BSD { int aio_read(struct aiocb *aiocbp); }
+319 STD BSD { int aio_write(struct aiocb *aiocbp); }
+320 STD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); }
+321 STD BSD { int yield(void); }
+322 STD BSD { int thr_sleep(const struct timespec *timeout); }
+323 STD BSD { int thr_wakeup(pid_t pid); }
+324 STD BSD { int mlockall(int how); }
+325 STD BSD { int munlockall(void); }
+326 STD BSD { int __getcwd(u_char *buf, u_int buflen); }
+
+327 STD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); }
+328 STD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); }
+
+329 STD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); }
+330 STD POSIX { int sched_getscheduler (pid_t pid); }
+
+331 STD POSIX { int sched_yield (void); }
+332 STD POSIX { int sched_get_priority_max (int policy); }
+333 STD POSIX { int sched_get_priority_min (int policy); }
+334 STD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); }
+335 STD BSD { int utrace(caddr_t addr, size_t len); }
+336 STD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \
+ struct sf_hdtr *hdtr, off_t *sbytes, int flags); }
+337 STD BSD { int kldsym(int fileid, int cmd, void *data); }
diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c
new file mode 100644
index 0000000..553c213
--- /dev/null
+++ b/sys/kern/sysv_ipc.c
@@ -0,0 +1,283 @@
+/* $Id: sysv_ipc.c,v 1.7 1997/11/06 19:29:22 phk Exp $ */
+/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
+
+/*
+ * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Herb Peyerl.
+ * 4. The name of Herb Peyerl may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/ipc.h>
+#include <sys/ucred.h>
+
+#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG)
+
+/*
+ * Check for ipc permission
+ */
+
+int
+ipcperm(cred, perm, mode)
+ struct ucred *cred;
+ struct ipc_perm *perm;
+ int mode;
+{
+
+ if (cred->cr_uid == 0)
+ return (0);
+
+ /* Check for user match. */
+ if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) {
+ if (mode & IPC_M)
+ return (EPERM);
+ /* Check for group match. */
+ mode >>= 3;
+ if (!groupmember(perm->gid, cred) &&
+ !groupmember(perm->cgid, cred))
+ /* Check for `other' match. */
+ mode >>= 3;
+ }
+
+ if (mode & IPC_M)
+ return (0);
+ return ((mode & perm->mode) == mode ? 0 : EACCES);
+}
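+
+/*
+ * Worked example (editorial): with perm->mode == 0640, an IPC_R (0400)
+ * request from a non-owning group member is shifted once to 0040,
+ * which 0640 grants; a request from an unrelated user shifts twice to
+ * 0004, which 0640 denies, so EACCES is returned.
+ */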
+
+#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */
+
+
+#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG)
+
+#include <sys/proc.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+
+static void sysv_nosys __P((struct proc *p, char *s));
+
+static void
+sysv_nosys(p, s)
+ struct proc *p;
+ char *s;
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ p->p_comm, p->p_pid, s);
+}
+
+#if !defined(SYSVSEM)
+
+/*
+ * SYSVSEM stubs
+ */
+
+int
+semsys(p, uap)
+ struct proc *p;
+ struct semsys_args *uap;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+semconfig(p, uap)
+ struct proc *p;
+ struct semconfig_args *uap;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+__semctl(p, uap)
+ struct proc *p;
+ register struct __semctl_args *uap;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+semget(p, uap)
+ struct proc *p;
+ register struct semget_args *uap;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+semop(p, uap)
+ struct proc *p;
+ register struct semop_args *uap;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+/* called from kern_exit.c */
+void
+semexit(p)
+ struct proc *p;
+{
+ return;
+}
+
+#endif /* !defined(SYSVSEM) */
+
+
+#if !defined(SYSVMSG)
+
+/*
+ * SYSVMSG stubs
+ */
+
+int
+msgsys(p, uap)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct msgsys_args *uap;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+msgctl(p, uap)
+ struct proc *p;
+ register struct msgctl_args *uap;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+msgget(p, uap)
+ struct proc *p;
+ register struct msgget_args *uap;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+msgsnd(p, uap)
+ struct proc *p;
+ register struct msgsnd_args *uap;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+msgrcv(p, uap)
+ struct proc *p;
+ register struct msgrcv_args *uap;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+#endif /* !defined(SYSVMSG) */
+
+
+#if !defined(SYSVSHM)
+
+/*
+ * SYSVSHM stubs
+ */
+
+int
+shmdt(p, uap)
+ struct proc *p;
+ struct shmdt_args *uap;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+shmat(p, uap)
+ struct proc *p;
+ struct shmat_args *uap;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+shmctl(p, uap)
+ struct proc *p;
+ struct shmctl_args *uap;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+shmget(p, uap)
+ struct proc *p;
+ struct shmget_args *uap;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+int
+shmsys(p, uap)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct shmsys_args *uap;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap);
+}
+
+/* called from kern_fork.c */
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+ return;
+}
+
+/* called from kern_exit.c */
+void
+shmexit(p)
+ struct proc *p;
+{
+ return;
+}
+
+#endif /* !defined(SYSVSHM) */
+
+#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
new file mode 100644
index 0000000..d3b8a98
--- /dev/null
+++ b/sys/kern/sysv_msg.c
@@ -0,0 +1,1027 @@
+/* $Id: sysv_msg.c,v 1.17 1997/11/06 19:29:24 phk Exp $ */
+
+/*
+ * Implementation of SVID messages
+ *
+ * Author: Daniel Boulet
+ *
+ * Copyright 1993 Daniel Boulet and RTMX Inc.
+ *
+ * This system call was implemented by Daniel Boulet under contract from RTMX.
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/msg.h>
+#include <sys/sysent.h>
+
+static void msginit __P((void *));
+SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL)
+
+#define MSG_DEBUG
+#undef MSG_DEBUG_OK
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args;
+int msgctl __P((struct proc *p, struct msgctl_args *uap));
+struct msgget_args;
+int msgget __P((struct proc *p, struct msgget_args *uap));
+struct msgsnd_args;
+int msgsnd __P((struct proc *p, struct msgsnd_args *uap));
+struct msgrcv_args;
+int msgrcv __P((struct proc *p, struct msgrcv_args *uap));
+#endif
+static void msg_freehdr __P((struct msg *msghdr));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *msgcalls[] = {
+ (sy_call_t *)msgctl, (sy_call_t *)msgget,
+ (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
+};
+
+static int nfree_msgmaps; /* # of free map entries */
+static short free_msgmaps; /* head of linked list of free map entries */
+static struct msg *free_msghdrs; /* list of free msg headers */
+char *msgpool; /* MSGMAX byte long msg buffer pool */
+struct msgmap *msgmaps; /* MSGSEG msgmap structures */
+struct msg *msghdrs; /* MSGTQL msg headers */
+struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */
+
+static void
+msginit(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * msginfo.msgssz should be a power of two for efficiency reasons.
+ * It is also pretty silly if msginfo.msgssz is less than 8
+ * or greater than about 256 so ...
+ */
+
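+	/* (i.e. the loop below accepts any power of 2 from 8 to 1024) */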
+ i = 8;
+ while (i < 1024 && i != msginfo.msgssz)
+ i <<= 1;
+ if (i != msginfo.msgssz) {
+ printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
+ msginfo.msgssz);
+ panic("msginfo.msgssz not a small power of 2");
+ }
+
+ if (msginfo.msgseg > 32767) {
+ printf("msginfo.msgseg=%d\n", msginfo.msgseg);
+ panic("msginfo.msgseg > 32767");
+ }
+
+ if (msgmaps == NULL)
+ panic("msgmaps is NULL");
+
+ for (i = 0; i < msginfo.msgseg; i++) {
+ if (i > 0)
+ msgmaps[i-1].next = i;
+ msgmaps[i].next = -1; /* implies entry is available */
+ }
+ free_msgmaps = 0;
+ nfree_msgmaps = msginfo.msgseg;
+
+ if (msghdrs == NULL)
+ panic("msghdrs is NULL");
+
+ for (i = 0; i < msginfo.msgtql; i++) {
+ msghdrs[i].msg_type = 0;
+ if (i > 0)
+ msghdrs[i-1].msg_next = &msghdrs[i];
+ msghdrs[i].msg_next = NULL;
+ }
+ free_msghdrs = &msghdrs[0];
+
+ if (msqids == NULL)
+ panic("msqids is NULL");
+
+ for (i = 0; i < msginfo.msgmni; i++) {
+ msqids[i].msg_qbytes = 0; /* implies entry is available */
+ msqids[i].msg_perm.seq = 0; /* reset to a known value */
+ }
+}
+
+/*
+ * Entry point for all MSG calls
+ */
+int
+msgsys(p, uap)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct msgsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ int a6;
+ } */ *uap;
+{
+
+ if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
+ return (EINVAL);
+ return ((*msgcalls[uap->which])(p, &uap->a2));
+}
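+
+/*
+ * Note (editorial): &uap->a2 above is handed to the sub-call as the
+ * start of its own args structure, so the argument words a2..a6 are
+ * reinterpreted as, e.g., a msgctl_args.  This relies on the
+ * sequential layout of the syscall argument words.
+ */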
+
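+/*
+ * Return every segment of a message (threaded through msg_spot and
+ * msgmaps[].next) to the free map, then put the header back on the
+ * free header list.
+ */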
+static void
+msg_freehdr(msghdr)
+ struct msg *msghdr;
+{
+ while (msghdr->msg_ts > 0) {
+ short next;
+ if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
+ panic("msghdr->msg_spot out of range");
+ next = msgmaps[msghdr->msg_spot].next;
+ msgmaps[msghdr->msg_spot].next = free_msgmaps;
+ free_msgmaps = msghdr->msg_spot;
+ nfree_msgmaps++;
+ msghdr->msg_spot = next;
+ if (msghdr->msg_ts >= msginfo.msgssz)
+ msghdr->msg_ts -= msginfo.msgssz;
+ else
+ msghdr->msg_ts = 0;
+ }
+ if (msghdr->msg_spot != -1)
+ panic("msghdr->msg_spot != -1");
+ msghdr->msg_next = free_msghdrs;
+ free_msghdrs = msghdr;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds *buf;
+};
+#endif
+
+int
+msgctl(p, uap)
+ struct proc *p;
+ register struct msgctl_args *uap;
+{
+ int msqid = uap->msqid;
+ int cmd = uap->cmd;
+ struct msqid_ds *user_msqptr = uap->buf;
+ struct ucred *cred = p->p_ucred;
+ int rval, eval;
+ struct msqid_ds msqbuf;
+ register struct msqid_ds *msqptr;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such msqid\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ eval = 0;
+ rval = 0;
+
+ switch (cmd) {
+
+ case IPC_RMID:
+ {
+ struct msg *msghdr;
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M)))
+ return(eval);
+ /* Free the message headers */
+ msghdr = msqptr->msg_first;
+ while (msghdr != NULL) {
+ struct msg *msghdr_tmp;
+
+ /* Free the segments of each message */
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msghdr_tmp = msghdr;
+ msghdr = msghdr->msg_next;
+ msg_freehdr(msghdr_tmp);
+ }
+
+ if (msqptr->msg_cbytes != 0)
+ panic("msg_cbytes is screwed up");
+ if (msqptr->msg_qnum != 0)
+ panic("msg_qnum is screwed up");
+
+ msqptr->msg_qbytes = 0; /* Mark it as free */
+
+ wakeup((caddr_t)msqptr);
+ }
+
+ break;
+
+ case IPC_SET:
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M)))
+ return(eval);
+ if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0)
+ return(eval);
+ if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0)
+ return(EPERM);
+ if (msqbuf.msg_qbytes > msginfo.msgmnb) {
+#ifdef MSG_DEBUG_OK
+ printf("can't increase msg_qbytes beyond %d (truncating)\n",
+ msginfo.msgmnb);
+#endif
+ msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
+ }
+ if (msqbuf.msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("can't reduce msg_qbytes to 0\n");
+#endif
+ return(EINVAL); /* non-standard errno! */
+ }
+ msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */
+ msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */
+ msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) |
+ (msqbuf.msg_perm.mode & 0777);
+ msqptr->msg_qbytes = msqbuf.msg_qbytes;
+ msqptr->msg_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ return(eval);
+ }
+ eval = copyout((caddr_t)msqptr, user_msqptr,
+ sizeof(struct msqid_ds));
+ break;
+
+ default:
+#ifdef MSG_DEBUG_OK
+ printf("invalid command %d\n", cmd);
+#endif
+ return(EINVAL);
+ }
+
+ if (eval == 0)
+ p->p_retval[0] = rval;
+ return(eval);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgget_args {
+ key_t key;
+ int msgflg;
+};
+#endif
+
+int
+msgget(p, uap)
+ struct proc *p;
+ register struct msgget_args *uap;
+{
+ int msqid, eval;
+ int key = uap->key;
+ int msgflg = uap->msgflg;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr = NULL;
+
+#ifdef MSG_DEBUG_OK
+ printf("msgget(0x%x, 0%o)\n", key, msgflg);
+#endif
+
+ if (key != IPC_PRIVATE) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes != 0 &&
+ msqptr->msg_perm.key == key)
+ break;
+ }
+ if (msqid < msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("found public key\n");
+#endif
+ if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
+#ifdef MSG_DEBUG_OK
+ printf("not exclusive\n");
+#endif
+ return(EEXIST);
+ }
+			if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have 0%o access\n",
+ msgflg & 0700);
+#endif
+ return(eval);
+ }
+ goto found;
+ }
+ }
+
+#ifdef MSG_DEBUG_OK
+ printf("need to allocate the msqid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0 &&
+ (msqptr->msg_perm.mode & MSG_LOCKED) == 0)
+ break;
+ }
+ if (msqid == msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msqid_ds's available\n");
+#endif
+ return(ENOSPC);
+ }
+#ifdef MSG_DEBUG_OK
+ printf("msqid %d is available\n", msqid);
+#endif
+ msqptr->msg_perm.key = key;
+ msqptr->msg_perm.cuid = cred->cr_uid;
+ msqptr->msg_perm.uid = cred->cr_uid;
+ msqptr->msg_perm.cgid = cred->cr_gid;
+ msqptr->msg_perm.gid = cred->cr_gid;
+ msqptr->msg_perm.mode = (msgflg & 0777);
+ /* Make sure that the returned msqid is unique */
+ msqptr->msg_perm.seq++;
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ msqptr->msg_cbytes = 0;
+ msqptr->msg_qnum = 0;
+ msqptr->msg_qbytes = msginfo.msgmnb;
+ msqptr->msg_lspid = 0;
+ msqptr->msg_lrpid = 0;
+ msqptr->msg_stime = 0;
+ msqptr->msg_rtime = 0;
+ msqptr->msg_ctime = time_second;
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ return(ENOENT);
+ }
+
+found:
+ /* Construct the unique msqid */
+ p->p_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm);
+ return(0);
+}
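
The id handed back here packs the slot index and the slot's sequence number into one int, which is how the IPCID_TO_SEQ() comparisons elsewhere in this file detect stale identifiers after a slot is recycled. A self-contained sketch of the scheme (the exact shifts mirror the spirit of the <sys/ipc.h> macros and should be treated as an assumption):

    #include <stdio.h>

    /* Assumed layout: slot index in the low 16 bits, generation
     * (sequence) number in the high bits. */
    #define IX_TO_ID(ix, seq)   (((seq) << 16) | ((ix) & 0xffff))
    #define ID_TO_IX(id)        ((id) & 0xffff)
    #define ID_TO_SEQ(id)       (((id) >> 16) & 0xffff)

    int
    main(void)
    {
        int seq = 7;                    /* slot's current generation */
        int id = IX_TO_ID(3, seq);

        printf("id=%#x ix=%d seq=%d\n", id, ID_TO_IX(id), ID_TO_SEQ(id));

        /* Recycling the slot bumps seq, so lookups against the old
         * id now fail the sequence comparison. */
        seq++;
        printf("old id stale: %s\n",
            ID_TO_SEQ(id) == seq ? "no" : "yes");
        return (0);
    }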
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgsnd_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ int msgflg;
+};
+#endif
+
+int
+msgsnd(p, uap)
+ struct proc *p;
+ register struct msgsnd_args *uap;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ int msgflg = uap->msgflg;
+ int segs_needed, eval;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz,
+ msgflg);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have write access\n");
+#endif
+ return(eval);
+ }
+
+ segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
+#ifdef MSG_DEBUG_OK
+ printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz,
+ segs_needed);
+#endif
+ for (;;) {
+ int need_more_resources = 0;
+
+ /*
+ * check msgsz
+ * (inside this loop in case msg_qbytes changes while we sleep)
+ */
+
+ if (msgsz > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz > msqptr->msg_qbytes\n");
+#endif
+ return(EINVAL);
+ }
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid is locked\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz + msg_cbytes > msg_qbytes\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (segs_needed > nfree_msgmaps) {
+#ifdef MSG_DEBUG_OK
+ printf("segs_needed > nfree_msgmaps\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (free_msghdrs == NULL) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msghdrs\n");
+#endif
+ need_more_resources = 1;
+ }
+
+ if (need_more_resources) {
+ int we_own_it;
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("need more resources but caller doesn't want to wait\n");
+#endif
+ return(EAGAIN);
+ }
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("we don't own the msqid_ds\n");
+#endif
+ we_own_it = 0;
+ } else {
+ /* Force later arrivals to wait for our
+ request */
+#ifdef MSG_DEBUG_OK
+ printf("we own the msqid_ds\n");
+#endif
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+ we_own_it = 1;
+ }
+#ifdef MSG_DEBUG_OK
+ printf("goodnight\n");
+#endif
+ eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH,
+ "msgwait", 0);
+#ifdef MSG_DEBUG_OK
+ printf("good morning, eval=%d\n", eval);
+#endif
+ if (we_own_it)
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsnd: interrupted system call\n");
+#endif
+ return(EINTR);
+ }
+
+ /*
+			 * Make sure that the message queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code
+ yet! */
+ return(EINVAL);
+#endif
+ }
+
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("got all the resources that we need\n");
+#endif
+ break;
+ }
+ }
+
+ /*
+ * We have the resources that we need.
+ * Make sure!
+ */
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED)
+ panic("msg_perm.mode & MSG_LOCKED");
+ if (segs_needed > nfree_msgmaps)
+ panic("segs_needed > nfree_msgmaps");
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes)
+ panic("msgsz + msg_cbytes > msg_qbytes");
+ if (free_msghdrs == NULL)
+ panic("no more msghdrs");
+
+ /*
+ * Re-lock the msqid_ds in case we page-fault when copying in the
+ * message
+ */
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0)
+ panic("msqid_ds is already locked");
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+
+ /*
+ * Allocate a message header
+ */
+
+ msghdr = free_msghdrs;
+ free_msghdrs = msghdr->msg_next;
+ msghdr->msg_spot = -1;
+ msghdr->msg_ts = msgsz;
+
+ /*
+ * Allocate space for the message
+ */
+
+ while (segs_needed > 0) {
+ if (nfree_msgmaps <= 0)
+ panic("not enough msgmaps");
+ if (free_msgmaps == -1)
+ panic("nil free_msgmaps");
+ next = free_msgmaps;
+ if (next <= -1)
+ panic("next too low #1");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #1");
+#ifdef MSG_DEBUG_OK
+ printf("allocating segment %d to message\n", next);
+#endif
+ free_msgmaps = msgmaps[next].next;
+ nfree_msgmaps--;
+ msgmaps[next].next = msghdr->msg_spot;
+ msghdr->msg_spot = next;
+ segs_needed--;
+ }
+
+ /*
+ * Copy in the message type
+ */
+
+ if ((eval = copyin(user_msgp, &msghdr->msg_type,
+ sizeof(msghdr->msg_type))) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying the message type\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Validate the message type
+ */
+
+ if (msghdr->msg_type < 1) {
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+#ifdef MSG_DEBUG_OK
+		printf("mtype (%ld) < 1\n", msghdr->msg_type);
+#endif
+ return(EINVAL);
+ }
+
+ /*
+ * Copy in the message body
+ */
+
+ next = msghdr->msg_spot;
+ while (msgsz > 0) {
+ size_t tlen;
+ if (msgsz > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz;
+ if (next <= -1)
+ panic("next too low #2");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #2");
+ if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz],
+ tlen)) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying in message segment\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ msgsz -= tlen;
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+ if (next != -1)
+ panic("didn't use all the msg segments");
+
+ /*
+ * We've got the message. Unlock the msqid_ds.
+ */
+
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+
+ /*
+ * Make sure that the msqid_ds is still allocated.
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EINVAL);
+#endif
+ }
+
+ /*
+ * Put the message into the queue
+ */
+
+ if (msqptr->msg_first == NULL) {
+ msqptr->msg_first = msghdr;
+ msqptr->msg_last = msghdr;
+ } else {
+ msqptr->msg_last->msg_next = msghdr;
+ msqptr->msg_last = msghdr;
+ }
+ msqptr->msg_last->msg_next = NULL;
+
+ msqptr->msg_cbytes += msghdr->msg_ts;
+ msqptr->msg_qnum++;
+ msqptr->msg_lspid = p->p_pid;
+ msqptr->msg_stime = time_second;
+
+ wakeup((caddr_t)msqptr);
+ p->p_retval[0] = 0;
+ return(0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgrcv_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+};
+#endif
+
+int
+msgrcv(p, uap)
+ struct proc *p;
+ register struct msgrcv_args *uap;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ long msgtyp = uap->msgtyp;
+ int msgflg = uap->msgflg;
+ size_t len;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ int eval;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp,
+ msgsz, msgtyp, msgflg);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ return(eval);
+ }
+
+ msghdr = NULL;
+ while (msghdr == NULL) {
+ if (msgtyp == 0) {
+ msghdr = msqptr->msg_first;
+ if (msghdr != NULL) {
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("first message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ return(E2BIG);
+ }
+ if (msqptr->msg_first == msqptr->msg_last) {
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ } else {
+ msqptr->msg_first = msghdr->msg_next;
+ if (msqptr->msg_first == NULL)
+ panic("msg_first/last screwed up #1");
+ }
+ }
+ } else {
+ struct msg *previous;
+ struct msg **prev;
+
+ previous = NULL;
+ prev = &(msqptr->msg_first);
+ while ((msghdr = *prev) != NULL) {
+ /*
+ * Is this message's type an exact match or is
+ * this message's type less than or equal to
+ * the absolute value of a negative msgtyp?
+ * Note that the second half of this test can
+ * NEVER be true if msgtyp is positive since
+ * msg_type is always positive!
+ */
+
+ if (msgtyp == msghdr->msg_type ||
+ msghdr->msg_type <= -msgtyp) {
+#ifdef MSG_DEBUG_OK
+				printf("found message type %ld, requested %ld\n",
+ msghdr->msg_type, msgtyp);
+#endif
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("requested message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ return(E2BIG);
+ }
+ *prev = msghdr->msg_next;
+ if (msghdr == msqptr->msg_last) {
+ if (previous == NULL) {
+ if (prev !=
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #2");
+ msqptr->msg_first =
+ NULL;
+ msqptr->msg_last =
+ NULL;
+ } else {
+ if (prev ==
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #3");
+ msqptr->msg_last =
+ previous;
+ }
+ }
+ break;
+ }
+ previous = msghdr;
+ prev = &(msghdr->msg_next);
+ }
+ }
+
+ /*
+ * We've either extracted the msghdr for the appropriate
+ * message or there isn't one.
+ * If there is one then bail out of this loop.
+ */
+
+ if (msghdr != NULL)
+ break;
+
+ /*
+ * Hmph! No message found. Does the user want to wait?
+ */
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+			printf("no appropriate message found (msgtyp=%ld)\n",
+ msgtyp);
+#endif
+ /* The SVID says to return ENOMSG. */
+#ifdef ENOMSG
+ return(ENOMSG);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EAGAIN);
+#endif
+ }
+
+ /*
+ * Wait for something to happen
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: goodnight\n");
+#endif
+ eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait",
+ 0);
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: good morning (eval=%d)\n", eval);
+#endif
+
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+			printf("msgrcv: interrupted system call\n");
+#endif
+ return(EINTR);
+ }
+
+ /*
+		 * Make sure that the message queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0 ||
+ msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EINVAL);
+#endif
+ }
+ }
+
+ /*
+ * Return the message to the user.
+ *
+ * First, do the bookkeeping (before we risk being interrupted).
+ */
+
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msqptr->msg_lrpid = p->p_pid;
+ msqptr->msg_rtime = time_second;
+
+ /*
+ * Make msgsz the actual amount that we'll be returning.
+ * Note that this effectively truncates the message if it is too long
+ * (since msgsz is never increased).
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz,
+ msghdr->msg_ts);
+#endif
+ if (msgsz > msghdr->msg_ts)
+ msgsz = msghdr->msg_ts;
+
+ /*
+ * Return the type to the user.
+ */
+
+ eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp,
+ sizeof(msghdr->msg_type));
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message type\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Return the segments to the user
+ */
+
+ next = msghdr->msg_spot;
+ for (len = 0; len < msgsz; len += msginfo.msgssz) {
+ size_t tlen;
+
+		/*
+		 * Copy out at most one segment, and no more than the
+		 * bytes remaining in the caller's buffer.
+		 */
+		if (msgsz - len > msginfo.msgssz)
+			tlen = msginfo.msgssz;
+		else
+			tlen = msgsz - len;
+ if (next <= -1)
+ panic("next too low #3");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #3");
+ eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz],
+ user_msgp, tlen);
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message segment\n",
+ eval);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+
+ /*
+ * Done, return the actual number of bytes copied out.
+ */
+
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ p->p_retval[0] = msgsz;
+ return(0);
+}
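
Taken together, the three calls above give the usual user-level round trip: msgget() a queue, msgsnd() a typed message, and msgrcv() it back (msgtyp 0 takes the head of the queue, a positive msgtyp demands an exact type match, and a negative one, per the matching loop above, takes the first message whose type is <= |msgtyp|). A minimal sketch using only the standard <sys/msg.h> calls:

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/msg.h>
    #include <stdio.h>
    #include <string.h>

    struct mymsg {
        long mtype;                     /* must be >= 1, as enforced above */
        char mtext[64];
    };

    int
    main(void)
    {
        struct mymsg m;
        int qid;

        if ((qid = msgget(IPC_PRIVATE, IPC_CREAT | 0600)) == -1) {
            perror("msgget");
            return (1);
        }
        m.mtype = 2;
        strcpy(m.mtext, "hello");
        if (msgsnd(qid, &m, sizeof(m.mtext), 0) == -1)
            perror("msgsnd");
        /* msgtyp 0: take the first message on the queue. */
        if (msgrcv(qid, &m, sizeof(m.mtext), 0, IPC_NOWAIT) == -1)
            perror("msgrcv");
        else
            printf("type %ld: %s\n", m.mtype, m.mtext);
        (void)msgctl(qid, IPC_RMID, NULL);
        return (0);
    }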
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
new file mode 100644
index 0000000..fb04c42
--- /dev/null
+++ b/sys/kern/sysv_sem.c
@@ -0,0 +1,977 @@
+/* $Id: sysv_sem.c,v 1.21 1998/03/30 09:50:41 phk Exp $ */
+
+/*
+ * Implementation of SVID semaphores
+ *
+ * Author: Daniel Boulet
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sem.h>
+#include <sys/sysent.h>
+
+static void seminit __P((void *));
+SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL)
+
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args;
+int __semctl __P((struct proc *p, struct __semctl_args *uap));
+struct semget_args;
+int semget __P((struct proc *p, struct semget_args *uap));
+struct semop_args;
+int semop __P((struct proc *p, struct semop_args *uap));
+struct semconfig_args;
+int semconfig __P((struct proc *p, struct semconfig_args *uap));
+#endif
+
+static struct sem_undo *semu_alloc __P((struct proc *p));
+static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr,
+ int semid, int semnum, int adjval));
+static void semundo_clear __P((int semid, int semnum));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *semcalls[] = {
+ (sy_call_t *)__semctl, (sy_call_t *)semget,
+ (sy_call_t *)semop, (sy_call_t *)semconfig
+};
+
+static int semtot = 0;
+struct semid_ds *sema; /* semaphore id pool */
+struct sem *sem; /* semaphore pool */
+static struct sem_undo *semu_list; /* list of active undo structures */
+int *semu; /* undo structure pool */
+
+static struct proc *semlock_holder = NULL;
+
+void
+seminit(dummy)
+ void *dummy;
+{
+ register int i;
+
+ if (sema == NULL)
+ panic("sema is NULL");
+ if (semu == NULL)
+ panic("semu is NULL");
+
+ for (i = 0; i < seminfo.semmni; i++) {
+ sema[i].sem_base = 0;
+ sema[i].sem_perm.mode = 0;
+ }
+ for (i = 0; i < seminfo.semmnu; i++) {
+ register struct sem_undo *suptr = SEMU(i);
+ suptr->un_proc = NULL;
+ }
+ semu_list = NULL;
+}
+
+/*
+ * Entry point for all SEM calls
+ */
+int
+semsys(p, uap)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct semsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ } */ *uap;
+{
+
+ while (semlock_holder != NULL && semlock_holder != p)
+ (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0);
+
+ if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
+ return (EINVAL);
+ return ((*semcalls[uap->which])(p, &uap->a2));
+}
+
+/*
+ * Lock or unlock the entire semaphore facility.
+ *
+ * This will probably eventually evolve into a general purpose semaphore
+ * facility status enquiry mechanism (I don't like the "read /dev/kmem"
+ * approach currently taken by ipcs, and the amount of info that we want
+ * to be able to extract for ipcs is probably beyond the capability of
+ * the getkerninfo facility).
+ *
+ * At the time that the current version of semconfig was written, ipcs is
+ * the only user of the semconfig facility. It uses it to ensure that the
+ * semaphore facility data structures remain static while it fishes around
+ * in /dev/kmem.
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct semconfig_args {
+ semconfig_ctl_t flag;
+};
+#endif
+
+int
+semconfig(p, uap)
+ struct proc *p;
+ struct semconfig_args *uap;
+{
+ int eval = 0;
+
+ switch (uap->flag) {
+ case SEM_CONFIG_FREEZE:
+ semlock_holder = p;
+ break;
+
+ case SEM_CONFIG_THAW:
+ semlock_holder = NULL;
+ wakeup((caddr_t)&semlock_holder);
+ break;
+
+ default:
+ printf("semconfig: unknown flag parameter value (%d) - ignored\n",
+ uap->flag);
+ eval = EINVAL;
+ break;
+ }
+
+ p->p_retval[0] = 0;
+ return(eval);
+}
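
For reference, this is how an ipcs-style consumer would be expected to bracket its /dev/kmem reads with the call above. The userland prototype and the snapshot_semaphores name are assumptions; semconfig() is private to this facility:

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    int semconfig(int flag);            /* assumed userland prototype */

    void
    snapshot_semaphores(void)
    {
        (void)semconfig(SEM_CONFIG_FREEZE);     /* pin the kernel tables */
        /* ... read the semid_ds tables out of /dev/kmem here ... */
        (void)semconfig(SEM_CONFIG_THAW);       /* let other callers run */
    }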
+
+/*
+ * Allocate a new sem_undo structure for a process
+ * (returns ptr to structure or NULL if no more room)
+ */
+
+static struct sem_undo *
+semu_alloc(p)
+ struct proc *p;
+{
+ register int i;
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+ int attempt;
+
+ /*
+ * Try twice to allocate something.
+ * (we'll purge any empty structures after the first pass so
+ * two passes are always enough)
+ */
+
+ for (attempt = 0; attempt < 2; attempt++) {
+ /*
+ * Look for a free structure.
+ * Fill it in and return it if we find one.
+ */
+
+ for (i = 0; i < seminfo.semmnu; i++) {
+ suptr = SEMU(i);
+ if (suptr->un_proc == NULL) {
+ suptr->un_next = semu_list;
+ semu_list = suptr;
+ suptr->un_cnt = 0;
+ suptr->un_proc = p;
+ return(suptr);
+ }
+ }
+
+ /*
+ * We didn't find a free one, if this is the first attempt
+ * then try to free some structures.
+ */
+
+ if (attempt == 0) {
+ /* All the structures are in use - try to free some */
+ int did_something = 0;
+
+ supptr = &semu_list;
+ while ((suptr = *supptr) != NULL) {
+ if (suptr->un_cnt == 0) {
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+ did_something = 1;
+ } else
+ supptr = &(suptr->un_next);
+ }
+
+			/* If we didn't free anything then just give up */
+ if (!did_something)
+ return(NULL);
+ } else {
+ /*
+ * The second pass failed even though we freed
+ * something after the first pass!
+ * This is IMPOSSIBLE!
+ */
+ panic("semu_alloc - second attempt failed");
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Adjust a particular entry for a particular proc
+ */
+
+static int
+semundo_adjust(p, supptr, semid, semnum, adjval)
+ register struct proc *p;
+ struct sem_undo **supptr;
+ int semid, semnum;
+ int adjval;
+{
+ register struct sem_undo *suptr;
+ register struct undo *sunptr;
+ int i;
+
+ /* Look for and remember the sem_undo if the caller doesn't provide
+ it */
+
+ suptr = *supptr;
+ if (suptr == NULL) {
+ for (suptr = semu_list; suptr != NULL;
+ suptr = suptr->un_next) {
+ if (suptr->un_proc == p) {
+ *supptr = suptr;
+ break;
+ }
+ }
+ if (suptr == NULL) {
+ if (adjval == 0)
+ return(0);
+ suptr = semu_alloc(p);
+ if (suptr == NULL)
+ return(ENOSPC);
+ *supptr = suptr;
+ }
+ }
+
+ /*
+ * Look for the requested entry and adjust it (delete if adjval becomes
+ * 0).
+ */
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid || sunptr->un_num != semnum)
+ continue;
+ if (adjval == 0)
+ sunptr->un_adjval = 0;
+ else
+ sunptr->un_adjval += adjval;
+ if (sunptr->un_adjval == 0) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt)
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ }
+ return(0);
+ }
+
+ /* Didn't find the right entry - create it */
+ if (adjval == 0)
+ return(0);
+ if (suptr->un_cnt != seminfo.semume) {
+ sunptr = &suptr->un_ent[suptr->un_cnt];
+ suptr->un_cnt++;
+ sunptr->un_adjval = adjval;
+ sunptr->un_id = semid; sunptr->un_num = semnum;
+ } else
+ return(EINVAL);
+ return(0);
+}
+
+static void
+semundo_clear(semid, semnum)
+ int semid, semnum;
+{
+ register struct sem_undo *suptr;
+
+ for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) {
+ register struct undo *sunptr = &suptr->un_ent[0];
+ register int i = 0;
+
+ while (i < suptr->un_cnt) {
+ if (sunptr->un_id == semid) {
+ if (semnum == -1 || sunptr->un_num == semnum) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt) {
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ continue;
+ }
+ }
+ if (semnum != -1)
+ break;
+ }
+ i++, sunptr++;
+ }
+ }
+}
+
+/*
+ * Note that the user-mode half of this passes a union, not a pointer
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun *arg;
+};
+#endif
+
+int
+__semctl(p, uap)
+ struct proc *p;
+ register struct __semctl_args *uap;
+{
+ int semid = uap->semid;
+ int semnum = uap->semnum;
+ int cmd = uap->cmd;
+ union semun *arg = uap->arg;
+ union semun real_arg;
+ struct ucred *cred = p->p_ucred;
+ int i, rval, eval;
+ struct semid_ds sbuf;
+ register struct semid_ds *semaptr;
+
+#ifdef SEM_DEBUG
+ printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg);
+#endif
+
+ semid = IPCID_TO_IX(semid);
+	if (semid < 0 || semid >= seminfo.semmni)
+ return(EINVAL);
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid))
+ return(EINVAL);
+
+ eval = 0;
+ rval = 0;
+
+ switch (cmd) {
+ case IPC_RMID:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M)))
+ return(eval);
+ semaptr->sem_perm.cuid = cred->cr_uid;
+ semaptr->sem_perm.uid = cred->cr_uid;
+ semtot -= semaptr->sem_nsems;
+ for (i = semaptr->sem_base - sem; i < semtot; i++)
+ sem[i] = sem[i + semaptr->sem_nsems];
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].sem_perm.mode & SEM_ALLOC) &&
+ sema[i].sem_base > semaptr->sem_base)
+ sema[i].sem_base -= semaptr->sem_nsems;
+ }
+ semaptr->sem_perm.mode = 0;
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case IPC_SET:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf,
+ sizeof(sbuf))) != 0)
+ return(eval);
+ semaptr->sem_perm.uid = sbuf.sem_perm.uid;
+ semaptr->sem_perm.gid = sbuf.sem_perm.gid;
+ semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) |
+ (sbuf.sem_perm.mode & 0777);
+ semaptr->sem_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ eval = copyout((caddr_t)semaptr, real_arg.buf,
+ sizeof(struct semid_ds));
+ break;
+
+ case GETNCNT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semncnt;
+ break;
+
+ case GETPID:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].sempid;
+ break;
+
+ case GETVAL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semval;
+ break;
+
+ case GETALL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ eval = copyout((caddr_t)&semaptr->sem_base[i].semval,
+ &real_arg.array[i], sizeof(real_arg.array[0]));
+ if (eval != 0)
+ break;
+ }
+ break;
+
+ case GETZCNT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semzcnt;
+ break;
+
+ case SETVAL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ semaptr->sem_base[semnum].semval = real_arg.val;
+ semundo_clear(semid, semnum);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case SETALL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ eval = copyin(&real_arg.array[i],
+ (caddr_t)&semaptr->sem_base[i].semval,
+ sizeof(real_arg.array[0]));
+ if (eval != 0)
+ break;
+ }
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ default:
+ return(EINVAL);
+ }
+
+ if (eval == 0)
+ p->p_retval[0] = rval;
+ return(eval);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semget_args {
+ key_t key;
+ int nsems;
+ int semflg;
+};
+#endif
+
+int
+semget(p, uap)
+ struct proc *p;
+ register struct semget_args *uap;
+{
+ int semid, eval;
+ int key = uap->key;
+ int nsems = uap->nsems;
+ int semflg = uap->semflg;
+ struct ucred *cred = p->p_ucred;
+
+#ifdef SEM_DEBUG
+ printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg);
+#endif
+
+ if (key != IPC_PRIVATE) {
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) &&
+ sema[semid].sem_perm.key == key)
+ break;
+ }
+ if (semid < seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("found public key\n");
+#endif
+ if ((eval = ipcperm(cred, &sema[semid].sem_perm,
+ semflg & 0700)))
+ return(eval);
+ if (nsems > 0 && sema[semid].sem_nsems < nsems) {
+#ifdef SEM_DEBUG
+ printf("too small\n");
+#endif
+ return(EINVAL);
+ }
+ if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
+#ifdef SEM_DEBUG
+ printf("not exclusive\n");
+#endif
+ return(EEXIST);
+ }
+ goto found;
+ }
+ }
+
+#ifdef SEM_DEBUG
+ printf("need to allocate the semid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
+ if (nsems <= 0 || nsems > seminfo.semmsl) {
+#ifdef SEM_DEBUG
+ printf("nsems out of range (0<%d<=%d)\n", nsems,
+ seminfo.semmsl);
+#endif
+ return(EINVAL);
+ }
+ if (nsems > seminfo.semmns - semtot) {
+#ifdef SEM_DEBUG
+ printf("not enough semaphores left (need %d, got %d)\n",
+ nsems, seminfo.semmns - semtot);
+#endif
+ return(ENOSPC);
+ }
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0)
+ break;
+ }
+ if (semid == seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("no more semid_ds's available\n");
+#endif
+ return(ENOSPC);
+ }
+#ifdef SEM_DEBUG
+ printf("semid %d is available\n", semid);
+#endif
+ sema[semid].sem_perm.key = key;
+ sema[semid].sem_perm.cuid = cred->cr_uid;
+ sema[semid].sem_perm.uid = cred->cr_uid;
+ sema[semid].sem_perm.cgid = cred->cr_gid;
+ sema[semid].sem_perm.gid = cred->cr_gid;
+ sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
+ sema[semid].sem_perm.seq =
+ (sema[semid].sem_perm.seq + 1) & 0x7fff;
+ sema[semid].sem_nsems = nsems;
+ sema[semid].sem_otime = 0;
+ sema[semid].sem_ctime = time_second;
+ sema[semid].sem_base = &sem[semtot];
+ semtot += nsems;
+ bzero(sema[semid].sem_base,
+ sizeof(sema[semid].sem_base[0])*nsems);
+#ifdef SEM_DEBUG
+ printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base,
+ &sem[semtot]);
+#endif
+ } else {
+#ifdef SEM_DEBUG
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ return(ENOENT);
+ }
+
+found:
+ p->p_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm);
+ return(0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semop_args {
+ int semid;
+ struct sembuf *sops;
+ int nsops;
+};
+#endif
+
+int
+semop(p, uap)
+ struct proc *p;
+ register struct semop_args *uap;
+{
+ int semid = uap->semid;
+ int nsops = uap->nsops;
+ struct sembuf sops[MAX_SOPS];
+ register struct semid_ds *semaptr;
+ register struct sembuf *sopptr;
+ register struct sem *semptr;
+ struct sem_undo *suptr = NULL;
+ struct ucred *cred = p->p_ucred;
+ int i, j, eval;
+ int do_wakeup, do_undos;
+
+#ifdef SEM_DEBUG
+ printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops);
+#endif
+
+ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
+
+	if (semid < 0 || semid >= seminfo.semmni)
+ return(EINVAL);
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0)
+ return(EINVAL);
+ if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid))
+ return(EINVAL);
+
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) {
+#ifdef SEM_DEBUG
+		printf("eval = %d from ipcperm\n", eval);
+#endif
+ return(eval);
+ }
+
+ if (nsops > MAX_SOPS) {
+#ifdef SEM_DEBUG
+ printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops);
+#endif
+ return(E2BIG);
+ }
+
+ if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) {
+#ifdef SEM_DEBUG
+ printf("eval = %d from copyin(%08x, %08x, %d)\n", eval,
+ uap->sops, &sops, nsops * sizeof(sops[0]));
+#endif
+ return(eval);
+ }
+
+ /*
+ * Loop trying to satisfy the vector of requests.
+ * If we reach a point where we must wait, any requests already
+ * performed are rolled back and we go to sleep until some other
+ * process wakes us up. At this point, we start all over again.
+ *
+ * This ensures that from the perspective of other tasks, a set
+ * of requests is atomic (never partially satisfied).
+ */
+ do_undos = 0;
+
+ for (;;) {
+ do_wakeup = 0;
+
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+
+ if (sopptr->sem_num >= semaptr->sem_nsems)
+ return(EFBIG);
+
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+
+#ifdef SEM_DEBUG
+ printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n",
+ semaptr, semaptr->sem_base, semptr,
+ sopptr->sem_num, semptr->semval, sopptr->sem_op,
+ (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait");
+#endif
+
+ if (sopptr->sem_op < 0) {
+ if (semptr->semval + sopptr->sem_op < 0) {
+#ifdef SEM_DEBUG
+ printf("semop: can't do it now\n");
+#endif
+ break;
+ } else {
+ semptr->semval += sopptr->sem_op;
+ if (semptr->semval == 0 &&
+ semptr->semzcnt > 0)
+ do_wakeup = 1;
+ }
+ if (sopptr->sem_flg & SEM_UNDO)
+ do_undos = 1;
+ } else if (sopptr->sem_op == 0) {
+ if (semptr->semval > 0) {
+#ifdef SEM_DEBUG
+ printf("semop: not zero now\n");
+#endif
+ break;
+ }
+ } else {
+ if (semptr->semncnt > 0)
+ do_wakeup = 1;
+ semptr->semval += sopptr->sem_op;
+ if (sopptr->sem_flg & SEM_UNDO)
+ do_undos = 1;
+ }
+ }
+
+ /*
+ * Did we get through the entire vector?
+ */
+ if (i >= nsops)
+ goto done;
+
+ /*
+ * No ... rollback anything that we've already done
+ */
+#ifdef SEM_DEBUG
+ printf("semop: rollback 0 through %d\n", i-1);
+#endif
+ for (j = 0; j < i; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ /*
+ * If the request that we couldn't satisfy has the
+ * NOWAIT flag set then return with EAGAIN.
+ */
+ if (sopptr->sem_flg & IPC_NOWAIT)
+ return(EAGAIN);
+
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt++;
+ else
+ semptr->semncnt++;
+
+#ifdef SEM_DEBUG
+ printf("semop: good night!\n");
+#endif
+ eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH,
+ "semwait", 0);
+#ifdef SEM_DEBUG
+ printf("semop: good morning (eval=%d)!\n", eval);
+#endif
+
+ suptr = NULL; /* sem_undo may have been reallocated */
+
+ if (eval != 0)
+ return(EINTR);
+#ifdef SEM_DEBUG
+ printf("semop: good morning!\n");
+#endif
+
+ /*
+ * Make sure that the semaphore still exists
+ */
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
+ /* The man page says to return EIDRM. */
+ /* Unfortunately, BSD doesn't define that code! */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ return(EINVAL);
+#endif
+ }
+
+ /*
+ * The semaphore is still alive. Readjust the count of
+ * waiting processes.
+ */
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt--;
+ else
+ semptr->semncnt--;
+ }
+
+done:
+ /*
+ * Process any SEM_UNDO requests.
+ */
+ if (do_undos) {
+ for (i = 0; i < nsops; i++) {
+ /*
+ * We only need to deal with SEM_UNDO's for non-zero
+ * op's.
+ */
+ int adjval;
+
+ if ((sops[i].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[i].sem_op;
+ if (adjval == 0)
+ continue;
+ eval = semundo_adjust(p, &suptr, semid,
+ sops[i].sem_num, -adjval);
+ if (eval == 0)
+ continue;
+
+ /*
+ * Oh-Oh! We ran out of either sem_undo's or undo's.
+ * Rollback the adjustments to this point and then
+ * rollback the semaphore ups and down so we can return
+ * with an error with all structures restored. We
+ * rollback the undo's in the exact reverse order that
+ * we applied them. This guarantees that we won't run
+ * out of space as we roll things back out.
+ */
+ for (j = i - 1; j >= 0; j--) {
+ if ((sops[j].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[j].sem_op;
+ if (adjval == 0)
+ continue;
+ if (semundo_adjust(p, &suptr, semid,
+ sops[j].sem_num, adjval) != 0)
+ panic("semop - can't undo undos");
+ }
+
+ for (j = 0; j < nsops; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+#ifdef SEM_DEBUG
+ printf("eval = %d from semundo_adjust\n", eval);
+#endif
+ return(eval);
+ } /* loop through the sops */
+ } /* if (do_undos) */
+
+ /* We're definitely done - set the sempid's */
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+ semptr->sempid = p->p_pid;
+ }
+
+ /* Do a wakeup if any semaphore was up'd. */
+ if (do_wakeup) {
+#ifdef SEM_DEBUG
+ printf("semop: doing wakeup\n");
+#ifdef SEM_WAKEUP
+ sem_wakeup((caddr_t)semaptr);
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+ printf("semop: back from wakeup\n");
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+ }
+#ifdef SEM_DEBUG
+ printf("semop: done\n");
+#endif
+ p->p_retval[0] = 0;
+ return(0);
+}
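
At the user level, the atomicity argument above means a vector is all-or-nothing, and SEM_UNDO hands the rollback duty to semexit(). A short sketch with the standard <sys/sem.h> calls:

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct sembuf ops[2];
        int semid;

        if ((semid = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600)) == -1) {
            perror("semget");
            return (1);
        }
        /* Up both semaphores as one atomic vector; SEM_UNDO makes
         * semexit() back the increments out if we die holding them. */
        ops[0].sem_num = 0; ops[0].sem_op = 1; ops[0].sem_flg = SEM_UNDO;
        ops[1].sem_num = 1; ops[1].sem_op = 1; ops[1].sem_flg = SEM_UNDO;
        if (semop(semid, ops, 2) == -1)
            perror("semop");
        (void)semctl(semid, 0, IPC_RMID);
        return (0);
    }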
+
+/*
+ * Go through the undo structures for this process and apply the adjustments to
+ * semaphores.
+ */
+void
+semexit(p)
+ struct proc *p;
+{
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+ int did_something;
+
+ /*
+ * If somebody else is holding the global semaphore facility lock
+ * then sleep until it is released.
+ */
+ while (semlock_holder != NULL && semlock_holder != p) {
+#ifdef SEM_DEBUG
+ printf("semaphore facility locked - sleeping ...\n");
+#endif
+ (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0);
+ }
+
+ did_something = 0;
+
+ /*
+ * Go through the chain of undo vectors looking for one
+ * associated with this process.
+ */
+
+ for (supptr = &semu_list; (suptr = *supptr) != NULL;
+ supptr = &suptr->un_next) {
+ if (suptr->un_proc == p)
+ break;
+ }
+
+ if (suptr == NULL)
+ goto unlock;
+
+#ifdef SEM_DEBUG
+ printf("proc @%08x has undo structure with %d entries\n", p,
+ suptr->un_cnt);
+#endif
+
+ /*
+ * If there are any active undo elements then process them.
+ */
+ if (suptr->un_cnt > 0) {
+ int ix;
+
+ for (ix = 0; ix < suptr->un_cnt; ix++) {
+ int semid = suptr->un_ent[ix].un_id;
+ int semnum = suptr->un_ent[ix].un_num;
+ int adjval = suptr->un_ent[ix].un_adjval;
+ struct semid_ds *semaptr;
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0)
+ panic("semexit - semid not allocated");
+ if (semnum >= semaptr->sem_nsems)
+ panic("semexit - semnum out of range");
+
+#ifdef SEM_DEBUG
+ printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n",
+ suptr->un_proc, suptr->un_ent[ix].un_id,
+ suptr->un_ent[ix].un_num,
+ suptr->un_ent[ix].un_adjval,
+ semaptr->sem_base[semnum].semval);
+#endif
+
+ if (adjval < 0) {
+ if (semaptr->sem_base[semnum].semval < -adjval)
+ semaptr->sem_base[semnum].semval = 0;
+ else
+ semaptr->sem_base[semnum].semval +=
+ adjval;
+ } else
+ semaptr->sem_base[semnum].semval += adjval;
+
+#ifdef SEM_WAKEUP
+ sem_wakeup((caddr_t)semaptr);
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+#ifdef SEM_DEBUG
+ printf("semexit: back from wakeup\n");
+#endif
+ }
+ }
+
+ /*
+ * Deallocate the undo vector.
+ */
+#ifdef SEM_DEBUG
+ printf("removing vector\n");
+#endif
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+
+unlock:
+ /*
+ * If the exiting process is holding the global semaphore facility
+ * lock then release it.
+ */
+ if (semlock_holder == p) {
+ semlock_holder = NULL;
+ wakeup((caddr_t)&semlock_holder);
+ }
+}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
new file mode 100644
index 0000000..a6c2dfe
--- /dev/null
+++ b/sys/kern/sysv_shm.c
@@ -0,0 +1,617 @@
+/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */
+/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
+
+/*
+ * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Adam Glass and Charles
+ * Hannum.
+ * 4. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_compat.h"
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/shm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysent.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_inherit.h>
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args;
+extern int shmat __P((struct proc *p, struct shmat_args *uap));
+struct shmctl_args;
+extern int shmctl __P((struct proc *p, struct shmctl_args *uap));
+struct shmdt_args;
+extern int shmdt __P((struct proc *p, struct shmdt_args *uap));
+struct shmget_args;
+extern int shmget __P((struct proc *p, struct shmget_args *uap));
+#endif
+
+static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
+
+static void shminit __P((void *));
+SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL)
+
+struct oshmctl_args;
+static int oshmctl __P((struct proc *p, struct oshmctl_args *uap));
+static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode));
+static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *shmcalls[] = {
+ (sy_call_t *)shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)shmdt, (sy_call_t *)shmget,
+ (sy_call_t *)shmctl
+};
+
+#define SHMSEG_FREE 0x0200
+#define SHMSEG_REMOVED 0x0400
+#define SHMSEG_ALLOCATED 0x0800
+#define SHMSEG_WANTED 0x1000
+
+static int shm_last_free, shm_nused, shm_committed;
+struct shmid_ds *shmsegs;
+
+struct shm_handle {
+ /* vm_offset_t kva; */
+ vm_object_t shm_object;
+};
+
+struct shmmap_state {
+ vm_offset_t va;
+ int shmid;
+};
+
+static void shm_deallocate_segment __P((struct shmid_ds *));
+static int shm_find_segment_by_key __P((key_t));
+static struct shmid_ds *shm_find_segment_by_shmid __P((int));
+static int shm_delete_mapping __P((struct proc *, struct shmmap_state *));
+
+static int
+shm_find_segment_by_key(key)
+ key_t key;
+{
+ int i;
+
+ for (i = 0; i < shminfo.shmmni; i++)
+ if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) &&
+ shmsegs[i].shm_perm.key == key)
+ return i;
+ return -1;
+}
+
+static struct shmid_ds *
+shm_find_segment_by_shmid(shmid)
+ int shmid;
+{
+ int segnum;
+ struct shmid_ds *shmseg;
+
+ segnum = IPCID_TO_IX(shmid);
+ if (segnum < 0 || segnum >= shminfo.shmmni)
+ return NULL;
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED))
+ != SHMSEG_ALLOCATED ||
+ shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid))
+ return NULL;
+ return shmseg;
+}
+
+static void
+shm_deallocate_segment(shmseg)
+ struct shmid_ds *shmseg;
+{
+ struct shm_handle *shm_handle;
+ size_t size;
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_deallocate(shm_handle->shm_object);
+ free((caddr_t)shm_handle, M_SHM);
+ shmseg->shm_internal = NULL;
+ size = round_page(shmseg->shm_segsz);
+ shm_committed -= btoc(size);
+ shm_nused--;
+ shmseg->shm_perm.mode = SHMSEG_FREE;
+}
+
+static int
+shm_delete_mapping(p, shmmap_s)
+ struct proc *p;
+ struct shmmap_state *shmmap_s;
+{
+ struct shmid_ds *shmseg;
+ int segnum, result;
+ size_t size;
+
+ segnum = IPCID_TO_IX(shmmap_s->shmid);
+ shmseg = &shmsegs[segnum];
+ size = round_page(shmseg->shm_segsz);
+ result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size);
+ if (result != KERN_SUCCESS)
+ return EINVAL;
+ shmmap_s->shmid = -1;
+ shmseg->shm_dtime = time_second;
+ if ((--shmseg->shm_nattch <= 0) &&
+ (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = segnum;
+ }
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmdt_args {
+ void *shmaddr;
+};
+#endif
+
+int
+shmdt(p, uap)
+ struct proc *p;
+ struct shmdt_args *uap;
+{
+ struct shmmap_state *shmmap_s;
+ int i;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL)
+ return EINVAL;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1 &&
+ shmmap_s->va == (vm_offset_t)uap->shmaddr)
+ break;
+ if (i == shminfo.shmseg)
+ return EINVAL;
+ return shm_delete_mapping(p, shmmap_s);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args {
+ int shmid;
+ void *shmaddr;
+ int shmflg;
+};
+#endif
+
+int
+shmat(p, uap)
+ struct proc *p;
+ struct shmat_args *uap;
+{
+ int error, i, flags;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct shmmap_state *shmmap_s = NULL;
+ struct shm_handle *shm_handle;
+ vm_offset_t attach_va;
+ vm_prot_t prot;
+ vm_size_t size;
+ int rv;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ for (i = 0; i < shminfo.shmseg; i++)
+ shmmap_s[i].shmid = -1;
+ p->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+ }
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ error = ipcperm(cred, &shmseg->shm_perm,
+ (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
+ if (error)
+ return error;
+ for (i = 0; i < shminfo.shmseg; i++) {
+ if (shmmap_s->shmid == -1)
+ break;
+ shmmap_s++;
+ }
+ if (i >= shminfo.shmseg)
+ return EMFILE;
+ size = round_page(shmseg->shm_segsz);
+ prot = VM_PROT_READ;
+ if ((uap->shmflg & SHM_RDONLY) == 0)
+ prot |= VM_PROT_WRITE;
+ flags = MAP_ANON | MAP_SHARED;
+ if (uap->shmaddr) {
+ flags |= MAP_FIXED;
+ if (uap->shmflg & SHM_RND)
+ attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1);
+ else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0)
+ attach_va = (vm_offset_t)uap->shmaddr;
+ else
+ return EINVAL;
+ } else {
+ /* This is just a hint to vm_map_find() about where to put it. */
+ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ);
+ }
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_reference(shm_handle->shm_object);
+ rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object,
+ 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0);
+ if (rv != KERN_SUCCESS) {
+ return ENOMEM;
+ }
+ vm_map_inherit(&p->p_vmspace->vm_map,
+ attach_va, attach_va + size, VM_INHERIT_SHARE);
+
+ shmmap_s->va = attach_va;
+ shmmap_s->shmid = uap->shmid;
+ shmseg->shm_lpid = p->p_pid;
+ shmseg->shm_atime = time_second;
+ shmseg->shm_nattch++;
+ p->p_retval[0] = attach_va;
+ return 0;
+}
+
+struct oshmid_ds {
+ struct ipc_perm shm_perm; /* operation perms */
+ int shm_segsz; /* size of segment (bytes) */
+ ushort shm_cpid; /* pid, creator */
+ ushort shm_lpid; /* pid, last operation */
+ short shm_nattch; /* no. of current attaches */
+ time_t shm_atime; /* last attach time */
+ time_t shm_dtime; /* last detach time */
+ time_t shm_ctime; /* last change time */
+ void *shm_handle; /* internal handle for shm segment */
+};
+
+struct oshmctl_args {
+ int shmid;
+ int cmd;
+ struct oshmid_ds *ubuf;
+};
+
+static int
+oshmctl(p, uap)
+ struct proc *p;
+ struct oshmctl_args *uap;
+{
+#ifdef COMPAT_43
+ int error;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct oshmid_ds outbuf;
+
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_R);
+ if (error)
+ return error;
+ outbuf.shm_perm = shmseg->shm_perm;
+ outbuf.shm_segsz = shmseg->shm_segsz;
+ outbuf.shm_cpid = shmseg->shm_cpid;
+ outbuf.shm_lpid = shmseg->shm_lpid;
+ outbuf.shm_nattch = shmseg->shm_nattch;
+ outbuf.shm_atime = shmseg->shm_atime;
+ outbuf.shm_dtime = shmseg->shm_dtime;
+ outbuf.shm_ctime = shmseg->shm_ctime;
+ outbuf.shm_handle = shmseg->shm_internal;
+ error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf));
+ if (error)
+ return error;
+ break;
+ default:
+ /* XXX casting to (sy_call_t *) is bogus, as usual. */
+ return ((sy_call_t *)shmctl)(p, uap);
+ }
+ return 0;
+#else
+ return EINVAL;
+#endif
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds *buf;
+};
+#endif
+
+int
+shmctl(p, uap)
+ struct proc *p;
+ struct shmctl_args *uap;
+{
+ int error;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds inbuf;
+ struct shmid_ds *shmseg;
+
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_R);
+ if (error)
+ return error;
+ error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf));
+ if (error)
+ return error;
+ break;
+ case IPC_SET:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_M);
+ if (error)
+ return error;
+ error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf));
+ if (error)
+ return error;
+ shmseg->shm_perm.uid = inbuf.shm_perm.uid;
+ shmseg->shm_perm.gid = inbuf.shm_perm.gid;
+ shmseg->shm_perm.mode =
+ (shmseg->shm_perm.mode & ~ACCESSPERMS) |
+ (inbuf.shm_perm.mode & ACCESSPERMS);
+ shmseg->shm_ctime = time_second;
+ break;
+ case IPC_RMID:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_M);
+ if (error)
+ return error;
+ shmseg->shm_perm.key = IPC_PRIVATE;
+ shmseg->shm_perm.mode |= SHMSEG_REMOVED;
+ if (shmseg->shm_nattch <= 0) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = IPCID_TO_IX(uap->shmid);
+ }
+ break;
+#if 0
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+#endif
+ default:
+ return EINVAL;
+ }
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmget_args {
+ key_t key;
+ size_t size;
+ int shmflg;
+};
+#endif
+
+static int
+shmget_existing(p, uap, mode, segnum)
+ struct proc *p;
+ struct shmget_args *uap;
+ int mode;
+ int segnum;
+{
+ struct shmid_ds *shmseg;
+ struct ucred *cred = p->p_ucred;
+ int error;
+
+ shmseg = &shmsegs[segnum];
+ if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
+ /*
+ * This segment is in the process of being allocated. Wait
+ * until it's done, and look the key up again (in case the
+ * allocation failed or it was freed).
+ */
+ shmseg->shm_perm.mode |= SHMSEG_WANTED;
+ error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0);
+ if (error)
+ return error;
+ return EAGAIN;
+ }
+ error = ipcperm(cred, &shmseg->shm_perm, mode);
+ if (error)
+ return error;
+ if (uap->size && uap->size > shmseg->shm_segsz)
+ return EINVAL;
+ if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
+ return EEXIST;
+ p->p_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+ return 0;
+}
+
+static int
+shmget_allocate_segment(p, uap, mode)
+ struct proc *p;
+ struct shmget_args *uap;
+ int mode;
+{
+ int i, segnum, shmid, size;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct shm_handle *shm_handle;
+
+ if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
+ return EINVAL;
+ if (shm_nused >= shminfo.shmmni) /* any shmids left? */
+ return ENOSPC;
+ size = round_page(uap->size);
+ if (shm_committed + btoc(size) > shminfo.shmall)
+ return ENOMEM;
+ if (shm_last_free < 0) {
+ for (i = 0; i < shminfo.shmmni; i++)
+ if (shmsegs[i].shm_perm.mode & SHMSEG_FREE)
+ break;
+ if (i == shminfo.shmmni)
+ panic("shmseg free count inconsistent");
+ segnum = i;
+ } else {
+ segnum = shm_last_free;
+ shm_last_free = -1;
+ }
+ shmseg = &shmsegs[segnum];
+ /*
+ * In case we sleep in malloc(), mark the segment present but deleted
+	 * so that no one else tries to create the same key.
+ */
+ shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
+ shmseg->shm_perm.key = uap->key;
+ shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff;
+ shm_handle = (struct shm_handle *)
+ malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK);
+ shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+
+ /*
+ * We make sure that we have allocated a pager before we need
+ * to.
+ */
+ shm_handle->shm_object =
+ vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0);
+ vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT);
+
+ shmseg->shm_internal = shm_handle;
+ shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid;
+ shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid;
+ shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
+ (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
+ shmseg->shm_segsz = uap->size;
+ shmseg->shm_cpid = p->p_pid;
+ shmseg->shm_lpid = shmseg->shm_nattch = 0;
+ shmseg->shm_atime = shmseg->shm_dtime = 0;
+ shmseg->shm_ctime = time_second;
+ shm_committed += btoc(size);
+ shm_nused++;
+ if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
+ /*
+ * Somebody else wanted this key while we were asleep. Wake
+ * them up now.
+ */
+ shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
+ wakeup((caddr_t)shmseg);
+ }
+ p->p_retval[0] = shmid;
+ return 0;
+}
+
+int
+shmget(p, uap)
+ struct proc *p;
+ struct shmget_args *uap;
+{
+ int segnum, mode, error;
+
+ mode = uap->shmflg & ACCESSPERMS;
+ if (uap->key != IPC_PRIVATE) {
+ again:
+ segnum = shm_find_segment_by_key(uap->key);
+ if (segnum >= 0) {
+ error = shmget_existing(p, uap, mode, segnum);
+ if (error == EAGAIN)
+ goto again;
+ return error;
+ }
+ if ((uap->shmflg & IPC_CREAT) == 0)
+ return ENOENT;
+ }
+ return shmget_allocate_segment(p, uap, mode);
+}
+
+int
+shmsys(p, uap)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct shmsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ } */ *uap;
+{
+
+ if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
+ return EINVAL;
+ return ((*shmcalls[uap->which])(p, &uap->a2));
+}
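
The attach/detach pair above has the familiar user-level shape; a minimal sketch using only the standard <sys/shm.h> calls:

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char *p;
        int shmid;

        if ((shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600)) == -1) {
            perror("shmget");
            return (1);
        }
        /* A NULL address lets shmat() pick a spot past the data segment,
         * exactly the vm_map_find() hint case above. */
        if ((p = shmat(shmid, NULL, 0)) == (char *)-1) {
            perror("shmat");
            return (1);
        }
        strcpy(p, "shared");
        printf("wrote \"%s\" at %p\n", p, (void *)p);
        (void)shmdt(p);
        (void)shmctl(shmid, IPC_RMID, NULL);
        return (0);
    }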
+
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+ struct shmmap_state *shmmap_s;
+ size_t size;
+ int i;
+
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size);
+ p2->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++;
+}
+
+void
+shmexit(p)
+ struct proc *p;
+{
+ struct shmmap_state *shmmap_s;
+ int i;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shm_delete_mapping(p, shmmap_s);
+ free((caddr_t)p->p_vmspace->vm_shm, M_SHM);
+ p->p_vmspace->vm_shm = NULL;
+}
+
+void
+shminit(dummy)
+ void *dummy;
+{
+ int i;
+ for (i = 0; i < shminfo.shmmni; i++) {
+ shmsegs[i].shm_perm.mode = SHMSEG_FREE;
+ shmsegs[i].shm_perm.seq = 0;
+ }
+ shm_last_free = 0;
+ shm_nused = 0;
+ shm_committed = 0;
+}
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
new file mode 100644
index 0000000..1adf784
--- /dev/null
+++ b/sys/kern/tty.c
@@ -0,0 +1,2437 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty.c 8.8 (Berkeley) 1/21/94
+ * $Id: tty.c,v 1.110 1998/12/08 10:22:07 bde Exp $
+ */
+
+/*-
+ * TODO:
+ * o Fix races for sending the start char in ttyflush().
+ * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect().
+ *    With luck, there will be MIN chars before select() returns.
+ * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it.
+ * o Don't allow input in TS_ZOMBIE case. It would be visible through
+ * FIONREAD.
+ * o Do the new sio locking stuff here and use it to avoid special
+ * case for EXTPROC?
+ * o Lock PENDIN too?
+ * o Move EXTPROC and/or PENDIN to t_state?
+ * o Wrap most of ttioctl in spltty/splx.
+ * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>.
+ * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set.
+ * o Don't allow certain termios flags to affect disciplines other
+ *    than TTYDISC.  Cancel their effects before switching disciplines
+ * and ignore them if they are set while we are in another
+ * discipline.
+ * o Now that historical speed conversions are handled here, don't
+ * do them in drivers.
+ * o Check for TS_CARR_ON being set while everything is closed and not
+ * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open,
+ * so it would live until the next open even if carrier drops.
+ * o Restore TS_WOPEN since it is useful in pstat. It must be cleared
+ * only when _all_ openers leave open().
+ */
+
+#include "snp.h"
+#include "opt_compat.h"
+#include "opt_uconsole.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filio.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/proc.h>
+#define TTYDEFCHARS
+#include <sys/tty.h>
+#undef TTYDEFCHARS
+#include <sys/fcntl.h>
+#include <sys/conf.h>
+#include <sys/dkstat.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#if NSNP > 0
+#include <sys/snoop.h>
+#endif
+
+#include <vm/vm.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures");
+
+static int proc_compare __P((struct proc *p1, struct proc *p2));
+static int ttnread __P((struct tty *tp));
+static void ttyecho __P((int c, struct tty *tp));
+static int ttyoutput __P((int c, register struct tty *tp));
+static void ttypend __P((struct tty *tp));
+static void ttyretype __P((struct tty *tp));
+static void ttyrub __P((int c, struct tty *tp));
+static void ttyrubo __P((struct tty *tp, int cnt));
+static void ttyunblock __P((struct tty *tp));
+static int ttywflush __P((struct tty *tp));
+
+/*
+ * Table with character classes and parity. The 8th bit indicates parity,
+ * the 7th bit indicates the character is an alphameric or underscore (for
+ * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits
+ * are 0 then the character needs no special processing on output; classes
+ * other than 0 might be translated or (not currently) require delays.
+ */
+#define E 0x00 /* Even parity. */
+#define O 0x80 /* Odd parity. */
+#define PARITY(c) (char_type[c] & O)
+
+#define ALPHA 0x40 /* Alpha or underscore. */
+#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA)
+
+#define CCLASSMASK 0x3f
+#define CCLASS(c) (char_type[c] & CCLASSMASK)
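+
+/*
+ * Example lookups in the table below: char_type['A'] is E|NA, so
+ * PARITY('A') == 0 (even) and ISALPHA('A') is nonzero, while
+ * char_type['\t'] is E|TB, so CCLASS('\t') == TAB.
+ */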
+
+#define BS BACKSPACE
+#define CC CONTROL
+#define CR RETURN
+#define NA ORDINARY | ALPHA
+#define NL NEWLINE
+#define NO ORDINARY
+#define TB TAB
+#define VT VTAB
+
+static u_char const char_type[] = {
+ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
+ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
+ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
+ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */
+ O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */
+ E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */
+ O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */
+ O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */
+ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */
+ O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */
+ E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */
+ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */
+ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */
+ E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */
+ /*
+ * Meta chars; should be settable per character set;
+ * for now, treat them all as normal characters.
+ */
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+ NA, NA, NA, NA, NA, NA, NA, NA,
+};
+#undef BS
+#undef CC
+#undef CR
+#undef NA
+#undef NL
+#undef NO
+#undef TB
+#undef VT
+
+/* Macros to clear/set/test flags. */
+#define SET(t, f) (t) |= (f)
+#define CLR(t, f) (t) &= ~(f)
+#define ISSET(t, f) ((t) & (f))
+
+#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */
+#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */
+
+/*
+ * Initial open of tty, or (re)entry to standard tty line discipline.
+ */
+int
+ttyopen(device, tp)
+ dev_t device;
+ register struct tty *tp;
+{
+ int s;
+
+ s = spltty();
+ tp->t_dev = device;
+ if (!ISSET(tp->t_state, TS_ISOPEN)) {
+ SET(tp->t_state, TS_ISOPEN);
+ if (ISSET(tp->t_cflag, CLOCAL))
+ SET(tp->t_state, TS_CONNECTED);
+ bzero(&tp->t_winsize, sizeof(tp->t_winsize));
+ }
+ ttsetwater(tp);
+ splx(s);
+ return (0);
+}
+
+/*
+ * Handle close() on a tty line: flush and set to initial state,
+ * bumping generation number so that pending read/write calls
+ * can detect recycling of the tty.
+ * XXX our caller should have done `spltty(); l_close(); ttyclose();'
+ * and l_close() should have flushed, but we repeat the spltty() and
+ * the flush in case there are buggy callers.
+ */
+int
+ttyclose(tp)
+ register struct tty *tp;
+{
+ int s;
+
+ funsetown(tp->t_sigio);
+ s = spltty();
+ if (constty == tp)
+ constty = NULL;
+
+ ttyflush(tp, FREAD | FWRITE);
+ clist_free_cblocks(&tp->t_canq);
+ clist_free_cblocks(&tp->t_outq);
+ clist_free_cblocks(&tp->t_rawq);
+
+#if NSNP > 0
+ if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpdown((struct snoop *)tp->t_sc);
+#endif
+
+ tp->t_gen++;
+ tp->t_line = TTYDISC;
+ tp->t_pgrp = NULL;
+ tp->t_session = NULL;
+ tp->t_state = 0;
+ splx(s);
+ return (0);
+}
+
+#define FLUSHQ(q) { \
+ if ((q)->c_cc) \
+ ndflush(q, (q)->c_cc); \
+}
+
+/* Is 'c' a line delimiter ("break" character)? */
+#define TTBREAKC(c, lflag) \
+ ((c) == '\n' || (((c) == cc[VEOF] || \
+ (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \
+ (c) != _POSIX_VDISABLE))
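+
+/*
+ * E.g. with the default control chars, '\n' and the VEOF char (^D) end a
+ * line; a cc[] slot holding _POSIX_VDISABLE never matches, and VEOL2 is
+ * honored only when IEXTEN is set.
+ */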
+
+/*
+ * Process input of a single character received on a tty.
+ */
+int
+ttyinput(c, tp)
+ register int c;
+ register struct tty *tp;
+{
+ register tcflag_t iflag, lflag;
+ register cc_t *cc;
+ int i, err;
+
+ /*
+ * If input is pending take it first.
+ */
+ lflag = tp->t_lflag;
+ if (ISSET(lflag, PENDIN))
+ ttypend(tp);
+ /*
+ * Gather stats.
+ */
+ if (ISSET(lflag, ICANON)) {
+ ++tk_cancc;
+ ++tp->t_cancc;
+ } else {
+ ++tk_rawcc;
+ ++tp->t_rawcc;
+ }
+ ++tk_nin;
+
+ /*
+ * Block further input iff:
+ * current input > threshold AND input is available to user program
+ * AND input flow control is enabled and not yet invoked.
+ * The 3 is slop for PARMRK.
+ */
+ iflag = tp->t_iflag;
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 &&
+ (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) &&
+ (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) &&
+ !ISSET(tp->t_state, TS_TBLOCK))
+ ttyblock(tp);
+
+ /* Handle exceptional conditions (break, parity, framing). */
+ cc = tp->t_cc;
+ err = (ISSET(c, TTY_ERRORMASK));
+ if (err) {
+ CLR(c, TTY_ERRORMASK);
+ if (ISSET(err, TTY_BI)) {
+ if (ISSET(iflag, IGNBRK))
+ return (0);
+ if (ISSET(iflag, BRKINT)) {
+ ttyflush(tp, FREAD | FWRITE);
+ pgsignal(tp->t_pgrp, SIGINT, 1);
+ goto endcase;
+ }
+ if (ISSET(iflag, PARMRK))
+ goto parmrk;
+ } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK))
+ || ISSET(err, TTY_FE)) {
+ if (ISSET(iflag, IGNPAR))
+ return (0);
+ else if (ISSET(iflag, PARMRK)) {
+parmrk:
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >
+ MAX_INPUT - 3)
+ goto input_overflow;
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+ (void)putc(0 | TTY_QUOTE, &tp->t_rawq);
+ (void)putc(c | TTY_QUOTE, &tp->t_rawq);
+ goto endcase;
+ } else
+ c = 0;
+ }
+ }
+
+ if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
+ CLR(c, 0x80);
+ if (!ISSET(lflag, EXTPROC)) {
+ /*
+ * Check for literal nexting very first
+ */
+ if (ISSET(tp->t_state, TS_LNCH)) {
+ SET(c, TTY_QUOTE);
+ CLR(tp->t_state, TS_LNCH);
+ }
+ /*
+ * Scan for special characters. This code
+ * is really just a big case statement with
+ * non-constant cases. The bottom of the
+ * case statement is labeled ``endcase'', so goto
+ * it after a case match, or similar.
+ */
+
+ /*
+ * Control chars which aren't controlled
+ * by ICANON, ISIG, or IXON.
+ */
+ if (ISSET(lflag, IEXTEN)) {
+ if (CCEQ(cc[VLNEXT], c)) {
+ if (ISSET(lflag, ECHO)) {
+ if (ISSET(lflag, ECHOE)) {
+ (void)ttyoutput('^', tp);
+ (void)ttyoutput('\b', tp);
+ } else
+ ttyecho(c, tp);
+ }
+ SET(tp->t_state, TS_LNCH);
+ goto endcase;
+ }
+ if (CCEQ(cc[VDISCARD], c)) {
+ if (ISSET(lflag, FLUSHO))
+ CLR(tp->t_lflag, FLUSHO);
+ else {
+ ttyflush(tp, FWRITE);
+ ttyecho(c, tp);
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc)
+ ttyretype(tp);
+ SET(tp->t_lflag, FLUSHO);
+ }
+ goto startoutput;
+ }
+ }
+ /*
+ * Signals.
+ */
+ if (ISSET(lflag, ISIG)) {
+ if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) {
+ if (!ISSET(lflag, NOFLSH))
+ ttyflush(tp, FREAD | FWRITE);
+ ttyecho(c, tp);
+ pgsignal(tp->t_pgrp,
+ CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1);
+ goto endcase;
+ }
+ if (CCEQ(cc[VSUSP], c)) {
+ if (!ISSET(lflag, NOFLSH))
+ ttyflush(tp, FREAD);
+ ttyecho(c, tp);
+ pgsignal(tp->t_pgrp, SIGTSTP, 1);
+ goto endcase;
+ }
+ }
+ /*
+ * Handle start/stop characters.
+ */
+ if (ISSET(iflag, IXON)) {
+ if (CCEQ(cc[VSTOP], c)) {
+ if (!ISSET(tp->t_state, TS_TTSTOP)) {
+ SET(tp->t_state, TS_TTSTOP);
+#ifdef sun4c /* XXX */
+ (*tp->t_stop)(tp, 0);
+#else
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp,
+ 0);
+#endif
+ return (0);
+ }
+ if (!CCEQ(cc[VSTART], c))
+ return (0);
+ /*
+ * if VSTART == VSTOP then toggle
+ */
+ goto endcase;
+ }
+ if (CCEQ(cc[VSTART], c))
+ goto restartoutput;
+ }
+ /*
+ * IGNCR, ICRNL, & INLCR
+ */
+ if (c == '\r') {
+ if (ISSET(iflag, IGNCR))
+ return (0);
+ else if (ISSET(iflag, ICRNL))
+ c = '\n';
+ } else if (c == '\n' && ISSET(iflag, INLCR))
+ c = '\r';
+ }
+ if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) {
+ /*
+ * From here on down canonical mode character
+ * processing takes place.
+ */
+ /*
+ * erase (^H / ^?)
+ */
+ if (CCEQ(cc[VERASE], c)) {
+ if (tp->t_rawq.c_cc)
+ ttyrub(unputc(&tp->t_rawq), tp);
+ goto endcase;
+ }
+ /*
+ * kill (^U)
+ */
+ if (CCEQ(cc[VKILL], c)) {
+ if (ISSET(lflag, ECHOKE) &&
+ tp->t_rawq.c_cc == tp->t_rocount &&
+ !ISSET(lflag, ECHOPRT))
+ while (tp->t_rawq.c_cc)
+ ttyrub(unputc(&tp->t_rawq), tp);
+ else {
+ ttyecho(c, tp);
+ if (ISSET(lflag, ECHOK) ||
+ ISSET(lflag, ECHOKE))
+ ttyecho('\n', tp);
+ FLUSHQ(&tp->t_rawq);
+ tp->t_rocount = 0;
+ }
+ CLR(tp->t_state, TS_LOCAL);
+ goto endcase;
+ }
+ /*
+ * word erase (^W)
+ */
+ if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) {
+ int ctype;
+
+ /*
+ * erase whitespace
+ */
+ while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t')
+ ttyrub(c, tp);
+ if (c == -1)
+ goto endcase;
+ /*
+ * erase last char of word and remember the
+ * next chars type (for ALTWERASE)
+ */
+ ttyrub(c, tp);
+ c = unputc(&tp->t_rawq);
+ if (c == -1)
+ goto endcase;
+ if (c == ' ' || c == '\t') {
+ (void)putc(c, &tp->t_rawq);
+ goto endcase;
+ }
+ ctype = ISALPHA(c);
+ /*
+ * erase rest of word
+ */
+ do {
+ ttyrub(c, tp);
+ c = unputc(&tp->t_rawq);
+ if (c == -1)
+ goto endcase;
+ } while (c != ' ' && c != '\t' &&
+ (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype));
+ (void)putc(c, &tp->t_rawq);
+ goto endcase;
+ }
+ /*
+ * reprint line (^R)
+ */
+ if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) {
+ ttyretype(tp);
+ goto endcase;
+ }
+ /*
+ * ^T - kernel info and generate SIGINFO
+ */
+ if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) {
+ if (ISSET(lflag, ISIG))
+ pgsignal(tp->t_pgrp, SIGINFO, 1);
+ if (!ISSET(lflag, NOKERNINFO))
+ ttyinfo(tp);
+ goto endcase;
+ }
+ }
+ /*
+ * Check for input buffer overflow
+ */
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) {
+input_overflow:
+ if (ISSET(iflag, IMAXBEL)) {
+ if (tp->t_outq.c_cc < tp->t_ohiwat)
+ (void)ttyoutput(CTRL('g'), tp);
+ }
+ goto endcase;
+ }
+
+	if (c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP)
+ && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR))
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+
+ /*
+ * Put data char in q for user and
+ * wakeup on seeing a line delimiter.
+ */
+ if (putc(c, &tp->t_rawq) >= 0) {
+ if (!ISSET(lflag, ICANON)) {
+ ttwakeup(tp);
+ ttyecho(c, tp);
+ goto endcase;
+ }
+ if (TTBREAKC(c, lflag)) {
+ tp->t_rocount = 0;
+ catq(&tp->t_rawq, &tp->t_canq);
+ ttwakeup(tp);
+ } else if (tp->t_rocount++ == 0)
+ tp->t_rocol = tp->t_column;
+ if (ISSET(tp->t_state, TS_ERASE)) {
+ /*
+ * end of prterase \.../
+ */
+ CLR(tp->t_state, TS_ERASE);
+ (void)ttyoutput('/', tp);
+ }
+ i = tp->t_column;
+ ttyecho(c, tp);
+ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
+ /*
+ * Place the cursor over the '^' of the ^D.
+ */
+ i = imin(2, tp->t_column - i);
+ while (i > 0) {
+ (void)ttyoutput('\b', tp);
+ i--;
+ }
+ }
+ }
+endcase:
+ /*
+ * IXANY means allow any character to restart output.
+ */
+ if (ISSET(tp->t_state, TS_TTSTOP) &&
+ !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP])
+ return (0);
+restartoutput:
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_TTSTOP);
+startoutput:
+ return (ttstart(tp));
+}
+
+/*
+ * Output a single character on a tty, doing output processing
+ * as needed (expanding tabs, newline processing, etc.).
+ * Returns < 0 if it succeeds, otherwise returns the char to resend.
+ * Must be safe to call recursively.
+ */
+static int
+ttyoutput(c, tp)
+ register int c;
+ register struct tty *tp;
+{
+ register tcflag_t oflag;
+ register int col, s;
+
+ oflag = tp->t_oflag;
+ if (!ISSET(oflag, OPOST)) {
+ if (ISSET(tp->t_lflag, FLUSHO))
+ return (-1);
+ if (putc(c, &tp->t_outq))
+ return (c);
+ tk_nout++;
+ tp->t_outcc++;
+ return (-1);
+ }
+ /*
+	 * Do tab expansion if OXTABS is set.  Special case: if we are doing
+	 * external processing, we don't do the tab expansion because we'll
+	 * probably get it wrong.  If tab expansion needs to be done, let it
+	 * happen externally.
+ */
+ CLR(c, ~TTY_CHARMASK);
+ if (c == '\t' &&
+ ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
+ c = 8 - (tp->t_column & 7);
+ if (!ISSET(tp->t_lflag, FLUSHO)) {
+ s = spltty(); /* Don't interrupt tabs. */
+ c -= b_to_q(" ", c, &tp->t_outq);
+ tk_nout += c;
+ tp->t_outcc += c;
+ splx(s);
+ }
+ tp->t_column += c;
+ return (c ? -1 : '\t');
+ }
+ if (c == CEOT && ISSET(oflag, ONOEOT))
+ return (-1);
+
+ /*
+ * Newline translation: if ONLCR is set,
+ * translate newline into "\r\n".
+ */
+ if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
+ tk_nout++;
+ tp->t_outcc++;
+ if (putc('\r', &tp->t_outq))
+ return (c);
+ }
+ tk_nout++;
+ tp->t_outcc++;
+ if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
+ return (c);
+
+ col = tp->t_column;
+ switch (CCLASS(c)) {
+ case BACKSPACE:
+ if (col > 0)
+ --col;
+ break;
+ case CONTROL:
+ break;
+ case NEWLINE:
+ case RETURN:
+ col = 0;
+ break;
+ case ORDINARY:
+ ++col;
+ break;
+ case TAB:
+ col = (col + 8) & ~7;
+ break;
+ }
+ tp->t_column = col;
+ return (-1);
+}
+
+/*
+ * Ioctls for all tty devices. Called after line-discipline specific ioctl
+ * has been called to do discipline-specific functions and/or reject any
+ * of these ioctl commands.
+ */
+/* ARGSUSED */
+int
+ttioctl(tp, cmd, data, flag)
+ register struct tty *tp;
+ u_long cmd;
+ int flag;
+ void *data;
+{
+ register struct proc *p;
+ int s, error;
+
+ p = curproc; /* XXX */
+
+ /* If the ioctl involves modification, hang if in the background. */
+ switch (cmd) {
+ case TIOCCBRK:
+ case TIOCCONS:
+ case TIOCDRAIN:
+ case TIOCEXCL:
+ case TIOCFLUSH:
+#ifdef TIOCHPCL
+ case TIOCHPCL:
+#endif
+ case TIOCNXCL:
+ case TIOCSBRK:
+ case TIOCSCTTY:
+ case TIOCSDRAINWAIT:
+ case TIOCSETA:
+ case TIOCSETAF:
+ case TIOCSETAW:
+ case TIOCSETD:
+ case TIOCSPGRP:
+ case TIOCSTART:
+ case TIOCSTAT:
+ case TIOCSTI:
+ case TIOCSTOP:
+ case TIOCSWINSZ:
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ case TIOCLBIC:
+ case TIOCLBIS:
+ case TIOCLSET:
+ case TIOCSETC:
+ case OTIOCSETD:
+ case TIOCSETN:
+ case TIOCSETP:
+ case TIOCSLTC:
+#endif
+ while (isbackground(p, tp) &&
+ (p->p_flag & P_PPWAIT) == 0 &&
+ (p->p_sigignore & sigmask(SIGTTOU)) == 0 &&
+ (p->p_sigmask & sigmask(SIGTTOU)) == 0) {
+ if (p->p_pgrp->pg_jobc == 0)
+ return (EIO);
+ pgsignal(p->p_pgrp, SIGTTOU, 1);
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1",
+ 0);
+ if (error)
+ return (error);
+ }
+ break;
+ }
+
+ switch (cmd) { /* Process the ioctl. */
+ case FIOASYNC: /* set/clear async i/o */
+ s = spltty();
+ if (*(int *)data)
+ SET(tp->t_state, TS_ASYNC);
+ else
+ CLR(tp->t_state, TS_ASYNC);
+ splx(s);
+ break;
+ case FIONBIO: /* set/clear non-blocking i/o */
+ break; /* XXX: delete. */
+ case FIONREAD: /* get # bytes to read */
+ s = spltty();
+ *(int *)data = ttnread(tp);
+ splx(s);
+ break;
+
+ case FIOSETOWN:
+ /*
+ * Policy -- Don't allow FIOSETOWN on someone else's
+ * controlling tty
+ */
+ if (tp->t_session != NULL && !isctty(p, tp))
+ return (ENOTTY);
+
+ error = fsetown(*(int *)data, &tp->t_sigio);
+ if (error)
+ return (error);
+ break;
+ case FIOGETOWN:
+ if (tp->t_session != NULL && !isctty(p, tp))
+ return (ENOTTY);
+ *(int *)data = fgetown(tp->t_sigio);
+ break;
+
+ case TIOCEXCL: /* set exclusive use of tty */
+ s = spltty();
+ SET(tp->t_state, TS_XCLUDE);
+ splx(s);
+ break;
+ case TIOCFLUSH: { /* flush buffers */
+ register int flags = *(int *)data;
+
+ if (flags == 0)
+ flags = FREAD | FWRITE;
+ else
+ flags &= FREAD | FWRITE;
+ ttyflush(tp, flags);
+ break;
+ }
+ case TIOCCONS: /* become virtual console */
+ if (*(int *)data) {
+ if (constty && constty != tp &&
+ ISSET(constty->t_state, TS_CONNECTED))
+ return (EBUSY);
+#ifndef UCONSOLE
+ if (error = suser(p->p_ucred, &p->p_acflag))
+ return (error);
+#endif
+ constty = tp;
+ } else if (tp == constty)
+ constty = NULL;
+ break;
+ case TIOCDRAIN: /* wait till output drained */
+ error = ttywait(tp);
+ if (error)
+ return (error);
+ break;
+ case TIOCGETA: { /* get termios struct */
+ struct termios *t = (struct termios *)data;
+
+ bcopy(&tp->t_termios, t, sizeof(struct termios));
+ break;
+ }
+ case TIOCGETD: /* get line discipline */
+ *(int *)data = tp->t_line;
+ break;
+ case TIOCGWINSZ: /* get window size */
+ *(struct winsize *)data = tp->t_winsize;
+ break;
+ case TIOCGPGRP: /* get pgrp of tty */
+ if (!isctty(p, tp))
+ return (ENOTTY);
+ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ break;
+#ifdef TIOCHPCL
+ case TIOCHPCL: /* hang up on last close */
+ s = spltty();
+ SET(tp->t_cflag, HUPCL);
+ splx(s);
+ break;
+#endif
+ case TIOCNXCL: /* reset exclusive use of tty */
+ s = spltty();
+ CLR(tp->t_state, TS_XCLUDE);
+ splx(s);
+ break;
+ case TIOCOUTQ: /* output queue size */
+ *(int *)data = tp->t_outq.c_cc;
+ break;
+ case TIOCSETA: /* set termios struct */
+ case TIOCSETAW: /* drain output, set */
+ case TIOCSETAF: { /* drn out, fls in, set */
+ register struct termios *t = (struct termios *)data;
+
+ if (t->c_ispeed == 0)
+ t->c_ispeed = t->c_ospeed;
+ if (t->c_ispeed == 0)
+ t->c_ispeed = tp->t_ospeed;
+ if (t->c_ispeed == 0)
+ return (EINVAL);
+ s = spltty();
+ if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ error = ttywait(tp);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ if (cmd == TIOCSETAF)
+ ttyflush(tp, FREAD);
+ }
+ if (!ISSET(t->c_cflag, CIGNORE)) {
+ /*
+ * Set device hardware.
+ */
+ if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
+ splx(s);
+ return (error);
+ }
+ if (ISSET(t->c_cflag, CLOCAL) &&
+ !ISSET(tp->t_cflag, CLOCAL)) {
+ /*
+ * XXX disconnections would be too hard to
+ * get rid of without this kludge. The only
+ * way to get rid of controlling terminals
+ * is to exit from the session leader.
+ */
+ CLR(tp->t_state, TS_ZOMBIE);
+
+ wakeup(TSA_CARR_ON(tp));
+ ttwakeup(tp);
+ ttwwakeup(tp);
+ }
+ if ((ISSET(tp->t_state, TS_CARR_ON) ||
+ ISSET(t->c_cflag, CLOCAL)) &&
+ !ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ else
+ CLR(tp->t_state, TS_CONNECTED);
+ tp->t_cflag = t->c_cflag;
+ tp->t_ispeed = t->c_ispeed;
+ if (t->c_ospeed != 0)
+ tp->t_ospeed = t->c_ospeed;
+ ttsetwater(tp);
+ }
+ if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) &&
+ cmd != TIOCSETAF) {
+ if (ISSET(t->c_lflag, ICANON))
+ SET(tp->t_lflag, PENDIN);
+ else {
+ /*
+ * XXX we really shouldn't allow toggling
+ * ICANON while we're in a non-termios line
+ * discipline. Now we have to worry about
+				 * panicking for a null queue.
+ */
+ if (tp->t_canq.c_cbreserved > 0 &&
+ tp->t_rawq.c_cbreserved > 0) {
+ catq(&tp->t_rawq, &tp->t_canq);
+ /*
+ * XXX the queue limits may be
+ * different, so the old queue
+ * swapping method no longer works.
+ */
+ catq(&tp->t_canq, &tp->t_rawq);
+ }
+ CLR(tp->t_lflag, PENDIN);
+ }
+ ttwakeup(tp);
+ }
+ tp->t_iflag = t->c_iflag;
+ tp->t_oflag = t->c_oflag;
+ /*
+ * Make the EXTPROC bit read only.
+ */
+ if (ISSET(tp->t_lflag, EXTPROC))
+ SET(t->c_lflag, EXTPROC);
+ else
+ CLR(t->c_lflag, EXTPROC);
+ tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
+ if (t->c_cc[VMIN] != tp->t_cc[VMIN] ||
+ t->c_cc[VTIME] != tp->t_cc[VTIME])
+ ttwakeup(tp);
+ bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc));
+ splx(s);
+ break;
+ }
+ case TIOCSETD: { /* set line discipline */
+ register int t = *(int *)data;
+ dev_t device = tp->t_dev;
+
+ if ((u_int)t >= nlinesw)
+ return (ENXIO);
+ if (t != tp->t_line) {
+ s = spltty();
+ (*linesw[tp->t_line].l_close)(tp, flag);
+ error = (*linesw[t].l_open)(device, tp);
+ if (error) {
+ (void)(*linesw[tp->t_line].l_open)(device, tp);
+ splx(s);
+ return (error);
+ }
+ tp->t_line = t;
+ splx(s);
+ }
+ break;
+ }
+ case TIOCSTART: /* start output, like ^Q */
+ s = spltty();
+ if (ISSET(tp->t_state, TS_TTSTOP) ||
+ ISSET(tp->t_lflag, FLUSHO)) {
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_TTSTOP);
+ ttstart(tp);
+ }
+ splx(s);
+ break;
+ case TIOCSTI: /* simulate terminal input */
+ if (p->p_ucred->cr_uid && (flag & FREAD) == 0)
+ return (EPERM);
+ if (p->p_ucred->cr_uid && !isctty(p, tp))
+ return (EACCES);
+ s = spltty();
+ (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp);
+ splx(s);
+ break;
+ case TIOCSTOP: /* stop output, like ^S */
+ s = spltty();
+ if (!ISSET(tp->t_state, TS_TTSTOP)) {
+ SET(tp->t_state, TS_TTSTOP);
+#ifdef sun4c /* XXX */
+ (*tp->t_stop)(tp, 0);
+#else
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0);
+#endif
+ }
+ splx(s);
+ break;
+ case TIOCSCTTY: /* become controlling tty */
+ /* Session ctty vnode pointer set in vnode layer. */
+ if (!SESS_LEADER(p) ||
+ ((p->p_session->s_ttyvp || tp->t_session) &&
+ (tp->t_session != p->p_session)))
+ return (EPERM);
+ tp->t_session = p->p_session;
+ tp->t_pgrp = p->p_pgrp;
+ p->p_session->s_ttyp = tp;
+ p->p_flag |= P_CONTROLT;
+ break;
+ case TIOCSPGRP: { /* set pgrp of tty */
+ register struct pgrp *pgrp = pgfind(*(int *)data);
+
+ if (!isctty(p, tp))
+ return (ENOTTY);
+ else if (pgrp == NULL || pgrp->pg_session != p->p_session)
+ return (EPERM);
+ tp->t_pgrp = pgrp;
+ break;
+ }
+ case TIOCSTAT: /* simulate control-T */
+ s = spltty();
+ ttyinfo(tp);
+ splx(s);
+ break;
+ case TIOCSWINSZ: /* set window size */
+ if (bcmp((caddr_t)&tp->t_winsize, data,
+ sizeof (struct winsize))) {
+ tp->t_winsize = *(struct winsize *)data;
+ pgsignal(tp->t_pgrp, SIGWINCH, 1);
+ }
+ break;
+ case TIOCSDRAINWAIT:
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ tp->t_timeout = *(int *)data * hz;
+ wakeup(TSA_OCOMPLETE(tp));
+ wakeup(TSA_OLOWAT(tp));
+ break;
+ case TIOCGDRAINWAIT:
+ *(int *)data = tp->t_timeout / hz;
+ break;
+ default:
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ return (ttcompat(tp, cmd, data, flag));
+#else
+ return (ENOIOCTL);
+#endif
+ }
+ return (0);
+}
+
+int
+ttypoll(tp, events, p)
+ struct tty *tp;
+ int events;
+ struct proc *p;
+{
+ int s;
+ int revents = 0;
+
+ if (tp == NULL) /* XXX used to return ENXIO, but that means true! */
+ return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM))
+ | POLLHUP);
+
+ s = spltty();
+ if (events & (POLLIN | POLLRDNORM))
+ if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE))
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(p, &tp->t_rsel);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if ((tp->t_outq.c_cc <= tp->t_olowat &&
+ ISSET(tp->t_state, TS_CONNECTED))
+ || ISSET(tp->t_state, TS_ZOMBIE))
+ revents |= events & (POLLOUT | POLLWRNORM);
+ else
+ selrecord(p, &tp->t_wsel);
+ splx(s);
+ return (revents);
+}
+
+/*
+ * This is a wrapper for compatibility with the select vector used by
+ * cdevsw. It relies on a proper xxxdevtotty routine.
+ */
+int
+ttpoll(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+ return ttypoll((*cdevsw[major(dev)]->d_devtotty)(dev), events, p);
+}
+
+/*
+ * Must be called at spltty().
+ */
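+/*
+ * (Used by the FIONREAD ioctl and by ttypoll() above; in non-canonical
+ * mode, fewer than VMIN queued chars with VTIME == 0 counts as nothing
+ * to read, matching the sleep logic in ttread().)
+ */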
+static int
+ttnread(tp)
+ struct tty *tp;
+{
+ int nread;
+
+ if (ISSET(tp->t_lflag, PENDIN))
+ ttypend(tp);
+ nread = tp->t_canq.c_cc;
+ if (!ISSET(tp->t_lflag, ICANON)) {
+ nread += tp->t_rawq.c_cc;
+ if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0)
+ nread = 0;
+ }
+ return (nread);
+}
+
+/*
+ * Wait for output to drain.
+ */
+int
+ttywait(tp)
+ register struct tty *tp;
+{
+ int error, s;
+
+ error = 0;
+ s = spltty();
+ while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+ ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) {
+ (*tp->t_oproc)(tp);
+ if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+ ISSET(tp->t_state, TS_CONNECTED)) {
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ error = ttysleep(tp, TSA_OCOMPLETE(tp),
+ TTOPRI | PCATCH, "ttywai",
+ tp->t_timeout);
+ if (error) {
+ if (error == EWOULDBLOCK)
+ error = EIO;
+ break;
+ }
+ } else
+ break;
+ }
+ if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)))
+ error = EIO;
+ splx(s);
+ return (error);
+}
+
+/*
+ * Flush if the wait succeeds.
+ */
+static int
+ttywflush(tp)
+ struct tty *tp;
+{
+ int error;
+
+ if ((error = ttywait(tp)) == 0)
+ ttyflush(tp, FREAD);
+ return (error);
+}
+
+/*
+ * Flush tty read and/or write queues, notifying anyone waiting.
+ */
+void
+ttyflush(tp, rw)
+ register struct tty *tp;
+ int rw;
+{
+ register int s;
+
+ s = spltty();
+#if 0
+again:
+#endif
+ if (rw & FWRITE) {
+ FLUSHQ(&tp->t_outq);
+ CLR(tp->t_state, TS_TTSTOP);
+ }
+#ifdef sun4c /* XXX */
+ (*tp->t_stop)(tp, rw);
+#else
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw);
+#endif
+ if (rw & FREAD) {
+ FLUSHQ(&tp->t_canq);
+ FLUSHQ(&tp->t_rawq);
+ CLR(tp->t_lflag, PENDIN);
+ tp->t_rocount = 0;
+ tp->t_rocol = 0;
+ CLR(tp->t_state, TS_LOCAL);
+ ttwakeup(tp);
+ if (ISSET(tp->t_state, TS_TBLOCK)) {
+ if (rw & FWRITE)
+ FLUSHQ(&tp->t_outq);
+ ttyunblock(tp);
+
+ /*
+			 * Don't leave any state that might clobber the
+ * next line discipline (although we should do more
+ * to send the START char). Not clearing the state
+ * may have caused the "putc to a clist with no
+ * reserved cblocks" panic/printf.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+
+#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */
+ if (ISSET(tp->t_iflag, IXOFF)) {
+ /*
+ * XXX wait a bit in the hope that the stop
+ * character (if any) will go out. Waiting
+ * isn't good since it allows races. This
+ * will be fixed when the stop character is
+ * put in a special queue. Don't bother with
+ * the checks in ttywait() since the timeout
+ * will save us.
+ */
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI,
+ "ttyfls", hz / 10);
+ /*
+ * Don't try sending the stop character again.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+ goto again;
+ }
+#endif
+ }
+ }
+ if (rw & FWRITE) {
+ FLUSHQ(&tp->t_outq);
+ ttwwakeup(tp);
+ }
+ splx(s);
+}
+
+/*
+ * Copy in the default termios characters.
+ */
+void
+termioschars(t)
+ struct termios *t;
+{
+
+ bcopy(ttydefchars, t->c_cc, sizeof t->c_cc);
+}
+
+/*
+ * Old interface.
+ */
+void
+ttychars(tp)
+ struct tty *tp;
+{
+
+ termioschars(&tp->t_termios);
+}
+
+/*
+ * Handle input high water. Send stop character for the IXOFF case. Turn
+ * on our input flow control bit and propagate the changes to the driver.
+ * XXX the stop character should be put in a special high priority queue.
+ */
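+/*
+ * E.g. with IXOFF set this queues the VSTOP char (^S by default) for
+ * transmission; if putc() fails because the output queue is full,
+ * TS_TBLOCK is cleared again so the stop char is retried on a later
+ * call.
+ */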
+void
+ttyblock(tp)
+ struct tty *tp;
+{
+
+ SET(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTOP], &tp->t_outq) != 0)
+ CLR(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
+}
+
+/*
+ * Handle input low water. Send start character for the IXOFF case. Turn
+ * off our input flow control bit and propagate the changes to the driver.
+ * XXX the start character should be put in a special high priority queue.
+ */
+static void
+ttyunblock(tp)
+ struct tty *tp;
+{
+
+ CLR(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTART], &tp->t_outq) != 0)
+ SET(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
+}
+
+#ifdef notyet
+/* Not used by any current (i386) drivers. */
+/*
+ * Restart after an inter-char delay.
+ */
+void
+ttrstrt(tp_arg)
+ void *tp_arg;
+{
+ struct tty *tp;
+ int s;
+
+ KASSERT(tp_arg != NULL, ("ttrstrt"));
+
+ tp = tp_arg;
+ s = spltty();
+
+ CLR(tp->t_state, TS_TIMEOUT);
+ ttstart(tp);
+
+ splx(s);
+}
+#endif
+
+int
+ttstart(tp)
+ struct tty *tp;
+{
+
+ if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */
+ (*tp->t_oproc)(tp);
+ return (0);
+}
+
+/*
+ * "close" a line discipline
+ */
+int
+ttylclose(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+
+ if (flag & FNONBLOCK || ttywflush(tp))
+ ttyflush(tp, FREAD | FWRITE);
+ return (0);
+}
+
+/*
+ * Handle modem control transition on a tty.
+ * Flag indicates new state of carrier.
+ * Returns 0 if the line should be turned off, otherwise 1.
+ */
+int
+ttymodem(tp, flag)
+ register struct tty *tp;
+ int flag;
+{
+
+ if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) {
+ /*
+ * MDMBUF: do flow control according to carrier flag
+ * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP
+ * works if IXON and IXANY are clear.
+ */
+ if (flag) {
+ CLR(tp->t_state, TS_CAR_OFLOW);
+ CLR(tp->t_state, TS_TTSTOP);
+ ttstart(tp);
+ } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) {
+ SET(tp->t_state, TS_CAR_OFLOW);
+ SET(tp->t_state, TS_TTSTOP);
+#ifdef sun4c /* XXX */
+ (*tp->t_stop)(tp, 0);
+#else
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0);
+#endif
+ }
+ } else if (flag == 0) {
+ /*
+ * Lost carrier.
+ */
+ CLR(tp->t_state, TS_CARR_ON);
+ if (ISSET(tp->t_state, TS_ISOPEN) &&
+ !ISSET(tp->t_cflag, CLOCAL)) {
+ SET(tp->t_state, TS_ZOMBIE);
+ CLR(tp->t_state, TS_CONNECTED);
+ if (tp->t_session && tp->t_session->s_leader)
+ psignal(tp->t_session->s_leader, SIGHUP);
+ ttyflush(tp, FREAD | FWRITE);
+ return (0);
+ }
+ } else {
+ /*
+ * Carrier now on.
+ */
+ SET(tp->t_state, TS_CARR_ON);
+ if (!ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ wakeup(TSA_CARR_ON(tp));
+ ttwakeup(tp);
+ ttwwakeup(tp);
+ }
+ return (1);
+}
+
+/*
+ * Reinput pending characters after a state switch.
+ * Called at spltty().
+ */
+static void
+ttypend(tp)
+ register struct tty *tp;
+{
+ struct clist tq;
+ register int c;
+
+ CLR(tp->t_lflag, PENDIN);
+ SET(tp->t_state, TS_TYPEN);
+ /*
+ * XXX this assumes too much about clist internals. It may even
+ * fail if the cblock slush pool is empty. We can't allocate more
+ * cblocks here because we are called from an interrupt handler
+ * and clist_alloc_cblocks() can wait.
+ */
+ tq = tp->t_rawq;
+ bzero(&tp->t_rawq, sizeof tp->t_rawq);
+ tp->t_rawq.c_cbmax = tq.c_cbmax;
+ tp->t_rawq.c_cbreserved = tq.c_cbreserved;
+ while ((c = getc(&tq)) >= 0)
+ ttyinput(c, tp);
+ CLR(tp->t_state, TS_TYPEN);
+}
+
+/*
+ * Process a read call on a tty device.
+ */
+int
+ttread(tp, uio, flag)
+ register struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+ register struct clist *qp;
+ register int c;
+ register tcflag_t lflag;
+ register cc_t *cc = tp->t_cc;
+ register struct proc *p = curproc;
+ int s, first, error = 0;
+ int has_stime = 0, last_cc = 0;
+ long slp = 0; /* XXX this should be renamed `timo'. */
+ struct timeval stime;
+
+loop:
+ s = spltty();
+ lflag = tp->t_lflag;
+ /*
+ * take pending input first
+ */
+ if (ISSET(lflag, PENDIN)) {
+ ttypend(tp);
+ splx(s); /* reduce latency */
+ s = spltty();
+ lflag = tp->t_lflag; /* XXX ttypend() clobbers it */
+ }
+
+ /*
+ * Hang process if it's in the background.
+ */
+ if (isbackground(p, tp)) {
+ splx(s);
+ if ((p->p_sigignore & sigmask(SIGTTIN)) ||
+ (p->p_sigmask & sigmask(SIGTTIN)) ||
+ p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0)
+ return (EIO);
+ pgsignal(p->p_pgrp, SIGTTIN, 1);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0);
+ if (error)
+ return (error);
+ goto loop;
+ }
+
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ return (0); /* EOF */
+ }
+
+ /*
+ * If canonical, use the canonical queue,
+ * else use the raw queue.
+ *
+ * (should get rid of clists...)
+ */
+ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq;
+
+ if (flag & IO_NDELAY) {
+ if (qp->c_cc > 0)
+ goto read;
+ if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) {
+ splx(s);
+ return (0);
+ }
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ if (!ISSET(lflag, ICANON)) {
+ int m = cc[VMIN];
+ long t = cc[VTIME];
+ struct timeval timecopy;
+
+ /*
+ * Check each of the four combinations.
+ * (m > 0 && t == 0) is the normal read case.
+ * It should be fairly efficient, so we check that and its
+ * companion case (m == 0 && t == 0) first.
+ * For the other two cases, we compute the target sleep time
+ * into slp.
+ */
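+		/*
+		 * In short:
+		 *    m > 0, t == 0: block until at least m chars are queued;
+		 *    m == 0, t == 0: never block; 0 chars is an acceptable
+		 *	read;
+		 *    m > 0, t > 0: inter-char timer, armed by the first char
+		 *	and restarted by each later one; on expiry, return
+		 *	whatever is queued;
+		 *    m == 0, t > 0: overall timer, armed at the first check;
+		 *	return as soon as anything arrives, or 0 on expiry.
+		 */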
+ if (t == 0) {
+ if (qp->c_cc < m)
+ goto sleep;
+ if (qp->c_cc > 0)
+ goto read;
+
+ /* m, t and qp->c_cc are all 0. 0 is enough input. */
+ splx(s);
+ return (0);
+ }
+ t *= 100000; /* time in us */
+#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \
+ ((t1).tv_usec - (t2).tv_usec))
+ if (m > 0) {
+ if (qp->c_cc <= 0)
+ goto sleep;
+ if (qp->c_cc >= m)
+ goto read;
+ getmicrotime(&timecopy);
+ if (!has_stime) {
+ /* first character, start timer */
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else if (qp->c_cc > last_cc) {
+ /* got a character, restart timer */
+ stime = timecopy;
+ slp = t;
+ } else {
+ /* nothing, check expiration */
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0)
+ goto read;
+ }
+ last_cc = qp->c_cc;
+ } else { /* m == 0 */
+ if (qp->c_cc > 0)
+ goto read;
+ getmicrotime(&timecopy);
+ if (!has_stime) {
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else {
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0) {
+ /* Timed out, but 0 is enough input. */
+ splx(s);
+ return (0);
+ }
+ }
+ }
+#undef diff
+ /*
+ * Rounding down may make us wake up just short
+ * of the target, so we round up.
+ * The formula is ceiling(slp * hz/1000000).
+ * 32-bit arithmetic is enough for hz < 169.
+ * XXX see tvtohz() for how to avoid overflow if hz
+ * is large (divide by `tick' and/or arrange to
+ * use tvtohz() if hz is large).
+ */
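+		/*
+		 * E.g. with hz = 100, slp = 15000us yields
+		 * (15000 * 100 + 999999) / 1000000 = 2 ticks (ceiling(1.5)),
+		 * where truncation would sleep 1 tick and wake 5ms short.
+		 */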
+ slp = (long) (((u_long)slp * hz) + 999999) / 1000000;
+ goto sleep;
+ }
+ if (qp->c_cc <= 0) {
+sleep:
+ /*
+ * There is no input, or not enough input and we can block.
+ */
+ error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH,
+ ISSET(tp->t_state, TS_CONNECTED) ?
+ "ttyin" : "ttyhup", (int)slp);
+ splx(s);
+ if (error == EWOULDBLOCK)
+ error = 0;
+ else if (error)
+ return (error);
+ /*
+ * XXX what happens if another process eats some input
+ * while we are asleep (not just here)? It would be
+ * safest to detect changes and reset our state variables
+ * (has_stime and last_cc).
+ */
+ slp = 0;
+ goto loop;
+ }
+read:
+ splx(s);
+ /*
+ * Input present, check for input mapping and processing.
+ */
+ first = 1;
+ if (ISSET(lflag, ICANON | ISIG))
+ goto slowcase;
+ for (;;) {
+ char ibuf[IBUFSIZ];
+ int icc;
+
+ icc = imin(uio->uio_resid, IBUFSIZ);
+ icc = q_to_b(qp, ibuf, icc);
+ if (icc <= 0) {
+ if (first)
+ goto loop;
+ break;
+ }
+ error = uiomove(ibuf, icc, uio);
+ /*
+ * XXX if there was an error then we should ungetc() the
+ * unmoved chars and reduce icc here.
+ */
+#if NSNP > 0
+ if (ISSET(tp->t_lflag, ECHO) &&
+ ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpin((struct snoop *)tp->t_sc, ibuf, icc);
+#endif
+ if (error)
+ break;
+ if (uio->uio_resid == 0)
+ break;
+ first = 0;
+ }
+ goto out;
+slowcase:
+ for (;;) {
+ c = getc(qp);
+ if (c < 0) {
+ if (first)
+ goto loop;
+ break;
+ }
+ /*
+ * delayed suspend (^Y)
+ */
+ if (CCEQ(cc[VDSUSP], c) &&
+ ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) {
+ pgsignal(tp->t_pgrp, SIGTSTP, 1);
+ if (first) {
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH,
+ "ttybg3", 0);
+ if (error)
+ break;
+ goto loop;
+ }
+ break;
+ }
+ /*
+ * Interpret EOF only in canonical mode.
+ */
+ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON))
+ break;
+ /*
+ * Give user character.
+ */
+ error = ureadc(c, uio);
+ if (error)
+ /* XXX should ungetc(c, qp). */
+ break;
+#if NSNP > 0
+ /*
+ * Only snoop directly on input in echo mode. Non-echoed
+ * input will be snooped later iff the application echoes it.
+ */
+ if (ISSET(tp->t_lflag, ECHO) &&
+ ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpinc((struct snoop *)tp->t_sc, (char)c);
+#endif
+ if (uio->uio_resid == 0)
+ break;
+ /*
+ * In canonical mode check for a "break character"
+ * marking the end of a "line of input".
+ */
+ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
+ break;
+ first = 0;
+ }
+
+out:
+ /*
+ * Look to unblock input now that (presumably)
+ * the input queue has gone down.
+ */
+ s = spltty();
+ if (ISSET(tp->t_state, TS_TBLOCK) &&
+ tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat)
+ ttyunblock(tp);
+ splx(s);
+
+ return (error);
+}
+
+/*
+ * Check the output queue on tp for space for a kernel message (from uprintf
+ * or tprintf). Allow some space over the normal hiwater mark so we don't
+ * lose messages due to normal flow control, but don't let the tty run amok.
+ * Sleeps here are not interruptible, but we return prematurely if new signals
+ * arrive.
+ */
+int
+ttycheckoutq(tp, wait)
+ register struct tty *tp;
+ int wait;
+{
+ int hiwat, s, oldsig;
+
+ hiwat = tp->t_ohiwat;
+ s = spltty();
+ oldsig = wait ? curproc->p_siglist : 0;
+ if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100)
+ while (tp->t_outq.c_cc > hiwat) {
+ ttstart(tp);
+ if (tp->t_outq.c_cc <= hiwat)
+ break;
+ if (wait == 0 || curproc->p_siglist != oldsig) {
+ splx(s);
+ return (0);
+ }
+ SET(tp->t_state, TS_SO_OLOWAT);
+ tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz);
+ }
+ splx(s);
+ return (1);
+}
+
+/*
+ * Process a write call on a tty device.
+ */
+int
+ttwrite(tp, uio, flag)
+ register struct tty *tp;
+ register struct uio *uio;
+ int flag;
+{
+ register char *cp = NULL;
+ register int cc, ce;
+ register struct proc *p;
+ int i, hiwat, cnt, error, s;
+ char obuf[OBUFSIZ];
+
+ hiwat = tp->t_ohiwat;
+ cnt = uio->uio_resid;
+ error = 0;
+ cc = 0;
+loop:
+ s = spltty();
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ if (uio->uio_resid == cnt)
+ error = EIO;
+ goto out;
+ }
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
+ if (flag & IO_NDELAY) {
+ splx(s);
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ttydcd", 0);
+ splx(s);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ splx(s);
+ /*
+ * Hang the process if it's in the background.
+ */
+ p = curproc;
+ if (isbackground(p, tp) &&
+ ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 &&
+ (p->p_sigignore & sigmask(SIGTTOU)) == 0 &&
+ (p->p_sigmask & sigmask(SIGTTOU)) == 0) {
+ if (p->p_pgrp->pg_jobc == 0) {
+ error = EIO;
+ goto out;
+ }
+ pgsignal(p->p_pgrp, SIGTTOU, 1);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ /*
+ * Process the user's data in at most OBUFSIZ chunks. Perform any
+ * output translation. Keep track of high water mark, sleep on
+ * overflow awaiting device aid in acquiring new space.
+ */
+ while (uio->uio_resid > 0 || cc > 0) {
+ if (ISSET(tp->t_lflag, FLUSHO)) {
+ uio->uio_resid = 0;
+ return (0);
+ }
+ if (tp->t_outq.c_cc > hiwat)
+ goto ovhiwat;
+ /*
+ * Grab a hunk of data from the user, unless we have some
+ * leftover from last time.
+ */
+ if (cc == 0) {
+ cc = imin(uio->uio_resid, OBUFSIZ);
+ cp = obuf;
+ error = uiomove(cp, cc, uio);
+ if (error) {
+ cc = 0;
+ break;
+ }
+#if NSNP > 0
+ if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpin((struct snoop *)tp->t_sc, cp, cc);
+#endif
+ }
+ /*
+ * If nothing fancy need be done, grab those characters we
+ * can handle without any of ttyoutput's processing and
+ * just transfer them to the output q. For those chars
+ * which require special processing (as indicated by the
+ * bits in char_type), call ttyoutput. After processing
+ * a hunk of data, look for FLUSHO so ^O's will take effect
+ * immediately.
+ */
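+		/*
+		 * (scanc() returns how many of the cc chars remain when it
+		 * finds the first char whose char_type class bits are set,
+		 * so ce below is the length of the leading run that needs
+		 * no special output processing.)
+		 */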
+ while (cc > 0) {
+ if (!ISSET(tp->t_oflag, OPOST))
+ ce = cc;
+ else {
+ ce = cc - scanc((u_int)cc, (u_char *)cp,
+ char_type, CCLASSMASK);
+ /*
+ * If ce is zero, then we're processing
+ * a special character through ttyoutput.
+ */
+ if (ce == 0) {
+ tp->t_rocount = 0;
+ if (ttyoutput(*cp, tp) >= 0) {
+ /* No Clists, wait a bit. */
+ ttstart(tp);
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt,
+ TTOPRI|PCATCH,
+ "ttybf1", 0);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ cp++;
+ cc--;
+ if (ISSET(tp->t_lflag, FLUSHO) ||
+ tp->t_outq.c_cc > hiwat)
+ goto ovhiwat;
+ continue;
+ }
+ }
+ /*
+ * A bunch of normal characters have been found.
+ * Transfer them en masse to the output queue and
+ * continue processing at the top of the loop.
+ * If there are any further characters in this
+ * <= OBUFSIZ chunk, the first should be a character
+ * requiring special handling by ttyoutput.
+ */
+ tp->t_rocount = 0;
+ i = b_to_q(cp, ce, &tp->t_outq);
+ ce -= i;
+ tp->t_column += ce;
+ cp += ce, cc -= ce, tk_nout += ce;
+ tp->t_outcc += ce;
+ if (i > 0) {
+ /* No Clists, wait a bit. */
+ ttstart(tp);
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH,
+ "ttybf2", 0);
+ if (error)
+ goto out;
+ goto loop;
+ }
+ if (ISSET(tp->t_lflag, FLUSHO) ||
+ tp->t_outq.c_cc > hiwat)
+ break;
+ }
+ ttstart(tp);
+ }
+out:
+ /*
+ * If cc is nonzero, we leave the uio structure inconsistent, as the
+ * offset and iov pointers have moved forward, but it doesn't matter
+ * (the call will either return short or restart with a new uio).
+ */
+ uio->uio_resid += cc;
+ return (error);
+
+ovhiwat:
+ ttstart(tp);
+ s = spltty();
+ /*
+ * This can only occur if FLUSHO is set in t_lflag,
+ * or if ttstart/oproc is synchronous (or very fast).
+ */
+ if (tp->t_outq.c_cc <= hiwat) {
+ splx(s);
+ goto loop;
+ }
+ if (flag & IO_NDELAY) {
+ splx(s);
+ uio->uio_resid += cc;
+ return (uio->uio_resid == cnt ? EWOULDBLOCK : 0);
+ }
+ SET(tp->t_state, TS_SO_OLOWAT);
+ error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri",
+ tp->t_timeout);
+ splx(s);
+ if (error == EWOULDBLOCK)
+ error = EIO;
+ if (error)
+ goto out;
+ goto loop;
+}
+
+/*
+ * Rubout one character from the rawq of tp
+ * as cleanly as possible.
+ */
+static void
+ttyrub(c, tp)
+ register int c;
+ register struct tty *tp;
+{
+ register char *cp;
+ register int savecol;
+ int tabc, s;
+
+ if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
+ return;
+ CLR(tp->t_lflag, FLUSHO);
+ if (ISSET(tp->t_lflag, ECHOE)) {
+ if (tp->t_rocount == 0) {
+ /*
+ * Screwed by ttwrite; retype
+ */
+ ttyretype(tp);
+ return;
+ }
+ if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
+ ttyrubo(tp, 2);
+ else {
+ CLR(c, ~TTY_CHARMASK);
+ switch (CCLASS(c)) {
+ case ORDINARY:
+ ttyrubo(tp, 1);
+ break;
+ case BACKSPACE:
+ case CONTROL:
+ case NEWLINE:
+ case RETURN:
+ case VTAB:
+ if (ISSET(tp->t_lflag, ECHOCTL))
+ ttyrubo(tp, 2);
+ break;
+ case TAB:
+ if (tp->t_rocount < tp->t_rawq.c_cc) {
+ ttyretype(tp);
+ return;
+ }
+ s = spltty();
+ savecol = tp->t_column;
+ SET(tp->t_state, TS_CNTTB);
+ SET(tp->t_lflag, FLUSHO);
+ tp->t_column = tp->t_rocol;
+ cp = tp->t_rawq.c_cf;
+ if (cp)
+ tabc = *cp; /* XXX FIX NEXTC */
+ for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc))
+ ttyecho(tabc, tp);
+ CLR(tp->t_lflag, FLUSHO);
+ CLR(tp->t_state, TS_CNTTB);
+ splx(s);
+
+ /* savecol will now be length of the tab. */
+ savecol -= tp->t_column;
+ tp->t_column += savecol;
+ if (savecol > 8)
+ savecol = 8; /* overflow screw */
+ while (--savecol >= 0)
+ (void)ttyoutput('\b', tp);
+ break;
+ default: /* XXX */
+#define PANICSTR "ttyrub: would panic c = %d, val = %d\n"
+ (void)printf(PANICSTR, c, CCLASS(c));
+#ifdef notdef
+ panic(PANICSTR, c, CCLASS(c));
+#endif
+ }
+ }
+ } else if (ISSET(tp->t_lflag, ECHOPRT)) {
+ if (!ISSET(tp->t_state, TS_ERASE)) {
+ SET(tp->t_state, TS_ERASE);
+ (void)ttyoutput('\\', tp);
+ }
+ ttyecho(c, tp);
+ } else
+ ttyecho(tp->t_cc[VERASE], tp);
+ --tp->t_rocount;
+}
+
+/*
+ * Back over cnt characters, erasing them.
+ */
+static void
+ttyrubo(tp, cnt)
+ register struct tty *tp;
+ int cnt;
+{
+
+ while (cnt-- > 0) {
+ (void)ttyoutput('\b', tp);
+ (void)ttyoutput(' ', tp);
+ (void)ttyoutput('\b', tp);
+ }
+}
+
+/*
+ * ttyretype --
+ * Reprint the rawq line. Note, it is assumed that c_cc has already
+ * been checked.
+ */
+static void
+ttyretype(tp)
+ register struct tty *tp;
+{
+ register char *cp;
+ int s, c;
+
+ /* Echo the reprint character. */
+ if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
+ ttyecho(tp->t_cc[VREPRINT], tp);
+
+ (void)ttyoutput('\n', tp);
+
+ /*
+ * XXX
+ * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE
+ * BIT OF FIRST CHAR.
+ */
+ s = spltty();
+ for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0);
+ cp != NULL; cp = nextc(&tp->t_canq, cp, &c))
+ ttyecho(c, tp);
+ for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0);
+ cp != NULL; cp = nextc(&tp->t_rawq, cp, &c))
+ ttyecho(c, tp);
+ CLR(tp->t_state, TS_ERASE);
+ splx(s);
+
+ tp->t_rocount = tp->t_rawq.c_cc;
+ tp->t_rocol = 0;
+}
+
+/*
+ * Echo a typed character to the terminal.
+ */
+static void
+ttyecho(c, tp)
+ register int c;
+ register struct tty *tp;
+{
+
+ if (!ISSET(tp->t_state, TS_CNTTB))
+ CLR(tp->t_lflag, FLUSHO);
+ if ((!ISSET(tp->t_lflag, ECHO) &&
+ (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) ||
+ ISSET(tp->t_lflag, EXTPROC))
+ return;
+ if (ISSET(tp->t_lflag, ECHOCTL) &&
+ ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') ||
+ ISSET(c, TTY_CHARMASK) == 0177)) {
+ (void)ttyoutput('^', tp);
+ CLR(c, ~TTY_CHARMASK);
+ if (c == 0177)
+ c = '?';
+ else
+ c += 'A' - 1;
+ }
+ (void)ttyoutput(c, tp);
+}
+
+/*
+ * Wake up any readers on a tty.
+ */
+void
+ttwakeup(tp)
+ register struct tty *tp;
+{
+
+ if (tp->t_rsel.si_pid != 0)
+ selwakeup(&tp->t_rsel);
+ if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
+ pgsigio(tp->t_sigio, SIGIO, (tp->t_session != NULL));
+ wakeup(TSA_HUP_OR_INPUT(tp));
+}
+
+/*
+ * Wake up any writers on a tty.
+ */
+void
+ttwwakeup(tp)
+ register struct tty *tp;
+{
+
+ if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_olowat)
+ selwakeup(&tp->t_wsel);
+ if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) ==
+ TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) {
+ CLR(tp->t_state, TS_SO_OCOMPLETE);
+ wakeup(TSA_OCOMPLETE(tp));
+ }
+ if (ISSET(tp->t_state, TS_SO_OLOWAT) &&
+ tp->t_outq.c_cc <= tp->t_olowat) {
+ CLR(tp->t_state, TS_SO_OLOWAT);
+ wakeup(TSA_OLOWAT(tp));
+ }
+}
+
+/*
+ * Look up a code for a specified speed in a conversion table;
+ * used by drivers to map software speed values to hardware parameters.
+ */
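+/*
+ * Illustrative use (table values invented for this example): a driver
+ * keeps a speedtab terminated by an sp_speed of -1 and maps a termios
+ * speed to a hardware code:
+ *
+ *	static struct speedtab foospeedtab[] = {
+ *		{ 9600, 12 },	{ 19200, 6 },	{ -1, -1 }
+ *	};
+ *
+ *	divisor = ttspeedtab(t->c_ospeed, foospeedtab);
+ *	if (divisor < 0)
+ *		return (EINVAL);	(speed not supported)
+ */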
+int
+ttspeedtab(speed, table)
+ int speed;
+ register struct speedtab *table;
+{
+
+ for ( ; table->sp_speed != -1; table++)
+ if (table->sp_speed == speed)
+ return (table->sp_code);
+ return (-1);
+}
+
+/*
+ * Set input and output watermarks and buffer sizes. For input, the
+ * high watermark is about one second's worth of input above empty, the
+ * low watermark is slightly below high water, and the buffer size is a
+ * driver-dependent amount above high water. For output, the watermarks
+ * are near the ends of the buffer, with about 1 second's worth of output
+ * between them. All this only applies to the standard line discipline.
+ */
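+/*
+ * E.g. at 9600 bps the input side below works out to cps = 960, so
+ * t_ihiwat = 960 and t_ilowat = 7 * 960 / 8 = 840; the output numbers
+ * additionally depend on the TTM*HIWAT/TTM*LOWAT clamps.
+ */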
+void
+ttsetwater(tp)
+ struct tty *tp;
+{
+ register int cps, ttmaxhiwat, x;
+
+ /* Input. */
+ clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ switch (tp->t_ispeedwat) {
+ case (speed_t)-1:
+ cps = tp->t_ispeed / 10;
+ break;
+ case 0:
+ /*
+ * This case is for old drivers that don't know about
+ * t_ispeedwat. Arrange for them to get the old buffer
+ * sizes and watermarks.
+ */
+ cps = TTYHOG - 2 * 256;
+ tp->t_ififosize = 2 * 256;
+ break;
+ default:
+ cps = tp->t_ispeedwat / 10;
+ break;
+ }
+ tp->t_ihiwat = cps;
+ tp->t_ilowat = 7 * cps / 8;
+ x = cps + tp->t_ififosize;
+ clist_alloc_cblocks(&tp->t_rawq, x, x);
+
+ /* Output. */
+ switch (tp->t_ospeedwat) {
+ case (speed_t)-1:
+ cps = tp->t_ospeed / 10;
+ ttmaxhiwat = 2 * TTMAXHIWAT;
+ break;
+ case 0:
+ cps = tp->t_ospeed / 10;
+ ttmaxhiwat = TTMAXHIWAT;
+ break;
+ default:
+ cps = tp->t_ospeedwat / 10;
+ ttmaxhiwat = 8 * TTMAXHIWAT;
+ break;
+ }
+#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x))
+ tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
+ x += cps;
+ x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */
+ tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */
+ x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */
+ x += OBUFSIZ + 100;
+ clist_alloc_cblocks(&tp->t_outq, x, x);
+#undef CLAMP
+}
+
+/*
+ * Report on state of foreground process group.
+ */
+void
+ttyinfo(tp)
+ register struct tty *tp;
+{
+ register struct proc *p, *pick;
+ struct timeval utime, stime;
+ int tmp;
+
+ if (ttycheckoutq(tp,0) == 0)
+ return;
+
+ /* Print load average. */
+ tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
+ ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100);
+
+ if (tp->t_session == NULL)
+ ttyprintf(tp, "not a controlling terminal\n");
+ else if (tp->t_pgrp == NULL)
+ ttyprintf(tp, "no foreground process group\n");
+ else if ((p = tp->t_pgrp->pg_members.lh_first) == 0)
+ ttyprintf(tp, "empty foreground process group\n");
+ else {
+ /* Pick interesting process. */
+ for (pick = NULL; p != 0; p = p->p_pglist.le_next)
+ if (proc_compare(pick, p))
+ pick = p;
+
+ ttyprintf(tp, " cmd: %s %d [%s] ", pick->p_comm, pick->p_pid,
+ pick->p_stat == SRUN ? "running" :
+ pick->p_wmesg ? pick->p_wmesg : "iowait");
+
+ calcru(pick, &utime, &stime, NULL);
+
+ /* Print user time. */
+ ttyprintf(tp, "%ld.%02ldu ",
+ utime.tv_sec, utime.tv_usec / 10000);
+
+ /* Print system time. */
+ ttyprintf(tp, "%ld.%02lds ",
+ stime.tv_sec, stime.tv_usec / 10000);
+
+#define pgtok(a) (((a) * PAGE_SIZE) / 1024)
+ /* Print percentage cpu, resident set size. */
+ tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
+ ttyprintf(tp, "%d%% %ldk\n",
+ tmp / 100,
+ pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 :
+#ifdef pmap_resident_count
+ (long)pgtok(pmap_resident_count(&pick->p_vmspace->vm_pmap))
+#else
+ (long)pgtok(pick->p_vmspace->vm_rssize)
+#endif
+ );
+ }
+ tp->t_rocount = 0; /* so pending input will be retyped if BS */
+}
+
+/*
+ * Returns 1 if p2 is "better" than p1
+ *
+ * The algorithm for picking the "interesting" process is thus:
+ *
+ * 1) Only foreground processes are eligible - implied.
+ * 2) Runnable processes are favored over anything else. The runner
+ * with the highest cpu utilization is picked (p_estcpu). Ties are
+ * broken by picking the highest pid.
+ * 3) The sleeper with the shortest sleep time is next. With ties,
+ * we pick out just "short-term" sleepers (P_SINTR == 0).
+ * 4) Further ties are broken by picking the highest pid.
+ */
+#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL))
+#define TESTAB(a, b) ((a)<<1 | (b))
+#define ONLYA 2
+#define ONLYB 1
+#define BOTH 3
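+
+/*
+ * TESTAB() packs the two predicates into two bits, so e.g.
+ * TESTAB(1, 0) == ONLYA, TESTAB(0, 1) == ONLYB and TESTAB(1, 1) == BOTH.
+ */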
+
+static int
+proc_compare(p1, p2)
+ register struct proc *p1, *p2;
+{
+
+ if (p1 == NULL)
+ return (1);
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(ISRUN(p1), ISRUN(p2))) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ /*
+ * tie - favor one with highest recent cpu utilization
+ */
+ if (p2->p_estcpu > p1->p_estcpu)
+ return (1);
+ if (p1->p_estcpu > p2->p_estcpu)
+ return (0);
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ }
+ /*
+ * weed out zombies
+ */
+ switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) {
+ case ONLYA:
+ return (1);
+ case ONLYB:
+ return (0);
+ case BOTH:
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+ }
+ /*
+ * pick the one with the smallest sleep time
+ */
+ if (p2->p_slptime > p1->p_slptime)
+ return (0);
+ if (p1->p_slptime > p2->p_slptime)
+ return (1);
+ /*
+ * favor one sleeping in a non-interruptible sleep
+ */
+ if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0)
+ return (1);
+ if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0)
+ return (0);
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+}
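
The TESTAB()/ONLYA/ONLYB/BOTH idiom above packs two predicates into a
two-bit switch selector.  A standalone sketch of the same pattern,
illustrative only and not part of this diff:

	/* Dispatch on "only a", "only b", "both", or fall through. */
	#define TESTAB(a, b)	((a)<<1 | (b))
	#define ONLYA	2
	#define ONLYB	1
	#define BOTH	3

	static const char *
	classify(int a, int b)
	{
		switch (TESTAB(a != 0, b != 0)) {
		case ONLYA:
			return ("only a");
		case ONLYB:
			return ("only b");
		case BOTH:
			return ("both");
		}
		return ("neither");	/* TESTAB() == 0 falls through */
	}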
+
+/*
+ * Output char to tty; console putchar style.
+ */
+int
+tputchar(c, tp)
+ int c;
+ struct tty *tp;
+{
+ register int s;
+
+ s = spltty();
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
+ splx(s);
+ return (-1);
+ }
+ if (c == '\n')
+ (void)ttyoutput('\r', tp);
+ (void)ttyoutput(c, tp);
+ ttstart(tp);
+ splx(s);
+ return (0);
+}
+
+/*
+ * Sleep on chan, returning ERESTART if tty changed while we napped and
+ * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If
+ * the tty is revoked, restarting a pending call will redo validation done
+ * at the start of the call.
+ */
+int
+ttysleep(tp, chan, pri, wmesg, timo)
+ struct tty *tp;
+ void *chan;
+ int pri, timo;
+ char *wmesg;
+{
+ int error;
+ int gen;
+
+ gen = tp->t_gen;
+ error = tsleep(chan, pri, wmesg, timo);
+ if (error)
+ return (error);
+ return (tp->t_gen == gen ? 0 : ERESTART);
+}
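
A typical caller loops on its wakeup condition and propagates whatever
ttysleep() returns, so a revoke (which bumps t_gen) surfaces as
ERESTART and redoes the whole operation; ptsopen() in tty_pty.c below
follows exactly this shape.  A condensed kernel-side sketch,
illustrative only ("ttopen" is a hypothetical wait message):

	while ((tp->t_state & TS_CARR_ON) == 0) {
		error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
		    "ttopen", 0);
		if (error)
			return (error);	/* EINTR, EWOULDBLOCK or ERESTART */
	}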
+
+#ifdef notyet
+/*
+ * XXX this is usable but not useful or used.  Most tty drivers have
+ * ifdefs for using ttymalloc() but assume a different interface.
+ */
+/*
+ * Allocate a tty struct. Clists in the struct will be allocated by
+ * ttyopen().
+ */
+struct tty *
+ttymalloc()
+{
+ struct tty *tp;
+
+ tp = malloc(sizeof *tp, M_TTYS, M_WAITOK);
+ bzero(tp, sizeof *tp);
+ return (tp);
+}
+#endif
+
+#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */
+/*
+ * Free a tty struct. Clists in the struct should have been freed by
+ * ttyclose().
+ */
+void
+ttyfree(tp)
+ struct tty *tp;
+{
+ free(tp, M_TTYS);
+}
+#endif /* 0 */
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
new file mode 100644
index 0000000..fa2ae5c
--- /dev/null
+++ b/sys/kern/tty_compat.c
@@ -0,0 +1,490 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93
+ * $Id: tty_compat.c,v 1.27 1998/02/25 06:16:37 bde Exp $
+ */
+
+#include "opt_compat.h"
+
+/*
+ * mapping routines for old line discipline (yuck)
+ */
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl_compat.h>
+#include <sys/tty.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+static int ttcompatgetflags __P((struct tty *tp));
+static void ttcompatsetflags __P((struct tty *tp, struct termios *t));
+static void ttcompatsetlflags __P((struct tty *tp, struct termios *t));
+static int ttcompatspeedtab __P((int speed, struct speedtab *table));
+
+static int ttydebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, "");
+
+static struct speedtab compatspeeds[] = {
+#define MAX_SPEED 17
+ { 115200, 17 },
+ { 57600, 16 },
+ { 38400, 15 },
+ { 19200, 14 },
+ { 9600, 13 },
+ { 4800, 12 },
+ { 2400, 11 },
+ { 1800, 10 },
+ { 1200, 9 },
+ { 600, 8 },
+ { 300, 7 },
+ { 200, 6 },
+ { 150, 5 },
+ { 134, 4 },
+ { 110, 3 },
+ { 75, 2 },
+ { 50, 1 },
+ { 0, 0 },
+ { -1, -1 },
+};
+static int compatspcodes[] = {
+ 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
+ 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200,
+};
+
+static int
+ttcompatspeedtab(speed, table)
+ int speed;
+ register struct speedtab *table;
+{
+ if (speed == 0)
+ return (0); /* hangup */
+ for ( ; table->sp_speed > 0; table++)
+ if (table->sp_speed <= speed) /* nearest one, rounded down */
+ return (table->sp_code);
+ return (1); /* 50, min and not hangup */
+}
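
compatspeeds[] and compatspcodes[] are inverses of each other:
ttcompatspeedtab() rounds a bit rate down to the nearest sgtty speed
code, and compatspcodes[] maps a code back to a rate.  Conceptually
(both tables are file-local, so this is an illustrative sketch only):

	assert(ttcompatspeedtab(9600, compatspeeds) == 13);   /* exact */
	assert(compatspcodes[13] == 9600);                     /* inverse */
	assert(ttcompatspeedtab(10000, compatspeeds) == 13);   /* rounded down */
	assert(ttcompatspeedtab(0, compatspeeds) == 0);        /* hangup */
	assert(ttcompatspeedtab(20, compatspeeds) == 1);       /* min, 50 bps */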
+
+int
+ttsetcompat(tp, com, data, term)
+ register struct tty *tp;
+ u_long *com;
+ caddr_t data;
+ struct termios *term;
+{
+ switch (*com) {
+ case TIOCSETP:
+ case TIOCSETN: {
+ register struct sgttyb *sg = (struct sgttyb *)data;
+ int speed;
+
+ if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds))
+ term->c_ispeed = compatspcodes[speed];
+ else
+ term->c_ispeed = tp->t_ispeed;
+ if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds))
+ term->c_ospeed = compatspcodes[speed];
+ else
+ term->c_ospeed = tp->t_ospeed;
+ term->c_cc[VERASE] = sg->sg_erase;
+ term->c_cc[VKILL] = sg->sg_kill;
+ tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff);
+ ttcompatsetflags(tp, term);
+ *com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
+ break;
+ }
+ case TIOCSETC: {
+ struct tchars *tc = (struct tchars *)data;
+ register cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VINTR] = tc->t_intrc;
+ cc[VQUIT] = tc->t_quitc;
+ cc[VSTART] = tc->t_startc;
+ cc[VSTOP] = tc->t_stopc;
+ cc[VEOF] = tc->t_eofc;
+ cc[VEOL] = tc->t_brkc;
+ if (tc->t_brkc == -1)
+ cc[VEOL2] = _POSIX_VDISABLE;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCSLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ register cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VSUSP] = ltc->t_suspc;
+ cc[VDSUSP] = ltc->t_dsuspc;
+ cc[VREPRINT] = ltc->t_rprntc;
+ cc[VDISCARD] = ltc->t_flushc;
+ cc[VWERASE] = ltc->t_werasc;
+ cc[VLNEXT] = ltc->t_lnextc;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+ if (*com == TIOCLSET)
+ tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
+ else {
+ tp->t_flags =
+ (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff);
+ if (*com == TIOCLBIS)
+ tp->t_flags |= *(int *)data<<16;
+ else
+ tp->t_flags &= ~(*(int *)data<<16);
+ }
+ ttcompatsetlflags(tp, term);
+ *com = TIOCSETA;
+ break;
+ }
+ return 0;
+}
+
+/*ARGSUSED*/
+int
+ttcompat(tp, com, data, flag)
+ register struct tty *tp;
+ u_long com;
+ caddr_t data;
+ int flag;
+{
+ switch (com) {
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET: {
+ struct termios term;
+ int error;
+
+ term = tp->t_termios;
+ if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
+ return error;
+ return ttioctl(tp, com, &term, flag);
+ }
+ case TIOCGETP: {
+ register struct sgttyb *sg = (struct sgttyb *)data;
+ register cc_t *cc = tp->t_cc;
+
+ sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds);
+ if (tp->t_ispeed == 0)
+ sg->sg_ispeed = sg->sg_ospeed;
+ else
+ sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds);
+ sg->sg_erase = cc[VERASE];
+ sg->sg_kill = cc[VKILL];
+ sg->sg_flags = tp->t_flags = ttcompatgetflags(tp);
+ break;
+ }
+ case TIOCGETC: {
+ struct tchars *tc = (struct tchars *)data;
+ register cc_t *cc = tp->t_cc;
+
+ tc->t_intrc = cc[VINTR];
+ tc->t_quitc = cc[VQUIT];
+ tc->t_startc = cc[VSTART];
+ tc->t_stopc = cc[VSTOP];
+ tc->t_eofc = cc[VEOF];
+ tc->t_brkc = cc[VEOL];
+ break;
+ }
+ case TIOCGLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ register cc_t *cc = tp->t_cc;
+
+ ltc->t_suspc = cc[VSUSP];
+ ltc->t_dsuspc = cc[VDSUSP];
+ ltc->t_rprntc = cc[VREPRINT];
+ ltc->t_flushc = cc[VDISCARD];
+ ltc->t_werasc = cc[VWERASE];
+ ltc->t_lnextc = cc[VLNEXT];
+ break;
+ }
+ case TIOCLGET:
+ tp->t_flags =
+ (ttcompatgetflags(tp) & 0xffff0000UL)
+ | (tp->t_flags & 0xffff);
+ *(int *)data = tp->t_flags>>16;
+ if (ttydebug)
+ printf("CLGET: returning %x\n", *(int *)data);
+ break;
+
+ case OTIOCGETD:
+ *(int *)data = tp->t_line ? tp->t_line : 2;
+ break;
+
+ case OTIOCSETD: {
+ int ldisczero = 0;
+
+ return (ttioctl(tp, TIOCSETD,
+ *(int *)data == 2 ? (caddr_t)&ldisczero : data, flag));
+ }
+
+ case OTIOCCONS:
+ *(int *)data = 1;
+ return (ttioctl(tp, TIOCCONS, data, flag));
+
+ default:
+ return (ENOIOCTL);
+ }
+ return (0);
+}
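
Seen from userland, these paths are what keeps a 4.3BSD-era sgtty
binary working: the old ioctls arrive here and are rewritten into
termios operations.  An illustrative userland sketch, assuming a
kernel built with COMPAT_43 and the <sys/ioctl_compat.h> definitions:

	#include <sys/ioctl.h>
	#include <sys/ioctl_compat.h>

	int
	set_cbreak(int fd)
	{
		struct sgttyb sg;

		if (ioctl(fd, TIOCGETP, &sg) == -1)	/* served by ttcompat() */
			return (-1);
		sg.sg_flags |= CBREAK;
		sg.sg_flags &= ~ECHO;
		return (ioctl(fd, TIOCSETN, &sg));	/* via ttsetcompat() */
	}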
+
+static int
+ttcompatgetflags(tp)
+ register struct tty *tp;
+{
+ register tcflag_t iflag = tp->t_iflag;
+ register tcflag_t lflag = tp->t_lflag;
+ register tcflag_t oflag = tp->t_oflag;
+ register tcflag_t cflag = tp->t_cflag;
+ register int flags = 0;
+
+ if (iflag&IXOFF)
+ flags |= TANDEM;
+ if (iflag&ICRNL || oflag&ONLCR)
+ flags |= CRMOD;
+ if ((cflag&CSIZE) == CS8) {
+ flags |= PASS8;
+ if (iflag&ISTRIP)
+ flags |= ANYP;
+ }
+ else if (cflag&PARENB) {
+ if (iflag&INPCK) {
+ if (cflag&PARODD)
+ flags |= ODDP;
+ else
+ flags |= EVENP;
+ } else
+ flags |= EVENP | ODDP;
+ }
+
+ if ((lflag&ICANON) == 0) {
+ /* fudge */
+ if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
+ || (cflag&(CSIZE|PARENB)) != CS8)
+ flags |= CBREAK;
+ else
+ flags |= RAW;
+ }
+ if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8)
+ flags |= LITOUT;
+ if (cflag&MDMBUF)
+ flags |= MDMBUF;
+ if ((cflag&HUPCL) == 0)
+ flags |= NOHANG;
+ if (oflag&OXTABS)
+ flags |= XTABS;
+ if (lflag&ECHOE)
+ flags |= CRTERA|CRTBS;
+ if (lflag&ECHOKE)
+ flags |= CRTKIL|CRTBS;
+ if (lflag&ECHOPRT)
+ flags |= PRTERA;
+ if (lflag&ECHOCTL)
+ flags |= CTLECH;
+ if ((iflag&IXANY) == 0)
+ flags |= DECCTQ;
+ flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ if (ttydebug)
+ printf("getflags: %x\n", flags);
+ return (flags);
+}
+
+static void
+ttcompatsetflags(tp, t)
+ register struct tty *tp;
+ register struct termios *t;
+{
+ register int flags = tp->t_flags;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
+
+ if (flags & RAW) {
+ iflag = IGNBRK;
+ lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
+ } else {
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ iflag |= BRKINT|IXON|IMAXBEL;
+ lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */
+ if (flags & XTABS)
+ oflag |= OXTABS;
+ else
+ oflag &= ~OXTABS;
+ if (flags & CBREAK)
+ lflag &= ~ICANON;
+ else
+ lflag |= ICANON;
+ if (flags&CRMOD) {
+ iflag |= ICRNL;
+ oflag |= ONLCR;
+ } else {
+ iflag &= ~ICRNL;
+ oflag &= ~ONLCR;
+ }
+ }
+ if (flags&ECHO)
+ lflag |= ECHO;
+ else
+ lflag &= ~ECHO;
+
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ /* XXX don't set INPCK if RAW or PASS8? */
+ if ((flags&(EVENP|ODDP)) == EVENP) {
+ iflag |= INPCK;
+ cflag &= ~PARODD;
+ } else if ((flags&(EVENP|ODDP)) == ODDP) {
+ iflag |= INPCK;
+ cflag |= PARODD;
+ } else
+ iflag &= ~INPCK;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+
+static void
+ttcompatsetlflags(tp, t)
+ register struct tty *tp;
+ register struct termios *t;
+{
+ register int flags = tp->t_flags;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
+
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ if (flags&CRTERA)
+ lflag |= ECHOE;
+ else
+ lflag &= ~ECHOE;
+ if (flags&CRTKIL)
+ lflag |= ECHOKE;
+ else
+ lflag &= ~ECHOKE;
+ if (flags&PRTERA)
+ lflag |= ECHOPRT;
+ else
+ lflag &= ~ECHOPRT;
+ if (flags&CTLECH)
+ lflag |= ECHOCTL;
+ else
+ lflag &= ~ECHOCTL;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ if (flags & MDMBUF)
+ cflag |= MDMBUF;
+ else
+ cflag &= ~MDMBUF;
+ if (flags&NOHANG)
+ cflag &= ~HUPCL;
+ else
+ cflag |= HUPCL;
+ lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+
+ /*
+ * The next if-else statement is copied from above so don't bother
+	 * checking it separately.  We could avoid fiddling with the
+	 * character size if the mode is already RAW or if neither the
+	 * LITOUT bit nor the PASS8 bit is being changed, but the delta of
+ * the change is not available here and skipping the RAW case would
+ * make the code different from above.
+ */
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c
new file mode 100644
index 0000000..12f26e0
--- /dev/null
+++ b/sys/kern/tty_conf.c
@@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94
+ * $Id: tty_conf.c,v 1.12 1997/12/16 17:40:27 eivind Exp $
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+
+#ifndef MAXLDISC
+#define MAXLDISC 8
+#endif
+
+static l_open_t l_noopen;
+static l_close_t l_noclose;
+static l_ioctl_t l_nullioctl;
+static l_rint_t l_norint;
+static l_start_t l_nostart;
+
+/*
+ * XXX it probably doesn't matter what the entries other than the l_open
+ * entry are here. The l_nullioctl and ttymodem entries still look fishy.
+ * Reconsider the removal of nullmodem anyway. It was too much like
+ * ttymodem, but a completely null version might be useful.
+ */
+#define NODISC(n) \
+ { l_noopen, l_noclose, l_noread, l_nowrite, \
+ l_nullioctl, l_norint, l_nostart, ttymodem }
+
+struct linesw linesw[MAXLDISC] =
+{
+ /* 0- termios */
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+ NODISC(1), /* 1- defunct */
+ /* 2- NTTYDISC */
+#ifdef COMPAT_43
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+#else
+ NODISC(2),
+#endif
+ NODISC(3), /* TABLDISC */
+ NODISC(4), /* SLIPDISC */
+ NODISC(5), /* PPPDISC */
+ NODISC(6), /* loadable */
+ NODISC(7), /* loadable */
+};
+
+int nlinesw = sizeof (linesw) / sizeof (linesw[0]);
+
+static struct linesw nodisc = NODISC(0);
+
+#define LOADABLE_LDISC 6
+/*
+ * ldisc_register: Register a line discipline.
+ *
+ * discipline: Index for discipline to load, or LDISC_LOAD for us to choose.
+ * linesw_p:  Pointer to the struct linesw to install in that slot.
+ *
+ * Returns: Index used or -1 on failure.
+ */
+int
+ldisc_register(discipline, linesw_p)
+ int discipline;
+ struct linesw *linesw_p;
+{
+ int slot = -1;
+
+ if (discipline == LDISC_LOAD) {
+ int i;
+ for (i = LOADABLE_LDISC; i < MAXLDISC; i++)
+ if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) {
+ slot = i;
+ }
+ }
+ else if (discipline >= 0 && discipline < MAXLDISC) {
+ slot = discipline;
+ }
+
+ if (slot != -1 && linesw_p)
+ linesw[slot] = *linesw_p;
+
+ return slot;
+}
+
+/*
+ * ldisc_deregister: Deregister a line discipline obtained with
+ * ldisc_register. Can only deregister "loadable" ones now.
+ *
+ * discipline: Index for discipline to unload.
+ */
+void
+ldisc_deregister(discipline)
+ int discipline;
+{
+ if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) {
+ linesw[discipline] = nodisc;
+ }
+}
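
A loadable module would typically claim a slot with LDISC_LOAD and
remember the returned index for teardown.  An illustrative sketch;
`mydisc_linesw' is a hypothetical, fully populated struct linesw
supplied by the module:

	static int mydisc_slot = -1;

	static int
	mydisc_load(void)
	{
		mydisc_slot = ldisc_register(LDISC_LOAD, &mydisc_linesw);
		return (mydisc_slot == -1 ? ENOBUFS : 0);
	}

	static void
	mydisc_unload(void)
	{
		if (mydisc_slot != -1)
			ldisc_deregister(mydisc_slot);
	}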
+
+static int
+l_noopen(dev, tp)
+ dev_t dev;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_noclose(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_noread(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_nowrite(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_norint(c, tp)
+ int c;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_nostart(tp)
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+/*
+ * Do-nothing version of the line-discipline-specific
+ * ioctl command.
+ */
+static int
+l_nullioctl(tp, cmd, data, flags, p)
+ struct tty *tp;
+ u_long cmd;
+ char *data;
+ int flags;
+ struct proc *p;
+{
+
+ return (ENOIOCTL);
+}
diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c
index 6189d72..581ff3f 100644
--- a/sys/kern/tty_cons.c
+++ b/sys/kern/tty_cons.c
@@ -35,129 +35,323 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)cons.c 7.2 (Berkeley) 5/9/91
- *
- * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE
- * -------------------- ----- ----------------------
- * CURRENT PATCH LEVEL: 1 00083
- * -------------------- ----- ----------------------
- *
- * 16 Aug 92 Pace Willisson /dev/console redirect (xterm -C, etc.)
- * 14 Mar 93 Chris G. Demetriou Moved pg() here from isa/pccons.c
+ * from: @(#)cons.c 7.2 (Berkeley) 5/9/91
+ * $Id: cons.c,v 1.59 1998/08/23 08:26:40 bde Exp $
*/
+#include "opt_devfs.h"
-#include "sys/param.h"
-#include "sys/proc.h"
-#include "sys/user.h"
-#include "sys/systm.h"
-#include "sys/buf.h"
-#include "sys/ioctl.h"
-#include "sys/tty.h"
-#include "sys/file.h"
-#include "sys/conf.h"
+#include <sys/param.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/reboot.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
-#include "cons.h"
+#include <machine/cpu.h>
+#include <machine/cons.h>
-/* XXX - all this could be autoconfig()ed */
-int pccnprobe(), pccninit(), pccngetc(), pccnputc();
-#include "com.h"
-#if NCOM > 0
-int comcnprobe(), comcninit(), comcngetc(), comcnputc();
-#endif
+static d_open_t cnopen;
+static d_close_t cnclose;
+static d_read_t cnread;
+static d_write_t cnwrite;
+static d_ioctl_t cnioctl;
+static d_poll_t cnpoll;
-struct consdev constab[] = {
- { pccnprobe, pccninit, pccngetc, pccnputc },
-#if NCOM > 0
- { comcnprobe, comcninit, comcngetc, comcnputc },
-#endif
- { 0 },
+#define CDEV_MAJOR 0
+static struct cdevsw cn_cdevsw = {
+ cnopen, cnclose, cnread, cnwrite,
+ cnioctl, nullstop, nullreset, nodevtotty,
+ cnpoll, nommap, NULL, "console",
+ NULL, -1, nodump, nopsize,
+ D_TTY,
};
-/* end XXX */
-struct tty *constty = 0; /* virtual console output device */
-struct consdev *cn_tab; /* physical console device info */
-struct tty *cn_tty; /* XXX: console tty struct for tprintf */
+static dev_t cn_dev_t; /* seems to be never really used */
+SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD,
+ &cn_dev_t, sizeof cn_dev_t, "T,dev_t", "");
+
+static int cn_mute;
+int cons_unavail = 0; /* XXX:
+ * physical console not available for
+ * input (i.e., it is in graphics mode)
+ */
+
+static u_char cn_is_open; /* nonzero if logical console is open */
+static int openmode, openflag; /* how /dev/console was openned */
+static u_char cn_phys_is_open; /* nonzero if physical device is open */
+static d_close_t *cn_phys_close; /* physical device close function */
+static d_open_t *cn_phys_open; /* physical device open function */
+static struct consdev *cn_tab; /* physical console device info */
+static struct tty *cn_tp; /* physical console tty struct */
+#ifdef DEVFS
+static void *cn_devfs_token; /* represents the devfs entry */
+#endif /* DEVFS */
+
+CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL);
+
+void
cninit()
{
- register struct consdev *cp;
+ struct consdev *best_cp, *cp;
+ struct consdev **list;
/*
- * Collect information about all possible consoles
- * and find the one with highest priority
+ * Find the first console with the highest priority.
*/
- for (cp = constab; cp->cn_probe; cp++) {
+ best_cp = NULL;
+ list = (struct consdev **)cons_set.ls_items;
+ while ((cp = *list++) != NULL) {
+ if (cp->cn_probe == NULL)
+ continue;
(*cp->cn_probe)(cp);
if (cp->cn_pri > CN_DEAD &&
- (cn_tab == NULL || cp->cn_pri > cn_tab->cn_pri))
- cn_tab = cp;
+ (best_cp == NULL || cp->cn_pri > best_cp->cn_pri))
+ best_cp = cp;
}
+
+ /*
+	 * Check if we should mute the console (for security reasons perhaps).
+	 * It can be changed dynamically using the sysctl kern.consmute
+	 * once we are up and running.
+ */
+ cn_mute = ((boothowto & (RB_MUTE
+ |RB_SINGLE
+ |RB_VERBOSE
+ |RB_ASKNAME
+ |RB_CONFIG)) == RB_MUTE);
+
+ /*
+ * If no console, give up.
+ */
+ if (best_cp == NULL) {
+ cn_tab = best_cp;
+ return;
+ }
+
+ /*
+ * Initialize console, then attach to it. This ordering allows
+ * debugging using the previous console, if any.
+ * XXX if there was a previous console, then its driver should
+ * be informed when we forget about it.
+ */
+ (*best_cp->cn_init)(best_cp);
+ cn_tab = best_cp;
+}
+
+void
+cninit_finish()
+{
+ struct cdevsw *cdp;
+
+ if ((cn_tab == NULL) || cn_mute)
+ return;
+
/*
- * No console, we can handle it
+ * Hook the open and close functions.
*/
- if ((cp = cn_tab) == NULL)
+ cdp = cdevsw[major(cn_tab->cn_dev)];
+ cn_phys_close = cdp->d_close;
+ cdp->d_close = cnclose;
+ cn_phys_open = cdp->d_open;
+ cdp->d_open = cnopen;
+ cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev);
+ cn_dev_t = cn_tp->t_dev;
+}
+
+static void
+cnuninit(void)
+{
+ struct cdevsw *cdp;
+
+ if (cn_tab == NULL)
return;
+
/*
- * Turn on console
+ * Unhook the open and close functions.
*/
- cn_tty = cp->cn_tp;
- (*cp->cn_init)(cp);
+ cdp = cdevsw[major(cn_tab->cn_dev)];
+ cdp->d_close = cn_phys_close;
+ cn_phys_close = NULL;
+ cdp->d_open = cn_phys_open;
+ cn_phys_open = NULL;
+ cn_tp = NULL;
+ cn_dev_t = 0;
+}
+
+/*
+ * User has changed the state of the console muting.
+ * This may require us to open or close the device in question.
+ */
+static int
+sysctl_kern_consmute SYSCTL_HANDLER_ARGS
+{
+ int error;
+ int ocn_mute;
+
+ ocn_mute = cn_mute;
+ error = sysctl_handle_int(oidp, &cn_mute, 0, req);
+ if((error == 0) && (cn_tab != NULL) && (req->newptr != NULL)) {
+ if(ocn_mute && !cn_mute) {
+ /*
+			 * going from muted to unmuted: open the physical dev
+			 * if the console has been opened
+ */
+ cninit_finish();
+ if(cn_is_open)
+ /* XXX curproc is not what we want really */
+ error = cnopen(cn_dev_t, openflag,
+ openmode, curproc);
+ /* if it failed, back it out */
+ if ( error != 0) cnuninit();
+ } else if (!ocn_mute && cn_mute) {
+ /*
+			 * going from unmuted to muted: close the physical dev
+			 * if it's only open via /dev/console
+ */
+ if(cn_is_open)
+ error = cnclose(cn_dev_t, openflag,
+ openmode, curproc);
+ if ( error == 0) cnuninit();
+ }
+ if (error != 0) {
+ /*
+ * back out the change if there was an error
+ */
+ cn_mute = ocn_mute;
+ }
+ }
+ return (error);
}
+SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof cn_mute, sysctl_kern_consmute, "I", "");
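
From userland the knob serviced by this handler is an ordinary integer
sysctl.  An illustrative sketch using sysctlbyname(3):

	#include <sys/types.h>
	#include <sys/sysctl.h>

	int
	console_mute(int on)
	{
		return (sysctlbyname("kern.consmute", NULL, NULL,
		    &on, sizeof(on)));
	}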
+
+static int
cnopen(dev, flag, mode, p)
dev_t dev;
int flag, mode;
struct proc *p;
{
+ dev_t cndev, physdev;
+ int retval = 0;
+
if (cn_tab == NULL)
return (0);
- dev = cn_tab->cn_dev;
- return ((*cdevsw[major(dev)].d_open)(dev, flag, mode, p));
+ cndev = cn_tab->cn_dev;
+ physdev = (major(dev) == major(cndev) ? dev : cndev);
+ /*
+	 * If mute is active, then non-console opens don't get here, so
+	 * we don't need to check for that; they bypass this and go
+	 * straight to the device.
+ */
+ if(!cn_mute)
+ retval = (*cn_phys_open)(physdev, flag, mode, p);
+ if (retval == 0) {
+ /*
+	 * check if we opened it via /dev/console or
+ * via the physical entry (e.g. /dev/sio0).
+ */
+ if (dev == cndev)
+ cn_phys_is_open = 1;
+ else if (physdev == cndev) {
+ openmode = mode;
+ openflag = flag;
+ cn_is_open = 1;
+ }
+ }
+ return (retval);
}
-
+
+static int
cnclose(dev, flag, mode, p)
dev_t dev;
int flag, mode;
struct proc *p;
{
+ dev_t cndev;
+
if (cn_tab == NULL)
return (0);
- dev = cn_tab->cn_dev;
- return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, p));
+ cndev = cn_tab->cn_dev;
+ /*
+	 * act appropriately depending on whether it's /dev/console
+	 * or the physical device (e.g. /dev/sio) that's being closed.
+ * in either case, don't actually close the device unless
+ * both are closed.
+ */
+ if (dev == cndev) {
+ /* the physical device is about to be closed */
+ cn_phys_is_open = 0;
+ if (cn_is_open) {
+ if (cn_tp) {
+ /* perform a ttyhalfclose() */
+ /* reset session and proc group */
+ cn_tp->t_pgrp = NULL;
+ cn_tp->t_session = NULL;
+ }
+ return (0);
+ }
+ } else if (major(dev) != major(cndev)) {
+ /* the logical console is about to be closed */
+ cn_is_open = 0;
+ if (cn_phys_is_open)
+ return (0);
+ dev = cndev;
+ }
+ if(cn_phys_close)
+ return ((*cn_phys_close)(dev, flag, mode, p));
+ return (0);
}
-
+
+static int
cnread(dev, uio, flag)
dev_t dev;
struct uio *uio;
+ int flag;
{
- if (cn_tab == NULL)
+ if ((cn_tab == NULL) || cn_mute)
return (0);
dev = cn_tab->cn_dev;
- return ((*cdevsw[major(dev)].d_read)(dev, uio, flag));
+ return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag));
}
-
+
+static int
cnwrite(dev, uio, flag)
dev_t dev;
struct uio *uio;
+ int flag;
{
- if (cn_tab == NULL)
+ if ((cn_tab == NULL) || cn_mute) {
+ uio->uio_resid = 0; /* dump the data */
return (0);
- if (constty) /* 16 Aug 92*/
+ }
+ if (constty)
dev = constty->t_dev;
else
dev = cn_tab->cn_dev;
- return ((*cdevsw[major(dev)].d_write)(dev, uio, flag));
+ return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag));
}
-
+
+static int
cnioctl(dev, cmd, data, flag, p)
dev_t dev;
+ u_long cmd;
caddr_t data;
+ int flag;
struct proc *p;
{
int error;
- if (cn_tab == NULL)
+ if ((cn_tab == NULL) || cn_mute)
return (0);
/*
* Superuser can always use this to wrest control of console
@@ -171,43 +365,74 @@ cnioctl(dev, cmd, data, flag, p)
return (0);
}
dev = cn_tab->cn_dev;
- return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, data, flag, p));
+ return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p));
}
-/*ARGSUSED*/
-cnselect(dev, rw, p)
+static int
+cnpoll(dev, events, p)
dev_t dev;
- int rw;
+ int events;
struct proc *p;
{
- if (cn_tab == NULL)
+ if ((cn_tab == NULL) || cn_mute)
return (1);
- return (ttselect(cn_tab->cn_dev, rw, p));
+
+ dev = cn_tab->cn_dev;
+
+ return ((*cdevsw[major(dev)]->d_poll)(dev, events, p));
}
+int
cngetc()
{
- if (cn_tab == NULL)
- return (0);
- return ((*cn_tab->cn_getc)(cn_tab->cn_dev));
+ int c;
+ if ((cn_tab == NULL) || cn_mute)
+ return (-1);
+ c = (*cn_tab->cn_getc)(cn_tab->cn_dev);
+ if (c == '\r') c = '\n'; /* console input is always ICRNL */
+ return (c);
}
+int
+cncheckc()
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return (-1);
+ return ((*cn_tab->cn_checkc)(cn_tab->cn_dev));
+}
+
+void
cnputc(c)
register int c;
{
- if (cn_tab == NULL)
+ if ((cn_tab == NULL) || cn_mute)
return;
if (c) {
- (*cn_tab->cn_putc)(cn_tab->cn_dev, c);
if (c == '\n')
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
+ (*cn_tab->cn_putc)(cn_tab->cn_dev, c);
}
}
-pg(p,q,r,s,t,u,v,w,x,y,z) char *p; {
- printf(p,q,r,s,t,u,v,w,x,y,z);
- printf("\n>");
- return(cngetc());
+static int cn_devsw_installed = 0;
+
+static void
+cn_drvinit(void *unused)
+{
+ dev_t dev;
+
+ if( ! cn_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR,0);
+ cdevsw_add(&dev,&cn_cdevsw,NULL);
+ cn_devsw_installed = 1;
+#ifdef DEVFS
+ cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0600,
+ "console");
+#endif
+ }
}
+SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL)
+
diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c
new file mode 100644
index 0000000..214f103
--- /dev/null
+++ b/sys/kern/tty_pty.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95
+ * $Id: tty_pty.c,v 1.53 1998/07/15 12:18:30 bde Exp $
+ */
+
+/*
+ * Pseudo-teletype Driver
+ * (Actually two drivers, requiring two entries in 'cdevsw')
+ */
+#include "pty.h" /* XXX */
+#include "opt_compat.h"
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#ifdef notyet
+static void ptyattach __P((int n));
+#endif
+static void ptsstart __P((struct tty *tp));
+static void ptcwakeup __P((struct tty *tp, int flag));
+
+static d_open_t ptsopen;
+static d_close_t ptsclose;
+static d_read_t ptsread;
+static d_write_t ptswrite;
+static d_ioctl_t ptyioctl;
+static d_stop_t ptsstop;
+static d_devtotty_t ptydevtotty;
+static d_open_t ptcopen;
+static d_close_t ptcclose;
+static d_read_t ptcread;
+static d_write_t ptcwrite;
+static d_poll_t ptcpoll;
+
+#define CDEV_MAJOR_S 5
+static struct cdevsw pts_cdevsw = {
+ ptsopen, ptsclose, ptsread, ptswrite,
+ ptyioctl, ptsstop, nullreset, ptydevtotty,
+ ttpoll, nommap, NULL, "pts",
+ NULL, -1, nodump, nopsize,
+ D_TTY,
+};
+
+#define CDEV_MAJOR_C 6
+static struct cdevsw ptc_cdevsw = {
+ ptcopen, ptcclose, ptcread, ptcwrite,
+ ptyioctl, nullstop, nullreset, ptydevtotty,
+ ptcpoll, nommap, NULL, "ptc",
+ NULL, -1, nodump, nopsize,
+ D_TTY,
+};
+
+#if NPTY == 1
+#undef NPTY
+#define NPTY 32 /* crude XXX */
+#warning You have only one pty defined, redefining to 32.
+#endif
+
+#ifdef DEVFS
+#define MAXUNITS (8 * 32)
+static void *devfs_token_pts[MAXUNITS];
+static void *devfs_token_ptc[MAXUNITS];
+static const char jnames[] = "pqrsPQRS";
+#if NPTY > MAXUNITS
+#undef NPTY
+#define NPTY MAXUNITS
+#warning Can't have more than 256 pty's with DEVFS defined.
+#endif
+#endif
+
+#define BUFSIZ 100 /* Chunk size iomoved to/from user */
+
+/*
+ * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
+ * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
+ */
+static struct tty pt_tty[NPTY]; /* XXX */
+static struct pt_ioctl {
+ int pt_flags;
+ struct selinfo pt_selr, pt_selw;
+ u_char pt_send;
+ u_char pt_ucntl;
+} pt_ioctl[NPTY]; /* XXX */
+static int npty = NPTY; /* for pstat -t */
+
+#define PF_PKT 0x08 /* packet mode */
+#define PF_STOPPED 0x10 /* user told stopped */
+#define PF_REMOTE 0x20 /* remote and flow controlled input */
+#define PF_NOSTOP 0x40
+#define PF_UCNTL 0x80 /* user control mode */
+
+#ifdef notyet
+/*
+ * Establish n (or default if n is 1) ptys in the system.
+ *
+ * XXX cdevsw & pstat require the ptys to be kept in one contiguous array
+ */
+static void
+ptyattach(n)
+ int n;
+{
+ char *mem;
+ register u_long ntb;
+#define DEFAULT_NPTY 32
+
+ /* maybe should allow 0 => none? */
+ if (n <= 1)
+ n = DEFAULT_NPTY;
+ ntb = n * sizeof(struct tty);
+ mem = malloc(ntb + ALIGNBYTES + n * sizeof(struct pt_ioctl),
+ M_DEVBUF, M_WAITOK);
+ pt_tty = (struct tty *)mem;
+ mem = (char *)ALIGN(mem + ntb);
+ pt_ioctl = (struct pt_ioctl *)mem;
+ npty = n;
+}
+#endif
+
+/*ARGSUSED*/
+static int
+ptsopen(dev, flag, devtype, p)
+ dev_t dev;
+ int flag, devtype;
+ struct proc *p;
+{
+ register struct tty *tp;
+ int error;
+
+ if (minor(dev) >= npty)
+ return (ENXIO);
+ tp = &pt_tty[minor(dev)];
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ ttychars(tp); /* Set up default chars */
+ tp->t_iflag = TTYDEF_IFLAG;
+ tp->t_oflag = TTYDEF_OFLAG;
+ tp->t_lflag = TTYDEF_LFLAG;
+ tp->t_cflag = TTYDEF_CFLAG;
+ tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED;
+ } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0)
+ return (EBUSY);
+ if (tp->t_oproc) /* Ctrlr still around. */
+ (void)(*linesw[tp->t_line].l_modem)(tp, 1);
+ while ((tp->t_state & TS_CARR_ON) == 0) {
+ if (flag&FNONBLOCK)
+ break;
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ptsopn", 0);
+ if (error)
+ return (error);
+ }
+ error = (*linesw[tp->t_line].l_open)(dev, tp);
+ if (error == 0)
+ ptcwakeup(tp, FREAD|FWRITE);
+ return (error);
+}
+
+static int
+ptsclose(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ register struct tty *tp;
+ int err;
+
+ tp = &pt_tty[minor(dev)];
+ err = (*linesw[tp->t_line].l_close)(tp, flag);
+ ptsstop(tp, FREAD|FWRITE);
+ (void) ttyclose(tp);
+ return (err);
+}
+
+static int
+ptsread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ struct proc *p = curproc;
+ register struct tty *tp = &pt_tty[minor(dev)];
+ register struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
+ int error = 0;
+
+again:
+ if (pti->pt_flags & PF_REMOTE) {
+ while (isbackground(p, tp)) {
+ if ((p->p_sigignore & sigmask(SIGTTIN)) ||
+ (p->p_sigmask & sigmask(SIGTTIN)) ||
+ p->p_pgrp->pg_jobc == 0 ||
+ p->p_flag & P_PPWAIT)
+ return (EIO);
+ pgsignal(p->p_pgrp, SIGTTIN, 1);
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg",
+ 0);
+ if (error)
+ return (error);
+ }
+ if (tp->t_canq.c_cc == 0) {
+ if (flag & IO_NDELAY)
+ return (EWOULDBLOCK);
+ error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH,
+ "ptsin", 0);
+ if (error)
+ return (error);
+ goto again;
+ }
+ while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0)
+ if (ureadc(getc(&tp->t_canq), uio) < 0) {
+ error = EFAULT;
+ break;
+ }
+ if (tp->t_canq.c_cc == 1)
+ (void) getc(&tp->t_canq);
+ if (tp->t_canq.c_cc)
+ return (error);
+ } else
+ if (tp->t_oproc)
+ error = (*linesw[tp->t_line].l_read)(tp, uio, flag);
+ ptcwakeup(tp, FWRITE);
+ return (error);
+}
+
+/*
+ * Write to pseudo-tty.
+ * Wakeups of controlling tty will happen
+ * indirectly, when tty driver calls ptsstart.
+ */
+static int
+ptswrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ register struct tty *tp;
+
+ tp = &pt_tty[minor(dev)];
+ if (tp->t_oproc == 0)
+ return (EIO);
+ return ((*linesw[tp->t_line].l_write)(tp, uio, flag));
+}
+
+/*
+ * Start output on pseudo-tty.
+ * Wake up process selecting or sleeping for input from controlling tty.
+ */
+static void
+ptsstart(tp)
+ struct tty *tp;
+{
+ register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)];
+
+ if (tp->t_state & TS_TTSTOP)
+ return;
+ if (pti->pt_flags & PF_STOPPED) {
+ pti->pt_flags &= ~PF_STOPPED;
+ pti->pt_send = TIOCPKT_START;
+ }
+ ptcwakeup(tp, FREAD);
+}
+
+static void
+ptcwakeup(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+ struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)];
+
+ if (flag & FREAD) {
+ selwakeup(&pti->pt_selr);
+ wakeup(TSA_PTC_READ(tp));
+ }
+ if (flag & FWRITE) {
+ selwakeup(&pti->pt_selw);
+ wakeup(TSA_PTC_WRITE(tp));
+ }
+}
+
+static int
+ptcopen(dev, flag, devtype, p)
+ dev_t dev;
+ int flag, devtype;
+ struct proc *p;
+{
+ register struct tty *tp;
+ struct pt_ioctl *pti;
+
+ if (minor(dev) >= npty)
+ return (ENXIO);
+ tp = &pt_tty[minor(dev)];
+ if (tp->t_oproc)
+ return (EIO);
+ tp->t_oproc = ptsstart;
+#ifdef sun4c
+ tp->t_stop = ptsstop;
+#endif
+ (void)(*linesw[tp->t_line].l_modem)(tp, 1);
+ tp->t_lflag &= ~EXTPROC;
+ pti = &pt_ioctl[minor(dev)];
+ pti->pt_flags = 0;
+ pti->pt_send = 0;
+ pti->pt_ucntl = 0;
+ return (0);
+}
+
+static int
+ptcclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+ register struct tty *tp;
+
+ tp = &pt_tty[minor(dev)];
+ (void)(*linesw[tp->t_line].l_modem)(tp, 0);
+
+ /*
+ * XXX MDMBUF makes no sense for ptys but would inhibit the above
+ * l_modem(). CLOCAL makes sense but isn't supported. Special
+ * l_modem()s that ignore carrier drop make no sense for ptys but
+ * may be in use because other parts of the line discipline make
+ * sense for ptys. Recover by doing everything that a normal
+ * ttymodem() would have done except for sending a SIGHUP.
+ */
+ if (tp->t_state & TS_ISOPEN) {
+ tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED);
+ tp->t_state |= TS_ZOMBIE;
+ ttyflush(tp, FREAD | FWRITE);
+ }
+
+ tp->t_oproc = 0; /* mark closed */
+ return (0);
+}
+
+static int
+ptcread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ register struct tty *tp = &pt_tty[minor(dev)];
+ struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
+ char buf[BUFSIZ];
+ int error = 0, cc;
+
+ /*
+ * We want to block until the slave
+ * is open, and there's something to read;
+ * but if we lost the slave or we're NBIO,
+ * then return the appropriate error instead.
+ */
+ for (;;) {
+ if (tp->t_state&TS_ISOPEN) {
+ if (pti->pt_flags&PF_PKT && pti->pt_send) {
+ error = ureadc((int)pti->pt_send, uio);
+ if (error)
+ return (error);
+ if (pti->pt_send & TIOCPKT_IOCTL) {
+ cc = min(uio->uio_resid,
+ sizeof(tp->t_termios));
+ uiomove((caddr_t)&tp->t_termios, cc,
+ uio);
+ }
+ pti->pt_send = 0;
+ return (0);
+ }
+ if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) {
+ error = ureadc((int)pti->pt_ucntl, uio);
+ if (error)
+ return (error);
+ pti->pt_ucntl = 0;
+ return (0);
+ }
+ if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0)
+ break;
+ }
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (0); /* EOF */
+ if (flag & IO_NDELAY)
+ return (EWOULDBLOCK);
+ error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0);
+ if (error)
+ return (error);
+ }
+ if (pti->pt_flags & (PF_PKT|PF_UCNTL))
+ error = ureadc(0, uio);
+ while (uio->uio_resid > 0 && error == 0) {
+ cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ));
+ if (cc <= 0)
+ break;
+ error = uiomove(buf, cc, uio);
+ }
+ ttwwakeup(tp);
+ return (error);
+}
+
+static void
+ptsstop(tp, flush)
+ register struct tty *tp;
+ int flush;
+{
+ struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)];
+ int flag;
+
+ /* note: FLUSHREAD and FLUSHWRITE already ok */
+ if (flush == 0) {
+ flush = TIOCPKT_STOP;
+ pti->pt_flags |= PF_STOPPED;
+ } else
+ pti->pt_flags &= ~PF_STOPPED;
+ pti->pt_send |= flush;
+ /* change of perspective */
+ flag = 0;
+ if (flush & FREAD)
+ flag |= FWRITE;
+ if (flush & FWRITE)
+ flag |= FREAD;
+ ptcwakeup(tp, flag);
+}
+
+static int
+ptcpoll(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+ register struct tty *tp = &pt_tty[minor(dev)];
+ struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
+ int revents = 0;
+ int s;
+
+ if ((tp->t_state & TS_CONNECTED) == 0)
+ return (seltrue(dev, events, p) | POLLHUP);
+
+ /*
+ * Need to block timeouts (ttrstart).
+ */
+ s = spltty();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if ((tp->t_state & TS_ISOPEN) &&
+ ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) ||
+ ((pti->pt_flags & PF_PKT) && pti->pt_send) ||
+ ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (tp->t_state & TS_ISOPEN &&
+ ((pti->pt_flags & PF_REMOTE) ?
+ (tp->t_canq.c_cc == 0) :
+ ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) ||
+ (tp->t_canq.c_cc == 0 && (tp->t_iflag & ICANON)))))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & POLLHUP)
+ if ((tp->t_state & TS_CARR_ON) == 0)
+ revents |= POLLHUP;
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLRDNORM))
+ selrecord(p, &pti->pt_selr);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ selrecord(p, &pti->pt_selw);
+ }
+ splx(s);
+
+ return (revents);
+}
+
+static int
+ptcwrite(dev, uio, flag)
+ dev_t dev;
+ register struct uio *uio;
+ int flag;
+{
+ register struct tty *tp = &pt_tty[minor(dev)];
+ register u_char *cp = 0;
+ register int cc = 0;
+ u_char locbuf[BUFSIZ];
+ int cnt = 0;
+ struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
+ int error = 0;
+
+again:
+ if ((tp->t_state&TS_ISOPEN) == 0)
+ goto block;
+ if (pti->pt_flags & PF_REMOTE) {
+ if (tp->t_canq.c_cc)
+ goto block;
+ while ((uio->uio_resid > 0 || cc > 0) &&
+ tp->t_canq.c_cc < TTYHOG - 1) {
+ if (cc == 0) {
+ cc = min(uio->uio_resid, BUFSIZ);
+ cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc);
+ cp = locbuf;
+ error = uiomove((caddr_t)cp, cc, uio);
+ if (error)
+ return (error);
+ /* check again for safety */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust as usual */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ }
+ if (cc > 0) {
+ cc = b_to_q((char *)cp, cc, &tp->t_canq);
+ /*
+ * XXX we don't guarantee that the canq size
+ * is >= TTYHOG, so the above b_to_q() may
+ * leave some bytes uncopied. However, space
+ * is guaranteed for the null terminator if
+ * we don't fail here since (TTYHOG - 1) is
+ * not a multiple of CBSIZE.
+ */
+ if (cc > 0)
+ break;
+ }
+ }
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ (void) putc(0, &tp->t_canq);
+ ttwakeup(tp);
+ wakeup(TSA_PTS_READ(tp));
+ return (0);
+ }
+ while (uio->uio_resid > 0 || cc > 0) {
+ if (cc == 0) {
+ cc = min(uio->uio_resid, BUFSIZ);
+ cp = locbuf;
+ error = uiomove((caddr_t)cp, cc, uio);
+ if (error)
+ return (error);
+ /* check again for safety */
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ }
+ while (cc > 0) {
+ if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 &&
+ (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) {
+ wakeup(TSA_HUP_OR_INPUT(tp));
+ goto block;
+ }
+ (*linesw[tp->t_line].l_rint)(*cp++, tp);
+ cnt++;
+ cc--;
+ }
+ cc = 0;
+ }
+ return (0);
+block:
+ /*
+ * Come here to wait for slave to open, for space
+ * in outq, or space in rawq, or an empty canq.
+ */
+ if ((tp->t_state & TS_CONNECTED) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (EIO);
+ }
+ if (flag & IO_NDELAY) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ if (cnt == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+ error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0);
+ if (error) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
+ return (error);
+ }
+ goto again;
+}
+
+static struct tty *
+ptydevtotty(dev)
+ dev_t dev;
+{
+ if (minor(dev) >= npty)
+ return (NULL);
+
+ return &pt_tty[minor(dev)];
+}
+
+/*ARGSUSED*/
+static int
+ptyioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ register struct tty *tp = &pt_tty[minor(dev)];
+ register struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
+ register u_char *cc = tp->t_cc;
+ int stop, error;
+
+ /*
+ * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG.
+ * ttywflush(tp) will hang if there are characters in the outq.
+ */
+ if (cmd == TIOCEXT) {
+ /*
+ * When the EXTPROC bit is being toggled, we need
+		 * to send a TIOCPKT_IOCTL if the packet driver
+ * is turned on.
+ */
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_PKT) {
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag |= EXTPROC;
+ } else {
+ if ((tp->t_lflag & EXTPROC) &&
+ (pti->pt_flags & PF_PKT)) {
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ }
+ tp->t_lflag &= ~EXTPROC;
+ }
+ return(0);
+ } else
+ if (cdevsw[major(dev)]->d_open == ptcopen)
+ switch (cmd) {
+
+ case TIOCGPGRP:
+ /*
+ * We avoid calling ttioctl on the controller since,
+ * in that case, tp must be the controlling terminal.
+ */
+ *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+ return (0);
+
+ case TIOCPKT:
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_UCNTL)
+ return (EINVAL);
+ pti->pt_flags |= PF_PKT;
+ } else
+ pti->pt_flags &= ~PF_PKT;
+ return (0);
+
+ case TIOCUCNTL:
+ if (*(int *)data) {
+ if (pti->pt_flags & PF_PKT)
+ return (EINVAL);
+ pti->pt_flags |= PF_UCNTL;
+ } else
+ pti->pt_flags &= ~PF_UCNTL;
+ return (0);
+
+ case TIOCREMOTE:
+ if (*(int *)data)
+ pti->pt_flags |= PF_REMOTE;
+ else
+ pti->pt_flags &= ~PF_REMOTE;
+ ttyflush(tp, FREAD|FWRITE);
+ return (0);
+
+#ifdef COMPAT_43
+ case TIOCSETP:
+ case TIOCSETN:
+#endif
+ case TIOCSETD:
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+ ndflush(&tp->t_outq, tp->t_outq.c_cc);
+ break;
+
+ case TIOCSIG:
+ if (*(unsigned int *)data >= NSIG ||
+ *(unsigned int *)data == 0)
+ return(EINVAL);
+ if ((tp->t_lflag&NOFLSH) == 0)
+ ttyflush(tp, FREAD|FWRITE);
+ pgsignal(tp->t_pgrp, *(unsigned int *)data, 1);
+ if ((*(unsigned int *)data == SIGINFO) &&
+ ((tp->t_lflag&NOKERNINFO) == 0))
+ ttyinfo(tp);
+ return(0);
+ }
+ error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p);
+ if (error == ENOIOCTL)
+ error = ttioctl(tp, cmd, data, flag);
+ if (error == ENOIOCTL) {
+ if (pti->pt_flags & PF_UCNTL &&
+ (cmd & ~0xff) == UIOCCMD(0)) {
+ if (cmd & 0xff) {
+ pti->pt_ucntl = (u_char)cmd;
+ ptcwakeup(tp, FREAD);
+ }
+ return (0);
+ }
+ error = ENOTTY;
+ }
+ /*
+ * If external processing and packet mode send ioctl packet.
+ */
+ if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) {
+ switch(cmd) {
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF:
+#ifdef COMPAT_43
+ case TIOCSETP:
+ case TIOCSETN:
+#endif
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+#endif
+ pti->pt_send |= TIOCPKT_IOCTL;
+ ptcwakeup(tp, FREAD);
+ default:
+ break;
+ }
+ }
+ stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
+ && CCEQ(cc[VSTART], CTRL('q'));
+ if (pti->pt_flags & PF_NOSTOP) {
+ if (stop) {
+ pti->pt_send &= ~TIOCPKT_NOSTOP;
+ pti->pt_send |= TIOCPKT_DOSTOP;
+ pti->pt_flags &= ~PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ } else {
+ if (!stop) {
+ pti->pt_send &= ~TIOCPKT_DOSTOP;
+ pti->pt_send |= TIOCPKT_NOSTOP;
+ pti->pt_flags |= PF_NOSTOP;
+ ptcwakeup(tp, FREAD);
+ }
+ }
+ return (error);
+}
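
A master-side consumer turns on packet mode with TIOCPKT, after which
every read is prefixed by a status byte (0, TIOCPKT_DATA, for ordinary
slave output).  An illustrative userland sketch:

	#include <sys/ioctl.h>
	#include <string.h>
	#include <unistd.h>

	ssize_t
	pkt_read(int mfd, char *buf, size_t len, int *status)
	{
		int one = 1;
		ssize_t n;

		if (ioctl(mfd, TIOCPKT, &one) == -1)
			return (-1);
		if ((n = read(mfd, buf, len)) <= 0)
			return (n);
		*status = buf[0];	/* TIOCPKT_DATA, _FLUSHREAD, ... */
		memmove(buf, buf + 1, n - 1);	/* payload follows it */
		return (n - 1);
	}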
+
+static int ptc_devsw_installed;
+
+static void ptc_drvinit __P((void *unused));
+static void
+ptc_drvinit(unused)
+ void *unused;
+{
+#ifdef DEVFS
+ int i,j,k;
+#endif
+ dev_t dev;
+
+ if( ! ptc_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR_S, 0);
+ cdevsw_add(&dev, &pts_cdevsw, NULL);
+ dev = makedev(CDEV_MAJOR_C, 0);
+ cdevsw_add(&dev, &ptc_cdevsw, NULL);
+ ptc_devsw_installed = 1;
+#ifdef DEVFS
+ for ( i = 0 ; i<NPTY ; i++ ) {
+ j = i / 32;
+ k = i % 32;
+ devfs_token_pts[i] =
+ devfs_add_devswf(&pts_cdevsw,i,
+ DV_CHR,0,0,0666,
+ "tty%c%r",jnames[j],k);
+ devfs_token_ptc[i] =
+ devfs_add_devswf(&ptc_cdevsw,i,
+ DV_CHR,0,0,0666,
+ "pty%c%r",jnames[j],k);
+ }
+#endif
+ }
+}
+
+SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL)
diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c
new file mode 100644
index 0000000..ba71a94
--- /dev/null
+++ b/sys/kern/tty_snoop.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 1995 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * Snoop stuff.
+ */
+
+#include "snp.h"
+
+#if NSNP > 0
+
+#include "opt_compat.h"
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filio.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+#include <sys/snoop.h>
+#include <sys/vnode.h>
+
+static d_open_t snpopen;
+static d_close_t snpclose;
+static d_read_t snpread;
+static d_write_t snpwrite;
+static d_ioctl_t snpioctl;
+static d_poll_t snppoll;
+
+#define CDEV_MAJOR 53
+static struct cdevsw snp_cdevsw =
+ { snpopen, snpclose, snpread, snpwrite, /*53*/
+ snpioctl, nostop, nullreset, nodevtotty,/* snoop */
+ snppoll, nommap, NULL, "snp", NULL, -1 };
+
+
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+
+static struct snoop snoopsw[NSNP];
+
+static struct tty *snpdevtotty __P((dev_t dev));
+static int snp_detach __P((struct snoop *snp));
+
+static struct tty *
+snpdevtotty (dev)
+ dev_t dev;
+{
+ struct cdevsw *cdp;
+ int maj;
+
+ maj = major(dev);
+ if ((u_int)maj >= nchrdev)
+ return (NULL);
+ cdp = cdevsw[maj];
+ if (cdp == NULL)
+ return (NULL);
+ return ((*cdp->d_devtotty)(dev));
+}
+
+#define SNP_INPUT_BUF 5 /* This is even too much; the maximal
+ * interactive-mode write is 3 bytes
+ * long, for function keys...
+ */
+
+static int
+snpwrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ int unit = minor(dev), len, i, error;
+ struct snoop *snp = &snoopsw[unit];
+ struct tty *tp;
+ char c[SNP_INPUT_BUF];
+
+ if (snp->snp_tty == NULL)
+ return (EIO);
+
+ tp = snp->snp_tty;
+
+ if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) &&
+ (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC))
+ goto tty_input;
+
+ printf("Snoop: attempt to write to bad tty.\n");
+ return (EIO);
+
+tty_input:
+ if (!(tp->t_state & TS_ISOPEN))
+ return (EIO);
+
+ while (uio->uio_resid > 0) {
+ len = MIN(uio->uio_resid,SNP_INPUT_BUF);
+ if ((error = uiomove(c, len, uio)) != 0)
+ return (error);
+ for (i=0;i<len;i++) {
+ if (ttyinput(c[i] , tp))
+ return (EIO);
+ }
+ }
+ return 0;
+
+}
+
+
+static int
+snpread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ int unit = minor(dev), s;
+ struct snoop *snp = &snoopsw[unit];
+ int len, n, nblen, error = 0;
+ caddr_t from;
+ char *nbuf;
+
+ KASSERT(snp->snp_len + snp->snp_base <= snp->snp_blen,
+ ("snoop buffer error"));
+
+ if (snp->snp_tty == NULL)
+ return (EIO);
+
+ snp->snp_flags &= ~SNOOP_RWAIT;
+
+ do {
+ if (snp->snp_len == 0) {
+ if (flag & IO_NDELAY)
+ return (EWOULDBLOCK);
+ snp->snp_flags |= SNOOP_RWAIT;
+			error = tsleep((caddr_t) snp, (PZERO + 1) | PCATCH,
+			    "snoopread", 0);
+			if (error != 0)
+				return (error);
+ }
+ } while (snp->snp_len == 0);
+
+ n = snp->snp_len;
+
+ while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) {
+ len = MIN(uio->uio_resid, snp->snp_len);
+ from = (caddr_t) (snp->snp_buf + snp->snp_base);
+ if (len == 0)
+ break;
+
+ error = uiomove(from, len, uio);
+ snp->snp_base += len;
+ snp->snp_len -= len;
+ }
+ if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) {
+ snp->snp_flags &= ~SNOOP_OFLOW;
+ }
+ s = spltty();
+ nblen = snp->snp_blen;
+ if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) {
+ while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN))
+ nblen = nblen / 2;
+		if ((nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) != NULL) {
+ bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len);
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = nbuf;
+ snp->snp_blen = nblen;
+ snp->snp_base = 0;
+ }
+ }
+ splx(s);
+
+ return error;
+}
+
+int
+snpinc(struct snoop *snp, char c)
+{
+ char buf[1];
+
+ buf[0]=c;
+ return (snpin(snp,buf,1));
+}
+
+
+int
+snpin(snp, buf, n)
+ struct snoop *snp;
+ char *buf;
+ int n;
+{
+ int s_free, s_tail;
+ int s, len, nblen;
+ caddr_t from, to;
+ char *nbuf;
+
+ KASSERT(n >= 0, ("negative snoop char count"));
+
+ if (n == 0)
+ return 0;
+
+#ifdef DIAGNOSTIC
+ if (!(snp->snp_flags & SNOOP_OPEN)) {
+ printf("Snoop: data coming to closed device.\n");
+ return 0;
+ }
+#endif
+ if (snp->snp_flags & SNOOP_DOWN) {
+ printf("Snoop: more data to down interface.\n");
+ return 0;
+ }
+
+ if (snp->snp_flags & SNOOP_OFLOW) {
+ printf("Snoop: buffer overflow.\n");
+ /*
+		 * On overflow we just repeat the standard close
+		 * procedure... yes, this is a waste of space, but the next
+		 * read from the device will then fail, so anyone who recalls
+		 * they are snooping can retry...
+ */
+
+ return (snpdown(snp));
+ }
+ s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base);
+ s_free = snp->snp_blen - snp->snp_len;
+
+
+ if (n > s_free) {
+ s = spltty();
+ nblen = snp->snp_blen;
+ while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) {
+			nblen = nblen * 2;	/* compound, or we never grow enough */
+ s_free = nblen - (snp->snp_len + snp->snp_base);
+ }
+ if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) {
+ bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len);
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = nbuf;
+ snp->snp_blen = nblen;
+ snp->snp_base = 0;
+ } else {
+ snp->snp_flags |= SNOOP_OFLOW;
+ if (snp->snp_flags & SNOOP_RWAIT) {
+ snp->snp_flags &= ~SNOOP_RWAIT;
+ wakeup((caddr_t) snp);
+ }
+ splx(s);
+ return 0;
+ }
+ splx(s);
+ }
+ if (n > s_tail) {
+ from = (caddr_t) (snp->snp_buf + snp->snp_base);
+ to = (caddr_t) (snp->snp_buf);
+ len = snp->snp_len;
+ bcopy(from, to, len);
+ snp->snp_base = 0;
+ }
+ to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len);
+ bcopy(buf, to, n);
+ snp->snp_len += n;
+
+ if (snp->snp_flags & SNOOP_RWAIT) {
+ snp->snp_flags &= ~SNOOP_RWAIT;
+ wakeup((caddr_t) snp);
+ }
+ selwakeup(&snp->snp_sel);
+ snp->snp_sel.si_pid = 0;
+
+ return n;
+}
+
+static int
+snpopen(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ struct snoop *snp;
+ register int unit, error;
+
+	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+ return (error);
+
+ if ((unit = minor(dev)) >= NSNP)
+ return (ENXIO);
+
+ snp = &snoopsw[unit];
+
+ if (snp->snp_flags & SNOOP_OPEN)
+ return (ENXIO);
+
+	/*
+	 * We intentionally do not OR flags with SNOOP_OPEN, but assign
+	 * them outright, so that all previous settings (especially
+	 * SNOOP_OFLOW) are cleared.
+	 */
+ snp->snp_flags = SNOOP_OPEN;
+
+ snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK);
+ snp->snp_blen = SNOOP_MINLEN;
+ snp->snp_base = 0;
+ snp->snp_len = 0;
+
+ /*
+ * snp_tty == NULL is for inactive snoop devices.
+ */
+ snp->snp_tty = NULL;
+ snp->snp_target = -1;
+ return (0);
+}
+
+static int
+snp_detach(snp)
+ struct snoop *snp;
+{
+ struct tty *tp;
+
+ snp->snp_base = 0;
+ snp->snp_len = 0;
+
+	/*
+	 * If the line discipline changed, we do not touch this pointer;
+	 * SLIP/PPP will change it anyway.
+	 */
+
+ if (snp->snp_tty == NULL)
+ goto detach_notty;
+
+ tp = snp->snp_tty;
+
+ if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) &&
+ (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) {
+ tp->t_sc = NULL;
+ tp->t_state &= ~TS_SNOOP;
+ } else
+ printf("Snoop: bad attached tty data.\n");
+
+ snp->snp_tty = NULL;
+ snp->snp_target = -1;
+
+detach_notty:
+ selwakeup(&snp->snp_sel);
+ snp->snp_sel.si_pid = 0;
+
+ return (0);
+}
+
+static int
+snpclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+ register int unit = minor(dev);
+ struct snoop *snp = &snoopsw[unit];
+
+ snp->snp_blen = 0;
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_flags &= ~SNOOP_OPEN;
+
+ return (snp_detach(snp));
+}
+
+int
+snpdown(snp)
+ struct snoop *snp;
+{
+ snp->snp_blen = SNOOP_MINLEN;
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK);
+ snp->snp_flags |= SNOOP_DOWN;
+
+ return (snp_detach(snp));
+}
+
+static int
+snpioctl(dev, cmd, data, flags, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flags;
+ struct proc *p;
+{
+ int unit = minor(dev), s;
+ dev_t tdev;
+ struct snoop *snp = &snoopsw[unit];
+ struct tty *tp, *tpo;
+
+ switch (cmd) {
+ case SNPSTTY:
+ tdev = *((dev_t *) data);
+ if (tdev == -1)
+ return (snpdown(snp));
+
+ tp = snpdevtotty(tdev);
+ if (!tp)
+ return (EINVAL);
+
+ if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP))
+ return (EBUSY);
+
+ if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC))
+ return (EBUSY);
+
+ s = spltty();
+
+ if (snp->snp_target == -1) {
+ tpo = snp->snp_tty;
+ if (tpo)
+ tpo->t_state &= ~TS_SNOOP;
+ }
+
+ tp->t_sc = (caddr_t) snp;
+ tp->t_state |= TS_SNOOP;
+ snp->snp_tty = tp;
+ snp->snp_target = tdev;
+
+		/*
+		 * Clear the overflow and down flags -
+		 * we'll have a chance to pick them up again later :)
+		 */
+ snp->snp_flags &= ~SNOOP_OFLOW;
+ snp->snp_flags &= ~SNOOP_DOWN;
+ splx(s);
+ break;
+
+ case SNPGTTY:
+		/*
+		 * We keep the snp_target field around specifically to make
+		 * SNPGTTY happy; otherwise we could not tell the device
+		 * major/minor of the tty.
+		 */
+ *((dev_t *) data) = snp->snp_target;
+ break;
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ if (*(int *) data)
+ snp->snp_flags |= SNOOP_ASYNC;
+ else
+ snp->snp_flags &= ~SNOOP_ASYNC;
+ break;
+
+ case FIONREAD:
+ s = spltty();
+ if (snp->snp_tty != NULL)
+ *(int *) data = snp->snp_len;
+ else
+ if (snp->snp_flags & SNOOP_DOWN) {
+ if (snp->snp_flags & SNOOP_OFLOW)
+ *(int *) data = SNP_OFLOW;
+ else
+ *(int *) data = SNP_TTYCLOSE;
+ } else {
+ *(int *) data = SNP_DETACH;
+ }
+ splx(s);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+static int
+snppoll(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+ int unit = minor(dev);
+ struct snoop *snp = &snoopsw[unit];
+ int revents = 0;
+
+	/*
+	 * If the snoop device is down, we don't want callers to poll()
+	 * forever, so we report it readable. The caller should then check
+	 * whether we are down via the FIONREAD ioctl(), which returns a
+	 * negative code to indicate the down state.
+	 */
+ if (events & (POLLIN | POLLRDNORM))
+ if (snp->snp_flags & SNOOP_DOWN || snp->snp_len > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(p, &snp->snp_sel);
+
+ return (revents);
+}
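+
+/*
+ * A hypothetical user-level consumer (illustrative only, not part of
+ * this driver) would poll() for data and then use FIONREAD to tell
+ * pending bytes apart from the negative down-state codes:
+ *
+ *	struct pollfd pfd = { snp_fd, POLLIN, 0 };
+ *	int nread;
+ *
+ *	if (poll(&pfd, 1, INFTIM) > 0) {
+ *		ioctl(snp_fd, FIONREAD, &nread);
+ *		if (nread == SNP_OFLOW || nread == SNP_TTYCLOSE ||
+ *		    nread == SNP_DETACH)
+ *			reattach_or_detach();	/* hypothetical helper */
+ *		else
+ *			read(snp_fd, buf, nread);
+ *	}
+ */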
+
+#ifdef DEVFS
+static void *snp_devfs_token[NSNP];
+#endif
+static int snp_devsw_installed;
+
+static void snp_drvinit __P((void *unused));
+static void
+snp_drvinit(unused)
+ void *unused;
+{
+ dev_t dev;
+#ifdef DEVFS
+ int i;
+#endif
+
+	if (!snp_devsw_installed) {
+		dev = makedev(CDEV_MAJOR, 0);
+		cdevsw_add(&dev, &snp_cdevsw, NULL);
+ snp_devsw_installed = 1;
+#ifdef DEVFS
+		for (i = 0; i < NSNP; i++) {
+ snp_devfs_token[i] =
+ devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0,
+ 0600, "snp%d", i);
+ }
+#endif
+ }
+}
+
+SYSINIT(snpdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, snp_drvinit, NULL)
+
+#endif
diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c
new file mode 100644
index 0000000..593d00c
--- /dev/null
+++ b/sys/kern/tty_subr.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $
+ */
+
+/*
+ * clist support routines
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/tty.h>
+#include <sys/clist.h>
+
+static void clist_init __P((void *));
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
+
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc __P((void));
+static void cblock_alloc_cblocks __P((int number));
+static void cblock_free __P((struct cblock *cblockp));
+static void cblock_free_cblocks __P((int number));
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
+{
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount,
+ cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE);
+}
+#endif /* DDB */
+
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
+ /*
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
+ */
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
+}
+
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static __inline struct cblock *
+cblock_alloc()
+{
+ struct cblock *cblockp;
+
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
+}
+
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static __inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
+{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
+
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
+}
+
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
+{
+ int dcbr;
+
+ /*
+ * Allow for wasted space at the head.
+ */
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
+}
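+
+/*
+ * For illustration, a driver's open routine might reserve clist space
+ * along these lines (the queue sizes here are hypothetical):
+ *
+ *	s = spltty();
+ *	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_outq, OBUFSIZ, OBUFSIZ);
+ *	splx(s);
+ */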
+
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
+void
+clist_free_cblocks(clistp)
+ struct clist *clistp;
+{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
+
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
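+
+/*
+ * Sketch of a typical consumption loop: drain a queue one character
+ * at a time, e.g. while feeding an output device.
+ *
+ *	int c;
+ *
+ *	while ((c = getc(&tp->t_outq)) != -1)
+ *		device_putchar(c & 0xff);	/* hypothetical sink */
+ */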
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
+}
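+
+/*
+ * Sketch of a common pattern: move queued characters into a linear
+ * stack buffer for a block-oriented consumer (names are illustrative).
+ *
+ *	char buf[64];
+ *	int nch;
+ *
+ *	nch = q_to_b(&tp->t_outq, buf, sizeof(buf));
+ *	if (nch > 0)
+ *		device_output(buf, nch);	/* hypothetical */
+ */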
+
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist. Return -1 if no
+ * more cblocks are available, or 0 for success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
+
+ /*
+ * If this character is quoted, set the quote bit, if not, clear it.
+ */
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
+ return (0);
+}
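+
+/*
+ * Illustrative use: enqueue a literal (quoted) character so that later
+ * processing will not reinterpret it, and cope with a full queue.
+ *
+ *	if (putc(c | TTY_QUOTE, &tp->t_rawq) == -1)
+ *		handle_overflow();	/* hypothetical: drop or flush */
+ */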
+
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+	u_char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
+
+ /*
+ * Avoid allocating an initial cblock and then not using it.
+	 * c_cc == 0 must imply c_cbcount == 0.
+ */
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((intptr_t)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+		 * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
+
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so by adding one (cblock),
+ * (which makes the pointer 1 beyond the end of this
+ * cblock) we prepare for the assignment of 'prev'
+ * above.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
+}
+
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
+char *
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
+{
+ struct cblock *cblockp;
+
+ ++cp;
+ /*
+ * See if the next character is beyond the end of
+ * the clist.
+ */
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((intptr_t)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((intptr_t)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+		*dst = (u_char)*cp |
+		    (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ?
+		    TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
+}
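+
+/*
+ * Sketch: scan, without consuming, for the end of a line, in the
+ * spirit of canonical-mode lookahead. Note that the first character
+ * is read directly and that nextc() may set TTY_QUOTE in *dst.
+ *
+ *	int c = 0, n = 0;
+ *	char *cp;
+ *
+ *	if (clistp->c_cc > 0) {
+ *		cp = clistp->c_cf;
+ *		c = (u_char)*cp;
+ *		for (n = 1; (c & 0xff) != '\n'; n++)
+ *			if ((cp = nextc(clistp, cp, &c)) == NULL)
+ *				break;
+ *	}
+ */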
+
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
+{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
+
+ /*
+ * If there are no more characters on the list, then
+ * free the last cblock.
+ */
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
+void
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
+{
+ int chr, s;
+
+ s = spltty();
+ /*
+	 * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
+
+ /*
+ * XXX This should probably be optimized to more than one
+ * character at a time.
+ */
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
+}
diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c
new file mode 100644
index 0000000..8f4c84c
--- /dev/null
+++ b/sys/kern/tty_tb.c
@@ -0,0 +1,367 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93
+ * $Id$
+ */
+
+#include "tb.h"
+#if NTB > 0
+
+/*
+ * Line discipline for RS232 tablets;
+ * supplies binary coordinate data.
+ */
+#include <sys/param.h>
+#include <sys/tablet.h>
+#include <sys/tty.h>
+
+/*
+ * Tablet configuration table.
+ */
+struct tbconf {
+ short tbc_recsize; /* input record size in bytes */
+ short tbc_uiosize; /* size of data record returned user */
+ int tbc_sync; /* mask for finding sync byte/bit */
+	int	(*tbc_decode)();	/* decoding routine */
+ char *tbc_run; /* enter run mode sequence */
+ char *tbc_point; /* enter point mode sequence */
+ char *tbc_stop; /* stop sequence */
+ char *tbc_start; /* start/restart sequence */
+ int tbc_flags;
+#define TBF_POL 0x1 /* polhemus hack */
+#define TBF_INPROX 0x2 /* tablet has proximity info */
+};
+
+static int tbdecode(), gtcodecode(), poldecode();
+static int tblresdecode(), tbhresdecode();
+
+struct tbconf tbconf[TBTYPE] = {
+{ 0 },
+{ 5, sizeof (struct tbpos), 0200, tbdecode, "6", "4" },
+{ 5, sizeof (struct tbpos), 0200, tbdecode, "\1CN", "\1RT", "\2", "\4" },
+{ 8, sizeof (struct gtcopos), 0200, gtcodecode },
+{17, sizeof (struct polpos), 0200, poldecode, 0, 0, "\21", "\5\22\2\23",
+ TBF_POL },
+{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CN", "\1PT", "\2", "\4",
+ TBF_INPROX },
+{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CN", "\1PT", "\2", "\4",
+ TBF_INPROX },
+{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CL\33", "\1PT\33", 0, 0},
+{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CL\33", "\1PT\33", 0, 0},
+};
+
+/*
+ * Tablet state
+ */
+struct tb {
+ int tbflags; /* mode & type bits */
+#define TBMAXREC 17 /* max input record size */
+ char cbuf[TBMAXREC]; /* input buffer */
+ union {
+ struct tbpos tbpos;
+ struct gtcopos gtcopos;
+ struct polpos polpos;
+ } rets; /* processed state */
+#define NTBS 16
+} tb[NTBS];
+
+/*
+ * Open as tablet discipline; called on discipline change.
+ */
+/*ARGSUSED*/
+tbopen(dev, tp)
+ dev_t dev;
+ register struct tty *tp;
+{
+ register struct tb *tbp;
+
+ if (tp->t_line == TABLDISC)
+ return (ENODEV);
+ ttywflush(tp);
+ for (tbp = tb; tbp < &tb[NTBS]; tbp++)
+ if (tbp->tbflags == 0)
+ break;
+ if (tbp >= &tb[NTBS])
+ return (EBUSY);
+ tbp->tbflags = TBTIGER|TBPOINT; /* default */
+ tp->t_cp = tbp->cbuf;
+ tp->t_inbuf = 0;
+ bzero((caddr_t)&tbp->rets, sizeof (tbp->rets));
+ tp->T_LINEP = (caddr_t)tbp;
+ tp->t_flags |= LITOUT;
+ return (0);
+}
+
+/*
+ * Line discipline change or last device close.
+ */
+tbclose(tp)
+ register struct tty *tp;
+{
+ register int s;
+ int modebits = TBPOINT|TBSTOP;
+
+ tbioctl(tp, BIOSMODE, &modebits, 0);
+ s = spltty();
+ ((struct tb *)tp->T_LINEP)->tbflags = 0;
+ tp->t_cp = 0;
+ tp->t_inbuf = 0;
+ tp->t_rawq.c_cc = 0; /* clear queues -- paranoid */
+ tp->t_canq.c_cc = 0;
+ tp->t_line = 0; /* paranoid: avoid races */
+ splx(s);
+}
+
+/*
+ * Read from a tablet line.
+ * Characters have been buffered in a buffer and decoded.
+ */
+tbread(tp, uio)
+ register struct tty *tp;
+ struct uio *uio;
+{
+ register struct tb *tbp = (struct tb *)tp->T_LINEP;
+ register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE];
+ int ret;
+
+ if ((tp->t_state&TS_CARR_ON) == 0)
+ return (EIO);
+ ret = uiomove(&tbp->rets, tc->tbc_uiosize, uio);
+ if (tc->tbc_flags&TBF_POL)
+ tbp->rets.polpos.p_key = ' ';
+ return (ret);
+}
+
+/*
+ * Low level character input routine.
+ * Stuff the character in the buffer, and decode
+ * if all the chars are there.
+ *
+ * This routine could be expanded in-line in the receiver
+ * interrupt routine to make it run as fast as possible.
+ */
+tbinput(c, tp)
+ register int c;
+ register struct tty *tp;
+{
+ register struct tb *tbp = (struct tb *)tp->T_LINEP;
+ register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE];
+
+ if (tc->tbc_recsize == 0 || tc->tbc_decode == 0) /* paranoid? */
+ return;
+ /*
+ * Locate sync bit/byte or reset input buffer.
+ */
+ if (c&tc->tbc_sync || tp->t_inbuf == tc->tbc_recsize) {
+ tp->t_cp = tbp->cbuf;
+ tp->t_inbuf = 0;
+ }
+ *tp->t_cp++ = c&0177;
+ /*
+ * Call decode routine only if a full record has been collected.
+ */
+ if (++tp->t_inbuf == tc->tbc_recsize)
+ (*tc->tbc_decode)(tc, tbp->cbuf, &tbp->rets);
+}
+
+/*
+ * Decode GTCO 8-byte format (high res, tilt, and pressure).
+ */
+static
+gtcodecode(tc, cp, tbpos)
+ struct tbconf *tc;
+ register char *cp;
+ register struct gtcopos *tbpos;
+{
+
+ tbpos->pressure = *cp >> 2;
+ tbpos->status = (tbpos->pressure > 16) | TBINPROX; /* half way down */
+ tbpos->xpos = (*cp++ & 03) << 14;
+ tbpos->xpos |= *cp++ << 7;
+ tbpos->xpos |= *cp++;
+ tbpos->ypos = (*cp++ & 03) << 14;
+ tbpos->ypos |= *cp++ << 7;
+ tbpos->ypos |= *cp++;
+ tbpos->xtilt = *cp++;
+ tbpos->ytilt = *cp++;
+ tbpos->scount++;
+}
+
+/*
+ * Decode old Hitachi 5-byte format (low res).
+ */
+static
+tbdecode(tc, cp, tbpos)
+ struct tbconf *tc;
+ register char *cp;
+ register struct tbpos *tbpos;
+{
+ register char byte;
+
+ byte = *cp++;
+ tbpos->status = (byte&0100) ? TBINPROX : 0;
+ byte &= ~0100;
+ if (byte > 036)
+ tbpos->status |= 1 << ((byte-040)/2);
+ tbpos->xpos = *cp++ << 7;
+ tbpos->xpos |= *cp++;
+ if (tbpos->xpos < 256) /* tablet wraps around at 256 */
+ tbpos->status &= ~TBINPROX; /* make it out of proximity */
+ tbpos->ypos = *cp++ << 7;
+ tbpos->ypos |= *cp++;
+ tbpos->scount++;
+}
+
+/*
+ * Decode new Hitachi 5-byte format (low res).
+ */
+static
+tblresdecode(tc, cp, tbpos)
+ struct tbconf *tc;
+ register char *cp;
+ register struct tbpos *tbpos;
+{
+
+ *cp &= ~0100; /* mask sync bit */
+ tbpos->status = (*cp++ >> 2) | TBINPROX;
+ if (tc->tbc_flags&TBF_INPROX && tbpos->status&020)
+ tbpos->status &= ~(020|TBINPROX);
+ tbpos->xpos = *cp++;
+ tbpos->xpos |= *cp++ << 6;
+ tbpos->ypos = *cp++;
+ tbpos->ypos |= *cp++ << 6;
+ tbpos->scount++;
+}
+
+/*
+ * Decode new Hitachi 6-byte format (high res).
+ */
+static
+tbhresdecode(tc, cp, tbpos)
+ struct tbconf *tc;
+ register char *cp;
+ register struct tbpos *tbpos;
+{
+ char byte;
+
+ byte = *cp++;
+ tbpos->xpos = (byte & 03) << 14;
+ tbpos->xpos |= *cp++ << 7;
+ tbpos->xpos |= *cp++;
+ tbpos->ypos = *cp++ << 14;
+ tbpos->ypos |= *cp++ << 7;
+ tbpos->ypos |= *cp++;
+ tbpos->status = (byte >> 2) | TBINPROX;
+ if (tc->tbc_flags&TBF_INPROX && tbpos->status&020)
+ tbpos->status &= ~(020|TBINPROX);
+ tbpos->scount++;
+}
+
+/*
+ * Polhemus decode.
+ */
+static
+poldecode(tc, cp, polpos)
+ struct tbconf *tc;
+ register char *cp;
+ register struct polpos *polpos;
+{
+
+ polpos->p_x = cp[4] | cp[3]<<7 | (cp[9] & 0x03) << 14;
+ polpos->p_y = cp[6] | cp[5]<<7 | (cp[9] & 0x0c) << 12;
+ polpos->p_z = cp[8] | cp[7]<<7 | (cp[9] & 0x30) << 10;
+ polpos->p_azi = cp[11] | cp[10]<<7 | (cp[16] & 0x03) << 14;
+ polpos->p_pit = cp[13] | cp[12]<<7 | (cp[16] & 0x0c) << 12;
+ polpos->p_rol = cp[15] | cp[14]<<7 | (cp[16] & 0x30) << 10;
+ polpos->p_stat = cp[1] | cp[0]<<7;
+ if (cp[2] != ' ')
+ polpos->p_key = cp[2];
+}
+
+/*ARGSUSED*/
+tbioctl(tp, cmd, data, flag)
+ struct tty *tp;
+ caddr_t data;
+{
+ register struct tb *tbp = (struct tb *)tp->T_LINEP;
+
+ switch (cmd) {
+
+ case BIOGMODE:
+ *(int *)data = tbp->tbflags & TBMODE;
+ break;
+
+ case BIOSTYPE:
+ if (tbconf[*(int *)data & TBTYPE].tbc_recsize == 0 ||
+ tbconf[*(int *)data & TBTYPE].tbc_decode == 0)
+ return (EINVAL);
+ tbp->tbflags &= ~TBTYPE;
+ tbp->tbflags |= *(int *)data & TBTYPE;
+ /* fall thru... to set mode bits */
+
+ case BIOSMODE: {
+ register struct tbconf *tc;
+
+ tbp->tbflags &= ~TBMODE;
+ tbp->tbflags |= *(int *)data & TBMODE;
+ tc = &tbconf[tbp->tbflags & TBTYPE];
+ if (tbp->tbflags&TBSTOP) {
+ if (tc->tbc_stop)
+ ttyout(tc->tbc_stop, tp);
+ } else if (tc->tbc_start)
+ ttyout(tc->tbc_start, tp);
+ if (tbp->tbflags&TBPOINT) {
+ if (tc->tbc_point)
+ ttyout(tc->tbc_point, tp);
+ } else if (tc->tbc_run)
+ ttyout(tc->tbc_run, tp);
+ ttstart(tp);
+ break;
+ }
+
+ case BIOGTYPE:
+ *(int *)data = tbp->tbflags & TBTYPE;
+ break;
+
+ case TIOCSETD:
+ case TIOCGETD:
+ case TIOCGETP:
+ case TIOCGETC:
+ return (-1); /* pass thru... */
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+#endif
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
new file mode 100644
index 0000000..889c935
--- /dev/null
+++ b/sys/kern/tty_tty.c
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93
+ * $Id: tty_tty.c,v 1.24 1998/06/07 17:11:44 dfr Exp $
+ */
+
+/*
+ * Indirect driver for controlling tty.
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/ttycom.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+static d_open_t cttyopen;
+static d_read_t cttyread;
+static d_write_t cttywrite;
+static d_ioctl_t cttyioctl;
+static d_poll_t cttypoll;
+
+#define CDEV_MAJOR 1
+/* Don't make this static, since fdesc_vnops uses it. */
+struct cdevsw ctty_cdevsw = {
+ cttyopen, nullclose, cttyread, cttywrite,
+ cttyioctl, nullstop, nullreset, nodevtotty,
+ cttypoll, nommap, NULL, "ctty",
+ NULL, -1, nodump, nopsize,
+ D_TTY,
+};
+
+#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL)
+
+/*ARGSUSED*/
+static int
+cttyopen(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ struct vnode *ttyvp = cttyvp(p);
+ int error;
+
+ if (ttyvp == NULL)
+ return (ENXIO);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p);
+#ifdef PARANOID
+ /*
+ * Since group is tty and mode is 620 on most terminal lines
+ * and since sessions protect terminals from processes outside
+ * your session, this check is probably no longer necessary.
+ * Since it inhibits setuid root programs that later switch
+ * to another user from accessing /dev/tty, we have decided
+ * to delete this test. (mckusick 5/93)
+ */
+ error = VOP_ACCESS(ttyvp,
+ (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? VWRITE : 0), p->p_ucred, p);
+ if (!error)
+#endif /* PARANOID */
+ error = VOP_OPEN(ttyvp, flag, NOCRED, p);
+ VOP_UNLOCK(ttyvp, 0, p);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttyread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ struct proc *p = uio->uio_procp;
+ register struct vnode *ttyvp = cttyvp(p);
+ int error;
+
+ if (ttyvp == NULL)
+ return (EIO);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p);
+ error = VOP_READ(ttyvp, uio, flag, NOCRED);
+ VOP_UNLOCK(ttyvp, 0, p);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttywrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+	struct proc *p = uio->uio_procp;
+	struct vnode *ttyvp = cttyvp(p);
+ int error;
+
+ if (ttyvp == NULL)
+ return (EIO);
+ vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p);
+ error = VOP_WRITE(ttyvp, uio, flag, NOCRED);
+ VOP_UNLOCK(ttyvp, 0, p);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+cttyioctl(dev, cmd, addr, flag, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t addr;
+ int flag;
+ struct proc *p;
+{
+ struct vnode *ttyvp = cttyvp(p);
+
+ if (ttyvp == NULL)
+ return (EIO);
+ if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */
+ return EINVAL; /* to controlling tty -- infinite recursion */
+ if (cmd == TIOCNOTTY) {
+ if (!SESS_LEADER(p)) {
+ p->p_flag &= ~P_CONTROLT;
+ return (0);
+ } else
+ return (EINVAL);
+ }
+ return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p));
+}
+
+/*ARGSUSED*/
+static int
+cttypoll(dev, events, p)
+ dev_t dev;
+ int events;
+ struct proc *p;
+{
+ struct vnode *ttyvp = cttyvp(p);
+
+ if (ttyvp == NULL)
+ /* try operation to get EOF/failure */
+ return (seltrue(dev, events, p));
+ return (VOP_POLL(ttyvp, events, p->p_ucred, p));
+}
+
+static int ctty_devsw_installed;
+#ifdef DEVFS
+static void *ctty_devfs_token;
+#endif
+
+static void ctty_drvinit __P((void *unused));
+static void
+ctty_drvinit(unused)
+ void *unused;
+{
+ dev_t dev;
+
+	if (!ctty_devsw_installed) {
+		dev = makedev(CDEV_MAJOR, 0);
+		cdevsw_add(&dev, &ctty_cdevsw, NULL);
+ ctty_devsw_installed = 1;
+#ifdef DEVFS
+ ctty_devfs_token =
+ devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0,
+ 0666, "tty");
+#endif
+ }
+}
+
+SYSINIT(cttydev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, ctty_drvinit, NULL)
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 0000000..929da87
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
+ * $Id: uipc_domain.c,v 1.19 1998/05/15 20:11:29 wollman Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socketvar.h>
+#include <sys/systm.h>
+#include <vm/vm_zone.h>
+
+/*
+ * System initialization
+ *
+ * Note: domain initialization wants to take place on a per-domain basis
+ * as a result of traversing a linker set. Most likely, each domain
+ * will want to call a registration function rather than being handled
+ * here in domaininit(). Probably this will look like:
+ *
+ * SYSINIT(unique, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, domain_add, xxx)
+ *
+ * Where 'xxx' is replaced by the address of a parameter struct to be
+ * passed to the domain_add() function.
+ */
+
+static int x_save_spl;	/* used by kludge */
+static void kludge_splimp __P((void *));
+static void kludge_splx __P((void *));
+static void domaininit __P((void *));
+SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl)
+SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL)
+SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl)
+
+static void pffasttimo __P((void *));
+static void pfslowtimo __P((void *));
+
+struct domain *domains;
+
+/*
+ * Initialize a protocol domain and the protocols it contains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+static int
+net_init_domain(struct domain *dp)
+{
+ register struct protosw *pr;
+ int s;
+
+ s = splnet();
+ if (dp->dom_init)
+ (*dp->dom_init)();
+	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if (pr->pr_usrreqs == 0)
+ panic("domaininit: %ssw[%d] has no usrreqs!",
+ dp->dom_name,
+ (int)(pr - dp->dom_protosw));
+ if (pr->pr_init)
+ (*pr->pr_init)();
+ }
+	/*
+	 * Update global information about maximums.
+	 */
+ max_hdr = max_linkhdr + max_protohdr;
+ max_datalen = MHLEN - max_hdr;
+ splx(s);
+ return (0);
+}
+
+/*
+ * Add a new protocol domain to the list of supported domains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+int
+net_add_domain(struct domain *dp)
+{
+ int s, error;
+
+ s = splnet();
+ dp->dom_next = domains;
+ domains = dp;
+ splx(s);
+ error = net_init_domain(dp);
+ max_hdr = max_linkhdr + max_protohdr;
+ max_datalen = MHLEN - max_hdr;
+ return (error);
+}
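+
+/*
+ * A dynamically configured protocol family could register itself with
+ * something like the following (names are illustrative):
+ *
+ *	static struct domain mydomain = { ... };  /* family, name, protosw */
+ *
+ *	net_add_domain(&mydomain);
+ */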
+
+extern struct linker_set domain_set;
+
+/* ARGSUSED*/
+static void
+domaininit(void *dummy)
+{
+ register struct domain *dp, **dpp;
+ /*
+ * Before we do any setup, make sure to initialize the
+ * zone allocator we get struct sockets from. The obvious
+ * maximum number of sockets is `maxfiles', but it is possible
+ * to have a socket without an open file (e.g., a connection waiting
+ * to be accept(2)ed). Rather than think up and define a
+ * better value, we just use nmbclusters, since that's what people
+ * are told to increase first when the network runs out of memory.
+ * Perhaps we should have two pools, one of unlimited size
+ * for use during socreate(), and one ZONE_INTERRUPT pool for
+ * use in sonewconn().
+ */
+ socket_zone = zinit("socket", sizeof(struct socket), maxsockets,
+ ZONE_INTERRUPT, 0);
+
+ if (max_linkhdr < 16) /* XXX */
+ max_linkhdr = 16;
+
+ /*
+ * NB - local domain is always present.
+ */
+ net_add_domain(&localdomain);
+
+ /*
+ * gather up as many protocols as we have statically linked.
+ * XXX we need to do this because when we ask the routing
+ * protocol to initialise it will want to examine all
+ * installed protocols. This needs fixing before protocols
+ * that use the standard routing can become modules.
+ */
+ for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) {
+ (**dpp).dom_next = domains;
+ domains = *dpp;
+ }
+
+ /*
+ * Now ask them all to init (XXX including the routing domain,
+ * see above)
+ */
+ for (dp = domains; dp; dp = dp->dom_next)
+ net_init_domain(dp);
+
+ timeout(pffasttimo, (void *)0, 1);
+ timeout(pfslowtimo, (void *)0, 1);
+}
+
+/*
+ * The following two operations are kludge code. Most likely, they should
+ * be done as a "domainpreinit()" for the first function and then rolled
+ * in as the last act of "domaininit()" for the second.
+ *
+ * In point of fact, it is questionable why other initialization prior
+ * to this does not also take place at splimp by default.
+ */
+static void
+kludge_splimp(udata)
+ void *udata;
+{
+ int *savesplp = udata;
+
+ *savesplp = splimp();
+}
+
+static void
+kludge_splx(udata)
+ void *udata;
+{
+ int *savesplp = udata;
+
+ splx(*savesplp);
+}
+
+struct protosw *
+pffindtype(family, type)
+ int family;
+ int type;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ goto found;
+ return (0);
+found:
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_type && pr->pr_type == type)
+ return (pr);
+ return (0);
+}
+
+struct protosw *
+pffindproto(family, protocol, type)
+ int family;
+ int protocol;
+ int type;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+ struct protosw *maybe = 0;
+
+ if (family == 0)
+ return (0);
+ for (dp = domains; dp; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ goto found;
+ return (0);
+found:
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
+ return (pr);
+
+ if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
+ pr->pr_protocol == 0 && maybe == (struct protosw *)0)
+ maybe = pr;
+ }
+ return (maybe);
+}
+
+void
+pfctlinput(cmd, sa)
+ int cmd;
+ struct sockaddr *sa;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, (void *)0);
+}
+
+static void
+pfslowtimo(arg)
+ void *arg;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_slowtimo)
+ (*pr->pr_slowtimo)();
+ timeout(pfslowtimo, (void *)0, hz/2);
+}
+
+static void
+pffasttimo(arg)
+ void *arg;
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_fasttimo)
+ (*pr->pr_fasttimo)();
+ timeout(pffasttimo, (void *)0, hz/5);
+}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
new file mode 100644
index 0000000..09ddd23
--- /dev/null
+++ b/sys/kern/uipc_mbuf.c
@@ -0,0 +1,945 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
+ * $Id: uipc_mbuf.c,v 1.36 1998/07/03 08:36:48 phk Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+static void mbinit __P((void *));
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
+
+struct mbuf *mbutl;
+char *mclrefcnt;
+struct mbstat mbstat;
+struct mbuf *mmbfree;
+union mcluster *mclfree;
+int max_linkhdr;
+int max_protohdr;
+int max_hdr;
+int max_datalen;
+
+SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
+ &max_linkhdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
+ &max_protohdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
+ &max_datalen, 0, "");
+SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
+
+static void m_reclaim __P((void));
+
+/* "number of clusters of pages" */
+#define NCL_INIT 1
+
+#define NMB_INIT 16
+
+/* ARGSUSED*/
+static void
+mbinit(dummy)
+ void *dummy;
+{
+ int s;
+
+	mmbfree = NULL;
+	mclfree = NULL;
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+
+ s = splimp();
+ if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
+ goto bad;
+#if MCLBYTES <= PAGE_SIZE
+ if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
+ goto bad;
+#else
+ /* It's OK to call contigmalloc in this context. */
+ if (m_clalloc(16, M_WAIT) == 0)
+ goto bad;
+#endif
+ splx(s);
+ return;
+bad:
+ panic("mbinit");
+}
+
+/*
+ * Allocate at least nmb mbufs and place on mbuf free list.
+ * Must be called at splimp.
+ */
+/* ARGSUSED */
+int
+m_mballoc(nmb, how)
+ register int nmb;
+ int how;
+{
+ register caddr_t p;
+ register int i;
+ int nbytes;
+
+	/*
+	 * Once we run out of map space, it will be impossible to get
+	 * any more (nothing is ever freed back to the map) (XXX which
+	 * is dumb). However, all is not lost: m_reclaim may still be
+	 * able to free a substantial amount of space.
+	 */
+ if (mb_map_full)
+ return (0);
+
+ nbytes = round_page(nmb * MSIZE);
+ p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT);
+ if (p == 0 && how == M_WAIT) {
+ mbstat.m_wait++;
+ p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK);
+ }
+
+ /*
+ * Either the map is now full, or `how' is M_NOWAIT and there
+ * are no pages left.
+ */
+ if (p == NULL)
+ return (0);
+
+ nmb = nbytes / MSIZE;
+ for (i = 0; i < nmb; i++) {
+ ((struct mbuf *)p)->m_next = mmbfree;
+ mmbfree = (struct mbuf *)p;
+ p += MSIZE;
+ }
+ mbstat.m_mbufs += nmb;
+ return (1);
+}
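+
+/*
+ * Callers are expected to hold splimp() around the allocation, e.g.
+ * (sketch):
+ *
+ *	s = splimp();
+ *	if (m_mballoc(32, M_DONTWAIT) == 0)
+ *		handle_shortage();	/* hypothetical fallback */
+ *	splx(s);
+ */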
+
+#if MCLBYTES > PAGE_SIZE
+static int i_want_my_mcl;
+
+static void
+kproc_mclalloc(void)
+{
+ int status;
+
+ while (1) {
+ tsleep(&i_want_my_mcl, PVM, "mclalloc", 0);
+
+ for (; i_want_my_mcl; i_want_my_mcl--) {
+ if (m_clalloc(1, M_WAIT) == 0)
+ printf("m_clalloc failed even in process context!\n");
+ }
+ }
+}
+
+static struct proc *mclallocproc;
+static struct kproc_desc mclalloc_kp = {
+ "mclalloc",
+ kproc_mclalloc,
+ &mclallocproc
+};
+SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
+ &mclalloc_kp);
+#endif
+
+/*
+ * Allocate some number of mbuf clusters
+ * and place on cluster free list.
+ * Must be called at splimp.
+ */
+/* ARGSUSED */
+int
+m_clalloc(ncl, how)
+ register int ncl;
+ int how;
+{
+ register caddr_t p;
+ register int i;
+ int npg;
+
+ /*
+ * Once we run out of map space, it will be impossible
+ * to get any more (nothing is ever freed back to the
+ * map).
+ */
+ if (mb_map_full) {
+ mbstat.m_drops++;
+ return (0);
+ }
+
+#if MCLBYTES > PAGE_SIZE
+ if (how != M_WAIT) {
+ i_want_my_mcl += ncl;
+ wakeup(&i_want_my_mcl);
+ mbstat.m_wait++;
+ p = 0;
+ } else {
+ p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul,
+ ~0ul, PAGE_SIZE, 0, mb_map);
+ }
+#else
+ npg = ncl;
+ p = (caddr_t)kmem_malloc(mb_map, ctob(npg),
+ how != M_WAIT ? M_NOWAIT : M_WAITOK);
+ ncl = ncl * PAGE_SIZE / MCLBYTES;
+#endif
+ /*
+ * Either the map is now full, or `how' is M_NOWAIT and there
+ * are no pages left.
+ */
+ if (p == NULL) {
+ mbstat.m_drops++;
+ return (0);
+ }
+
+ for (i = 0; i < ncl; i++) {
+ ((union mcluster *)p)->mcl_next = mclfree;
+ mclfree = (union mcluster *)p;
+ p += MCLBYTES;
+ mbstat.m_clfree++;
+ }
+ mbstat.m_clusters += ncl;
+ return (1);
+}
+
+/*
+ * When MGET fails, ask protocols to free space when short of memory,
+ * then re-attempt to allocate an mbuf.
+ */
+struct mbuf *
+m_retry(i, t)
+ int i, t;
+{
+ register struct mbuf *m;
+
+ /*
+ * Must only do the reclaim if not in an interrupt context.
+ */
+ if (i == M_WAIT)
+ m_reclaim();
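+	/*
+	 * MGET's failure path normally calls m_retry() itself; while
+	 * expanding MGET here, temporarily define m_retry to a null
+	 * mbuf so a second failure yields NULL instead of recursing.
+	 */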
+#define m_retry(i, t) (struct mbuf *)0
+ MGET(m, i, t);
+#undef m_retry
+ if (m != NULL) {
+ mbstat.m_wait++;
+ } else {
+ if (i == M_DONTWAIT)
+ mbstat.m_drops++;
+ else
+ panic("Out of mbuf clusters");
+ }
+ return (m);
+}
+
+/*
+ * As above; retry an MGETHDR.
+ */
+struct mbuf *
+m_retryhdr(i, t)
+ int i, t;
+{
+ register struct mbuf *m;
+
+ /*
+ * Must only do the reclaim if not in an interrupt context.
+ */
+ if (i == M_WAIT)
+ m_reclaim();
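+	/*
+	 * As in m_retry(): mask the recursive call MGETHDR would
+	 * otherwise make on failure, so we get back NULL instead.
+	 */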
+#define m_retryhdr(i, t) (struct mbuf *)0
+ MGETHDR(m, i, t);
+#undef m_retryhdr
+ if (m != NULL) {
+ mbstat.m_wait++;
+ } else {
+ if (i == M_DONTWAIT)
+ mbstat.m_drops++;
+ else
+ panic("Out of mbuf clusters");
+ }
+ return (m);
+}
+
+static void
+m_reclaim()
+{
+ register struct domain *dp;
+ register struct protosw *pr;
+ int s = splimp();
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain)
+ (*pr->pr_drain)();
+ splx(s);
+ mbstat.m_drain++;
+}
+
+/*
+ * Space allocation routines.
+ * These are also available as macros
+ * for critical paths.
+ */
+struct mbuf *
+m_get(how, type)
+ int how, type;
+{
+ register struct mbuf *m;
+
+ MGET(m, how, type);
+ return (m);
+}
+
+struct mbuf *
+m_gethdr(how, type)
+ int how, type;
+{
+ register struct mbuf *m;
+
+ MGETHDR(m, how, type);
+ return (m);
+}
+
+struct mbuf *
+m_getclr(how, type)
+ int how, type;
+{
+ register struct mbuf *m;
+
+ MGET(m, how, type);
+ if (m == 0)
+ return (0);
+ bzero(mtod(m, caddr_t), MLEN);
+ return (m);
+}
+
+struct mbuf *
+m_free(m)
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ MFREE(m, n);
+ return (n);
+}
+
+void
+m_freem(m)
+ register struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == NULL)
+ return;
+ do {
+ MFREE(m, n);
+ m = n;
+ } while (m);
+}
+
+/*
+ * Mbuffer utility routines.
+ */
+
+/*
+ * Lesser-used path for M_PREPEND:
+ * allocate new mbuf to prepend to chain,
+ * copy junk along.
+ */
+struct mbuf *
+m_prepend(m, len, how)
+ register struct mbuf *m;
+ int len, how;
+{
+ struct mbuf *mn;
+
+ MGET(mn, how, m->m_type);
+ if (mn == (struct mbuf *)NULL) {
+ m_freem(m);
+ return ((struct mbuf *)NULL);
+ }
+ if (m->m_flags & M_PKTHDR) {
+ M_COPY_PKTHDR(mn, m);
+ m->m_flags &= ~M_PKTHDR;
+ }
+ mn->m_next = m;
+ m = mn;
+ if (len < MHLEN)
+ MH_ALIGN(m, len);
+ m->m_len = len;
+ return (m);
+}
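+
+/*
+ * Callers normally go through the M_PREPEND() macro, which uses
+ * leading space in the first mbuf when available and falls back to
+ * this routine otherwise. A typical sketch ("struct myhdr" is
+ * hypothetical):
+ *
+ *	M_PREPEND(m, sizeof(struct myhdr), M_DONTWAIT);
+ *	if (m == NULL)
+ *		return (ENOBUFS);
+ */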
+
+/*
+ * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
+ * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
+ * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
+ */
+#define MCFail (mbstat.m_mcfail)
+
+struct mbuf *
+m_copym(m, off0, len, wait)
+ register struct mbuf *m;
+ int off0, wait;
+ register int len;
+{
+ register struct mbuf *n, **np;
+ register int off = off0;
+ struct mbuf *top;
+ int copyhdr = 0;
+
+ if (off < 0 || len < 0)
+ panic("m_copym");
+ if (off == 0 && m->m_flags & M_PKTHDR)
+ copyhdr = 1;
+ while (off > 0) {
+ if (m == 0)
+ panic("m_copym");
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ np = &top;
+ top = 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (len != M_COPYALL)
+ panic("m_copym");
+ break;
+ }
+ MGET(n, wait, m->m_type);
+ *np = n;
+ if (n == 0)
+ goto nospace;
+ if (copyhdr) {
+ M_COPY_PKTHDR(n, m);
+ if (len == M_COPYALL)
+ n->m_pkthdr.len -= off0;
+ else
+ n->m_pkthdr.len = len;
+ copyhdr = 0;
+ }
+ n->m_len = min(len, m->m_len - off);
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + off;
+ if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ } else
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (unsigned)n->m_len);
+ if (len != M_COPYALL)
+ len -= n->m_len;
+ off = 0;
+ m = m->m_next;
+ np = &n->m_next;
+ }
+ if (top == 0)
+ MCFail++;
+ return (top);
+nospace:
+ m_freem(top);
+ MCFail++;
+ return (0);
+}
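+
+/*
+ * Usage sketch (hypothetical reliable protocol): data awaiting
+ * acknowledgement stays queued in the socket buffer, so transmission
+ * copies rather than consumes it; note that cluster data is shared
+ * by reference, not duplicated:
+ *
+ *	n = m_copym(m, off, len, M_DONTWAIT);
+ *	if (n == 0)
+ *		return (ENOBUFS);
+ */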
+
+/*
+ * Copy an entire packet, including header (which must be present).
+ * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
+ */
+struct mbuf *
+m_copypacket(m, how)
+ struct mbuf *m;
+ int how;
+{
+ struct mbuf *top, *n, *o;
+
+ MGET(n, how, m->m_type);
+ top = n;
+ if (!n)
+ goto nospace;
+
+ M_COPY_PKTHDR(n, m);
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ while (m) {
+ MGET(o, how, m->m_type);
+ if (!o)
+ goto nospace;
+
+ n->m_next = o;
+ n = n->m_next;
+
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ }
+ return top;
+nospace:
+ m_freem(top);
+ MCFail++;
+ return 0;
+}
+
+/*
+ * Copy data from an mbuf chain starting "off" bytes from the beginning,
+ * continuing for "len" bytes, into the indicated buffer.
+ */
+void
+m_copydata(m, off, len, cp)
+ register struct mbuf *m;
+ register int off;
+ register int len;
+ caddr_t cp;
+{
+ register unsigned count;
+
+ if (off < 0 || len < 0)
+ panic("m_copydata");
+ while (off > 0) {
+ if (m == 0)
+ panic("m_copydata");
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ if (m == 0)
+ panic("m_copydata");
+ count = min(m->m_len - off, len);
+ bcopy(mtod(m, caddr_t) + off, cp, count);
+ len -= count;
+ cp += count;
+ off = 0;
+ m = m->m_next;
+ }
+}
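+
+/*
+ * Usage sketch: pull a fixed-size header that may span several mbufs
+ * into a local structure without reshaping the chain (struct hdr is
+ * illustrative):
+ *
+ *	struct hdr h;
+ *
+ *	m_copydata(m, 0, sizeof(h), (caddr_t)&h);
+ */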
+
+/*
+ * Concatenate mbuf chain n to m.
+ * Both chains must be of the same type (e.g. MT_DATA).
+ * The m_pkthdr of m, if present, is not updated.
+ */
+void
+m_cat(m, n)
+ register struct mbuf *m, *n;
+{
+ while (m->m_next)
+ m = m->m_next;
+ while (n) {
+ if (m->m_flags & M_EXT ||
+ m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
+ /* just join the two chains */
+ m->m_next = n;
+ return;
+ }
+ /* splat the data from one into the other */
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)n->m_len);
+ m->m_len += n->m_len;
+ n = m_free(n);
+ }
+}
+
+void
+m_adj(mp, req_len)
+ struct mbuf *mp;
+ int req_len;
+{
+ register int len = req_len;
+ register struct mbuf *m;
+ register int count;
+
+ if ((m = mp) == NULL)
+ return;
+ if (len >= 0) {
+ /*
+ * Trim from head.
+ */
+ while (m != NULL && len > 0) {
+ if (m->m_len <= len) {
+ len -= m->m_len;
+ m->m_len = 0;
+ m = m->m_next;
+ } else {
+ m->m_len -= len;
+ m->m_data += len;
+ len = 0;
+ }
+ }
+ m = mp;
+ if (mp->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= (req_len - len);
+ } else {
+ /*
+ * Trim from tail. Scan the mbuf chain,
+ * calculating its length and finding the last mbuf.
+ * If the adjustment only affects this mbuf, then just
+ * adjust and return. Otherwise, rescan and truncate
+ * after the remaining size.
+ */
+ len = -len;
+ count = 0;
+ for (;;) {
+ count += m->m_len;
+ if (m->m_next == (struct mbuf *)0)
+ break;
+ m = m->m_next;
+ }
+ if (m->m_len >= len) {
+ m->m_len -= len;
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= len;
+ return;
+ }
+ count -= len;
+ if (count < 0)
+ count = 0;
+ /*
+ * Correct length for chain is "count".
+ * Find the mbuf with last data, adjust its length,
+ * and toss data from remaining mbufs on chain.
+ */
+ m = mp;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len = count;
+ for (; m; m = m->m_next) {
+ if (m->m_len >= count) {
+ m->m_len = count;
+ break;
+ }
+ count -= m->m_len;
+ }
+ while (m->m_next)
+ (m = m->m_next)->m_len = 0;
+ }
+}
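+
+/*
+ * Usage sketch: a positive request trims from the head of the chain,
+ * a negative one from the tail (hdrlen and padlen are illustrative):
+ *
+ *	m_adj(m, hdrlen);	strip a header already processed
+ *	m_adj(m, -padlen);	drop trailing padding
+ */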
+
+/*
+ * Rearrange an mbuf chain so that len bytes are contiguous
+ * and in the data area of an mbuf (so that mtod and dtom
+ * will work for a structure of size len). Returns the resulting
+ * mbuf chain on success, frees it and returns null on failure.
+ * If there is room, it will add up to max_protohdr-len extra bytes to the
+ * contiguous region in an attempt to avoid being called next time.
+ */
+#define MPFail (mbstat.m_mpfail)
+
+struct mbuf *
+m_pullup(n, len)
+ register struct mbuf *n;
+ int len;
+{
+ register struct mbuf *m;
+ register int count;
+ int space;
+
+ /*
+ * If first mbuf has no cluster, and has room for len bytes
+ * without shifting current data, pullup into it,
+ * otherwise allocate a new mbuf to prepend to the chain.
+ */
+ if ((n->m_flags & M_EXT) == 0 &&
+ n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
+ if (n->m_len >= len)
+ return (n);
+ m = n;
+ n = n->m_next;
+ len -= m->m_len;
+ } else {
+ if (len > MHLEN)
+ goto bad;
+ MGET(m, M_DONTWAIT, n->m_type);
+ if (m == 0)
+ goto bad;
+ m->m_len = 0;
+ if (n->m_flags & M_PKTHDR) {
+ M_COPY_PKTHDR(m, n);
+ n->m_flags &= ~M_PKTHDR;
+ }
+ }
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (unsigned)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+bad:
+ m_freem(n);
+ MPFail++;
+ return (0);
+}
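+
+/*
+ * Usage sketch: the canonical input-path idiom; on failure the chain
+ * has already been freed, so the caller just drops the packet (struct
+ * hdr is illustrative):
+ *
+ *	if ((m = m_pullup(m, sizeof(struct hdr))) == 0)
+ *		return;
+ *	hp = mtod(m, struct hdr *);
+ */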
+
+/*
+ * Partition an mbuf chain in two pieces, returning the tail --
+ * all but the first len0 bytes. In case of failure, it returns NULL and
+ * attempts to restore the chain to its original state.
+ */
+struct mbuf *
+m_split(m0, len0, wait)
+ register struct mbuf *m0;
+ int len0, wait;
+{
+ register struct mbuf *m, *n;
+ unsigned len = len0, remain;
+
+ for (m = m0; m && len > m->m_len; m = m->m_next)
+ len -= m->m_len;
+ if (m == 0)
+ return (0);
+ remain = m->m_len - len;
+ if (m0->m_flags & M_PKTHDR) {
+ MGETHDR(n, wait, m0->m_type);
+ if (n == 0)
+ return (0);
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ if (m->m_flags & M_EXT)
+ goto extpacket;
+ if (remain > MHLEN) {
+ /* m can't be the lead packet */
+ MH_ALIGN(n, 0);
+ n->m_next = m_split(m, len, wait);
+ if (n->m_next == 0) {
+ (void) m_free(n);
+ return (0);
+ } else
+ return (n);
+ } else
+ MH_ALIGN(n, remain);
+ } else if (remain == 0) {
+ n = m->m_next;
+ m->m_next = 0;
+ return (n);
+ } else {
+ MGET(n, wait, m->m_type);
+ if (n == 0)
+ return (0);
+ M_ALIGN(n, remain);
+ }
+extpacket:
+ if (m->m_flags & M_EXT) {
+ n->m_flags |= M_EXT;
+ n->m_ext = m->m_ext;
+ if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
+ m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
+ n->m_data = m->m_data + len;
+ } else {
+ bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
+ }
+ n->m_len = remain;
+ m->m_len = len;
+ n->m_next = m->m_next;
+ m->m_next = 0;
+ return (n);
+}
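+
+/*
+ * Usage sketch (hypothetical fragmentation path): the first len0 bytes
+ * stay on m0; the remainder comes back as a separate chain, or nil on
+ * failure with m0 restored:
+ *
+ *	n = m_split(m0, mtu, M_DONTWAIT);
+ *	if (n == 0)
+ *		... defer or drop; m0 is unchanged ...
+ */
+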
+/*
+ * Routine to copy from device local memory into mbufs.
+ */
+struct mbuf *
+m_devget(buf, totlen, off0, ifp, copy)
+ char *buf;
+ int totlen, off0;
+ struct ifnet *ifp;
+ void (*copy) __P((char *from, caddr_t to, u_int len));
+{
+ register struct mbuf *m;
+ struct mbuf *top = 0, **mp = &top;
+ register int off = off0, len;
+ register char *cp;
+ char *epkt;
+
+ cp = buf;
+ epkt = cp + totlen;
+ if (off) {
+ cp += off + 2 * sizeof(u_short);
+ totlen -= 2 * sizeof(u_short);
+ }
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == 0)
+ return (0);
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
+ m->m_len = MHLEN;
+
+ while (totlen > 0) {
+ if (top) {
+ MGET(m, M_DONTWAIT, MT_DATA);
+ if (m == 0) {
+ m_freem(top);
+ return (0);
+ }
+ m->m_len = MLEN;
+ }
+ len = min(totlen, epkt - cp);
+ if (len >= MINCLSIZE) {
+ MCLGET(m, M_DONTWAIT);
+ if (m->m_flags & M_EXT)
+ m->m_len = len = min(len, MCLBYTES);
+ else
+ len = m->m_len;
+ } else {
+ /*
+ * Place initial small packet/header at end of mbuf.
+ */
+ if (len < m->m_len) {
+ if (top == 0 && len + max_linkhdr <= m->m_len)
+ m->m_data += max_linkhdr;
+ m->m_len = len;
+ } else
+ len = m->m_len;
+ }
+ if (copy)
+ copy(cp, mtod(m, caddr_t), (unsigned)len);
+ else
+ bcopy(cp, mtod(m, caddr_t), (unsigned)len);
+ cp += len;
+ *mp = m;
+ mp = &m->m_next;
+ totlen -= len;
+ if (cp == epkt)
+ cp = buf;
+ }
+ return (top);
+}
+
+/*
+ * Copy data from a buffer back into the indicated mbuf chain,
+ * starting "off" bytes from the beginning, extending the mbuf
+ * chain if necessary.
+ */
+void
+m_copyback(m0, off, len, cp)
+ struct mbuf *m0;
+ register int off;
+ register int len;
+ caddr_t cp;
+{
+ register int mlen;
+ register struct mbuf *m = m0, *n;
+ int totlen = 0;
+
+ if (m0 == 0)
+ return;
+ while (off > (mlen = m->m_len)) {
+ off -= mlen;
+ totlen += mlen;
+ if (m->m_next == 0) {
+ n = m_getclr(M_DONTWAIT, m->m_type);
+ if (n == 0)
+ goto out;
+ n->m_len = min(MLEN, len + off);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+ while (len > 0) {
+ mlen = min (m->m_len - off, len);
+ bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
+ cp += mlen;
+ len -= mlen;
+ mlen += off;
+ off = 0;
+ totlen += mlen;
+ if (len == 0)
+ break;
+ if (m->m_next == 0) {
+ n = m_get(M_DONTWAIT, m->m_type);
+ if (n == 0)
+ break;
+ n->m_len = min(MLEN, len);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
+ m->m_pkthdr.len = totlen;
+}
diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c
new file mode 100644
index 0000000..094d1bf
--- /dev/null
+++ b/sys/kern/uipc_proto.c
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_proto.c,v 1.16 1998/06/21 14:53:18 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+
+#include <net/raw_cb.h>
+
+/*
+ * Definitions of protocols supported in the LOCAL domain.
+ */
+
+static struct protosw localsw[] = {
+{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+ 0, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &uipc_usrreqs
+},
+{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+ 0, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &uipc_usrreqs
+},
+{ 0, 0, 0, 0,
+ 0, 0, raw_ctlinput, 0,
+ 0,
+ raw_init, 0, 0, 0,
+ &raw_usrreqs
+}
+};
+
+struct domain localdomain =
+ { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose,
+ localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] };
+
+SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
+SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
new file mode 100644
index 0000000..e718c62
--- /dev/null
+++ b/sys/kern/uipc_sockbuf.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+
+/*
+ * Primitive routines for operating on sockets and socket buffers
+ */
+
+u_long sb_max = SB_MAX; /* XXX should be static */
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+/*
+ * Procedures to manipulate state flags of socket
+ * and do appropriate wakeups. Normal sequence from the
+ * active (originating) side is that soisconnecting() is
+ * called during processing of connect() call,
+ * resulting in an eventual call to soisconnected() if/when the
+ * connection is established. When the connection is torn down
+ * soisdisconnecting() is called during processing of disconnect() call,
+ * and soisdisconnected() is called when the connection to the peer
+ * is totally severed. The semantics of these routines are such that
+ * connectionless protocols can call soisconnected() and soisdisconnected()
+ * only, bypassing the in-progress calls when setting up a ``connection''
+ * takes no time.
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_q0 for connections in progress
+ * and so_q for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_q0 by calling sonewconn(). When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_q, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_q0 or so_q, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
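+
+/*
+ * Sequence sketch (illustrative): the calls above as seen from each
+ * side of a connection-oriented protocol:
+ *
+ *	active:   connect() -> soisconnecting() ... soisconnected()
+ *	passive:  sonewconn() on so_q0 ... soisconnected() moves to so_q
+ *	teardown: soisdisconnecting() ... soisdisconnected()
+ */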
+
+void
+soisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+}
+
+void
+soisconnected(so)
+ register struct socket *so;
+{
+ register struct socket *head = so->so_head;
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ if (head && (so->so_state & SS_INCOMP)) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+ }
+}
+
+void
+soisdisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+void
+soisdisconnected(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+/*
+ * Return a random connection that hasn't been serviced yet and
+ * is eligible for discard. There is a one in qlen chance that
+ * we will return a null, saying that there are no droppable
+ * requests. In this case, the protocol-specific code should drop
+ * the new request. This ensures fairness.
+ *
+ * This may be used in conjunction with protocol specific queue
+ * congestion routines.
+ */
+struct socket *
+sodropablereq(head)
+ register struct socket *head;
+{
+ register struct socket *so;
+ unsigned int i, j, qlen;
+ static int rnd;
+ static struct timeval old_runtime;
+ static unsigned int cur_cnt, old_cnt;
+ struct timeval tv;
+
+ getmicrouptime(&tv);
+ if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
+ old_runtime = tv;
+ old_cnt = cur_cnt / i;
+ cur_cnt = 0;
+ }
+
+ so = TAILQ_FIRST(&head->so_incomp);
+ if (!so)
+ return (so);
+
+ qlen = head->so_incqlen;
+ if (++cur_cnt > qlen || old_cnt > qlen) {
+ rnd = (314159 * rnd + 66329) & 0xffff;
+ j = ((qlen + 1) * rnd) >> 16;
+
+ while (j-- && so)
+ so = TAILQ_NEXT(so, so_list);
+ }
+
+ return (so);
+}
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called. If the
+ * connection is possible (subject to space constraints, etc.)
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
+ */
+struct socket *
+sonewconn(head, connstatus)
+ register struct socket *head;
+ int connstatus;
+{
+ register struct socket *so;
+
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
+ return ((struct socket *)0);
+ so = soalloc(0);
+ if (so == NULL)
+ return ((struct socket *)0);
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_timeo = head->so_timeo;
+ so->so_uid = head->so_uid;
+ (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
+
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ return ((struct socket *)0);
+ }
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ } else {
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ head->so_qlen++;
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup((caddr_t)&head->so_timeo);
+ so->so_state |= connstatus;
+ }
+ return (so);
+}
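+
+/*
+ * Usage sketch (hypothetical connection-oriented protocol input):
+ * clone the listening socket on a new connection request, completing
+ * the handshake before reporting the socket connected:
+ *
+ *	so = sonewconn(head, 0);
+ *	if (so == 0)
+ *		... drop the request (or see sodropablereq() above) ...
+ *	...
+ *	soisconnected(so);
+ */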
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it would normally be applied to a socket when the user
+ * informs the system that no more data is to be sent, by the protocol
+ * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+void
+socantsendmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTSENDMORE;
+ sowwakeup(so);
+}
+
+void
+socantrcvmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTRCVMORE;
+ sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(sb)
+ struct sockbuf *sb;
+{
+
+ sb->sb_flags |= SB_WAIT;
+ return (tsleep((caddr_t)&sb->sb_cc,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+/*
+ * Lock a sockbuf already known to be locked;
+ * return any error returned from sleep (EINTR).
+ */
+int
+sb_lock(sb)
+ register struct sockbuf *sb;
+{
+ int error;
+
+ while (sb->sb_flags & SB_LOCK) {
+ sb->sb_flags |= SB_WANT;
+ error = tsleep((caddr_t)&sb->sb_flags,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
+ "sblock", 0);
+ if (error)
+ return (error);
+ }
+ sb->sb_flags |= SB_LOCK;
+ return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+void
+sowakeup(so, sb)
+ register struct socket *so;
+ register struct sockbuf *sb;
+{
+ selwakeup(&sb->sb_sel);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup((caddr_t)&sb->sb_cc);
+ }
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(so->so_sigio, SIGIO, 0);
+ if (sb->sb_flags & SB_UPCALL)
+ (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data. Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field. Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ * name, then a record containing that name must be present before
+ * any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ * just additional data associated with the message), and there are
+ * ``rights'' to be received, then a record containing this data
+ * should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ * a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
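+
+/*
+ * Layout sketch (illustrative): one receive-buffer record for a
+ * protocol that supplies addresses and control data, per the
+ * conventions above:
+ *
+ *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> ...	(m_next)
+ *	   |
+ *	   +-- m_nextpkt --> next record
+ */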
+
+int
+soreserve(so, sndcc, rcvcc)
+ register struct socket *so;
+ u_long sndcc, rcvcc;
+{
+
+ if (sbreserve(&so->so_snd, sndcc) == 0)
+ goto bad;
+ if (sbreserve(&so->so_rcv, rcvcc) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ return (0);
+bad2:
+ sbrelease(&so->so_snd);
+bad:
+ return (ENOBUFS);
+}
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ */
+int
+sbreserve(sb, cc)
+ struct sockbuf *sb;
+ u_long cc;
+{
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ return (0);
+ sb->sb_hiwat = cc;
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease(sb)
+ struct sockbuf *sb;
+{
+
+ sbflush(sb);
+ sb->sb_hiwat = sb->sb_mbmax = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added. sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used. To place
+ * access rights and data in a socket receive buffer, sbappendrights()
+ * should be used. In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement. Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and then removing the data from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb. The additional space associated
+ * the mbuf chain is recorded in sb. Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(sb, m)
+ struct sockbuf *sb;
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == 0)
+ return;
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ }
+ sbcompress(sb, m, n);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+ register struct mbuf *n = 0;
+ register u_long len = 0, mbcnt = 0;
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+void
+sbappendrecord(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+void
+sbinsertoob(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+ register struct mbuf **mp;
+
+ if (m0 == 0)
+ return;
+ for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
+ again:
+ switch (m->m_type) {
+
+ case MT_OOBDATA:
+ continue; /* WANT next train */
+
+ case MT_CONTROL:
+ m = m->m_next;
+ if (m)
+ goto again; /* inspect THIS train further */
+ }
+ break;
+ }
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ m0->m_nextpkt = *mp;
+ *mp = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket. If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+int
+sbappendaddr(sb, asa, m0, control)
+ register struct sockbuf *sb;
+ struct sockaddr *asa;
+ struct mbuf *m0, *control;
+{
+ register struct mbuf *m, *n;
+ int space = asa->sa_len;
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ for (n = control; n; n = n->m_next) {
+ space += n->m_len;
+ if (n->m_next == 0) /* keep pointer to last control buf */
+ break;
+ }
+ if (space > sbspace(sb))
+ return (0);
+ if (asa->sa_len > MLEN)
+ return (0);
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n; n = n->m_next)
+ sballoc(sb, n);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = m;
+ } else
+ sb->sb_mb = m;
+ return (1);
+}
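+
+/*
+ * Usage sketch (hypothetical datagram input path; `from' and `opts'
+ * are illustrative): queue the data with the sender's address, waking
+ * a reader only if the append succeeded:
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from,
+ *	    m, opts) == 0) {
+ *		m_freem(m);	no space: count and drop
+ *	} else
+ *		sorwakeup(so);
+ */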
+
+int
+sbappendcontrol(sb, m0, control)
+ struct sockbuf *sb;
+ struct mbuf *control, *m0;
+{
+ register struct mbuf *m, *n;
+ int space = 0;
+
+ if (control == 0)
+ panic("sbappendcontrol");
+ for (m = control; ; m = m->m_next) {
+ space += m->m_len;
+ if (m->m_next == 0)
+ break;
+ }
+ n = m; /* save pointer to last control buffer */
+ for (m = m0; m; m = m->m_next)
+ space += m->m_len;
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+ for (m = control; m; m = m->m_next)
+ sballoc(sb, m);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = control;
+ } else
+ sb->sb_mb = control;
+ return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n. If n
+ * is null, the buffer is presumed empty.
+ */
+void
+sbcompress(sb, m, n)
+ register struct sockbuf *sb;
+ register struct mbuf *m, *n;
+{
+ register int eor = 0;
+ register struct mbuf *o;
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
+ (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ if (n)
+ n->m_flags |= eor;
+ else
+ printf("semi-panic: sbcompress\n");
+ }
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+void
+sbflush(sb)
+ register struct sockbuf *sb;
+{
+
+ if (sb->sb_flags & SB_LOCK)
+ panic("sbflush: locked");
+ while (sb->sb_mbcnt && sb->sb_cc)
+ sbdrop(sb, (int)sb->sb_cc);
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop(sb, len)
+ register struct sockbuf *sb;
+ register int len;
+{
+ register struct mbuf *m, *mn;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+void
+sbdroprecord(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m, *mn;
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ } while (m);
+ }
+}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ /* XXX check size? */
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ size += sizeof(*cp);
+ m->m_len = size;
+ cp->cmsg_len = size;
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
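+
+/*
+ * Usage sketch (hypothetical protocol; the type and level values are
+ * illustrative): build ancillary data to hand to sbappendaddr() or
+ * sbappendcontrol():
+ *
+ *	control = sbcreatecontrol((caddr_t)&info, sizeof(info),
+ *	    SOME_TYPE, SOME_LEVEL);
+ *	if (control == 0)
+ *		... deliver without ancillary data ...
+ */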
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+dup_sockaddr(sa, canwait)
+ struct sockaddr *sa;
+ int canwait;
+{
+ struct sockaddr *sa2;
+
+ MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
+ canwait ? M_WAITOK : M_NOWAIT);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information
+ * in the kernel-format socket structure pointed to by so. This is done
+ * to reduce the spew of irrelevant information over this interface,
+ * to isolate user code from changes in the kernel structure, and
+ * potentially to provide information-hiding if we decide that
+ * some of this information should be hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_uid;
+}
+
+/*
+ * This does the same for sockbufs. Note that the xsockbuf structure,
+ * since it is always embedded in a socket, does not include a self
+ * pointer nor a length. We make this entry point public in case
+ * some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");
+
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
new file mode 100644
index 0000000..1efa8c5
--- /dev/null
+++ b/sys/kern/uipc_socket.c
@@ -0,0 +1,1216 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
+ * $Id: uipc_socket.c,v 1.50 1999/01/20 17:31:54 fenner Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/poll.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <vm/vm_zone.h>
+
+#include <machine/limits.h>
+
+struct vm_zone *socket_zone;
+so_gen_t so_gencnt; /* generation count for sockets */
+
+MALLOC_DEFINE(M_SONAME, "soname", "socket name");
+MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
+
+static int somaxconn = SOMAXCONN;
+SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
+ 0, "");
+
+/*
+ * Socket operation routines.
+ * These routines are called by the routines in
+ * sys_socket.c or from a system process, and
+ * implement the semantics of socket operations by
+ * switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it.
+ * We don't implement `waitok' yet (see comments in uipc_domain.c).
+ * Note that it would probably be better to allocate socket
+ * and PCB at the same time, but I'm not convinced that all
+ * the protocols can be easily modified to do this.
+ */
+struct socket *
+soalloc(waitok)
+ int waitok;
+{
+ struct socket *so;
+
+ so = zalloci(socket_zone);
+ if (so) {
+ /* XXX race condition for reentrant kernel */
+ bzero(so, sizeof *so);
+ so->so_gencnt = ++so_gencnt;
+ so->so_zone = socket_zone;
+ }
+ return so;
+}
+
+int
+socreate(dom, aso, type, proto, p)
+ int dom;
+ struct socket **aso;
+ register int type;
+ int proto;
+ struct proc *p;
+{
+ register struct protosw *prp;
+ register struct socket *so;
+ register int error;
+
+ if (proto)
+ prp = pffindproto(dom, proto, type);
+ else
+ prp = pffindtype(dom, type);
+ if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
+ return (EPROTONOSUPPORT);
+ if (prp->pr_type != type)
+ return (EPROTOTYPE);
+ so = soalloc(p != 0);
+ if (so == 0)
+ return (ENOBUFS);
+
+ TAILQ_INIT(&so->so_incomp);
+ TAILQ_INIT(&so->so_comp);
+ so->so_type = type;
+ if (p != 0)
+ so->so_uid = p->p_ucred->cr_uid;
+ so->so_proto = prp;
+ error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
+ if (error) {
+ so->so_state |= SS_NOFDREF;
+ sofree(so);
+ return (error);
+ }
+ *aso = so;
+ return (0);
+}
+
+int
+sobind(so, nam, p)
+ struct socket *so;
+ struct sockaddr *nam;
+ struct proc *p;
+{
+ int s = splnet();
+ int error;
+
+ error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
+ splx(s);
+ return (error);
+}
+
+void
+sodealloc(so)
+ struct socket *so;
+{
+ so->so_gencnt = ++so_gencnt;
+ zfreei(so->so_zone, so);
+}
+
+int
+solisten(so, backlog, p)
+ register struct socket *so;
+ int backlog;
+ struct proc *p;
+{
+ int s, error;
+
+ s = splnet();
+ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ if (so->so_comp.tqh_first == NULL)
+ so->so_options |= SO_ACCEPTCONN;
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
+ splx(s);
+ return (0);
+}
+
+void
+sofree(so)
+ register struct socket *so;
+{
+ struct socket *head = so->so_head;
+
+ if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
+ return;
+ if (head != NULL) {
+ if (so->so_state & SS_INCOMP) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ } else if (so->so_state & SS_COMP) {
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ } else {
+ panic("sofree: not queued");
+ }
+ head->so_qlen--;
+ so->so_state &= ~(SS_INCOMP|SS_COMP);
+ so->so_head = NULL;
+ }
+ sbrelease(&so->so_snd);
+ sorflush(so);
+ sodealloc(so);
+}
+
+/*
+ * Close a socket on last file table reference removal.
+ * Initiate disconnect if connected.
+ * Free socket when disconnect complete.
+ */
+int
+soclose(so)
+ register struct socket *so;
+{
+ int s = splnet(); /* conservative */
+ int error = 0;
+
+ funsetown(so->so_sigio);
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct socket *sp, *sonext;
+
+ for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
+ sonext = sp->so_list.tqe_next;
+ (void) soabort(sp);
+ }
+ for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
+ sonext = sp->so_list.tqe_next;
+ (void) soabort(sp);
+ }
+ }
+ if (so->so_pcb == 0)
+ goto discard;
+ if (so->so_state & SS_ISCONNECTED) {
+ if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+ error = sodisconnect(so);
+ if (error)
+ goto drop;
+ }
+ if (so->so_options & SO_LINGER) {
+ if ((so->so_state & SS_ISDISCONNECTING) &&
+ (so->so_state & SS_NBIO))
+ goto drop;
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep((caddr_t)&so->so_timeo,
+ PSOCK | PCATCH, "soclos", so->so_linger);
+ if (error)
+ break;
+ }
+ }
+ }
+drop:
+ if (so->so_pcb) {
+ int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
+ if (error == 0)
+ error = error2;
+ }
+discard:
+ if (so->so_state & SS_NOFDREF)
+ panic("soclose: NOFDREF");
+ so->so_state |= SS_NOFDREF;
+ sofree(so);
+ splx(s);
+ return (error);
+}
+
+/*
+ * Must be called at splnet...
+ */
+int
+soabort(so)
+ struct socket *so;
+{
+
+ return (*so->so_proto->pr_usrreqs->pru_abort)(so);
+}
+
+int
+soaccept(so, nam)
+ register struct socket *so;
+ struct sockaddr **nam;
+{
+ int s = splnet();
+ int error;
+
+ if ((so->so_state & SS_NOFDREF) == 0)
+ panic("soaccept: !NOFDREF");
+ so->so_state &= ~SS_NOFDREF;
+ error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
+ splx(s);
+ return (error);
+}
+
+int
+soconnect(so, nam, p)
+ register struct socket *so;
+ struct sockaddr *nam;
+ struct proc *p;
+{
+ int s;
+ int error;
+
+ if (so->so_options & SO_ACCEPTCONN)
+ return (EOPNOTSUPP);
+ s = splnet();
+ /*
+ * If protocol is connection-based, can only connect once.
+ * Otherwise, if connected, try to disconnect first.
+ * This allows user to disconnect by connecting to, e.g.,
+ * a null address.
+ */
+ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
+ ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
+ (error = sodisconnect(so))))
+ error = EISCONN;
+ else
+ error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
+ splx(s);
+ return (error);
+}
+
+int
+soconnect2(so1, so2)
+ register struct socket *so1;
+ struct socket *so2;
+{
+ int s = splnet();
+ int error;
+
+ error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
+ splx(s);
+ return (error);
+}
+
+int
+sodisconnect(so)
+ register struct socket *so;
+{
+ int s = splnet();
+ int error;
+
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto bad;
+ }
+ if (so->so_state & SS_ISDISCONNECTING) {
+ error = EALREADY;
+ goto bad;
+ }
+ error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
+bad:
+ splx(s);
+ return (error);
+}
+
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+/*
+ * Send on a socket.
+ * If send must go all at once and message is larger than
+ * send buffering, then hard error.
+ * Lock against other senders.
+ * If must go all at once and not enough room now, then
+ * inform user that this would block and do nothing.
+ * Otherwise, if nonblocking, send as much as possible.
+ * The data to be sent is described by "uio" if nonzero,
+ * otherwise by the mbuf chain "top" (which must be null
+ * if uio is not). Data provided in mbuf chain must be small
+ * enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers
+ * must check for short counts if EINTR/ERESTART are returned.
+ * Data and control buffers are freed on return.
+ */
+int
+sosend(so, addr, uio, top, control, flags, p)
+ register struct socket *so;
+ struct sockaddr *addr;
+ struct uio *uio;
+ struct mbuf *top;
+ struct mbuf *control;
+ int flags;
+ struct proc *p;
+{
+ struct mbuf **mp;
+ register struct mbuf *m;
+ register long space, len, resid;
+ int clen = 0, error, s, dontroute, mlen;
+ int atomic = sosendallatonce(so) || top;
+
+ if (uio)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned.
+ * However, space must be signed, as it might be less than 0
+ * if we over-committed, and we must use a signed comparison
+ * of space and resid. On the other hand, a negative resid
+ * causes us to loop sending 0-length segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
+ (so->so_proto->pr_flags & PR_ATOMIC);
+ if (p)
+ p->p_stats->p_ru.ru_msgsnd++;
+ if (control)
+ clen = control->m_len;
+#define snderr(errno) { error = errno; splx(s); goto release; }
+
+restart:
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+ do {
+ s = splnet();
+ if (so->so_state & SS_CANTSENDMORE)
+ snderr(EPIPE);
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ splx(s);
+ goto release;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-
+ * based socket if it supports implied connect.
+ * Return ENOTCONN if not connected and no address is
+ * supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0))
+ snderr(ENOTCONN);
+ } else if (addr == 0)
+ snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
+ ENOTCONN : EDESTADDRREQ);
+ }
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ if ((atomic && resid > so->so_snd.sb_hiwat) ||
+ clen > so->so_snd.sb_hiwat)
+ snderr(EMSGSIZE);
+ if (space < resid + clen && uio &&
+ (atomic || space < so->so_snd.sb_lowat || space < clen)) {
+ if (so->so_state & SS_NBIO)
+ snderr(EWOULDBLOCK);
+ sbunlock(&so->so_snd);
+ error = sbwait(&so->so_snd);
+ splx(s);
+ if (error)
+ goto out;
+ goto restart;
+ }
+ splx(s);
+ mp = &top;
+ space -= clen;
+ do {
+ if (uio == NULL) {
+ /*
+ * Data is prepackaged in "top".
+ */
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else do {
+ if (top == 0) {
+ MGETHDR(m, M_WAIT, MT_DATA);
+ mlen = MHLEN;
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else {
+ MGET(m, M_WAIT, MT_DATA);
+ mlen = MLEN;
+ }
+ if (resid >= MINCLSIZE) {
+ MCLGET(m, M_WAIT);
+ if ((m->m_flags & M_EXT) == 0)
+ goto nopages;
+ mlen = MCLBYTES;
+ len = min(min(mlen, resid), space);
+ } else {
+nopages:
+ len = min(min(mlen, resid), space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && top == 0 && len < mlen)
+ MH_ALIGN(m, len);
+ }
+ space -= len;
+ error = uiomove(mtod(m, caddr_t), (int)len, uio);
+ resid = uio->uio_resid;
+ m->m_len = len;
+ *mp = m;
+ top->m_pkthdr.len += len;
+ if (error)
+ goto release;
+ mp = &m->m_next;
+ if (resid <= 0) {
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ break;
+ }
+ } while (space > 0 && atomic);
+ if (dontroute)
+ so->so_options |= SO_DONTROUTE;
+ s = splnet(); /* XXX */
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol
+ * understands this flag and nothing left to
+ * send then use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, p);
+ splx(s);
+ if (dontroute)
+ so->so_options &= ~SO_DONTROUTE;
+ clen = 0;
+ control = 0;
+ top = 0;
+ mp = &top;
+ if (error)
+ goto release;
+ } while (resid && space > 0);
+ } while (resid);
+
+release:
+ sbunlock(&so->so_snd);
+out:
+ if (top)
+ m_freem(top);
+ if (control)
+ m_freem(control);
+ return (error);
+}
+
+/*
+ * Implement receive operations on a socket.
+ * We depend on the way that records are added to the sockbuf
+ * by sbappend*. In particular, each record (mbufs linked through m_next)
+ * must begin with an address if the protocol so specifies,
+ * followed by an optional mbuf or mbufs containing ancillary data,
+ * and then zero or more mbufs of data.
+ * In order to avoid blocking network interrupts for the entire time here,
+ * we splx() while doing the actual copy to user space.
+ * Although the sockbuf is locked, new data may still be appended,
+ * and thus we must maintain consistency of the sockbuf during that time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying
+ * an mbuf **mp0 for use in returning the chain. The uio is then used
+ * only for the count in uio_resid.
+ */
+int
+soreceive(so, psa, uio, mp0, controlp, flagsp)
+ register struct socket *so;
+ struct sockaddr **psa;
+ struct uio *uio;
+ struct mbuf **mp0;
+ struct mbuf **controlp;
+ int *flagsp;
+{
+ register struct mbuf *m, **mp;
+ register int flags, len, error, s, offset;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+ int moff, type = 0;
+ int orig_resid = uio->uio_resid;
+
+ mp = mp0;
+ if (psa)
+ *psa = 0;
+ if (controlp)
+ *controlp = 0;
+ if (flagsp)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB) {
+ m = m_get(M_WAIT, MT_DATA);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+ if (error)
+ goto bad;
+ do {
+ error = uiomove(mtod(m, caddr_t),
+ (int) min(uio->uio_resid, m->m_len), uio);
+ m = m_free(m);
+ } while (uio->uio_resid && error == 0 && m);
+bad:
+ if (m)
+ m_freem(m);
+ return (error);
+ }
+ if (mp)
+ *mp = (struct mbuf *)0;
+ if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
+ (*pr->pr_usrreqs->pru_rcvd)(so, 0);
+
+restart:
+ error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+ s = splnet();
+
+ m = so->so_rcv.sb_mb;
+ /*
+ * If we have less data than requested, block awaiting more
+ * (subject to any timeout) if:
+ * 1. the current count is less than the low water mark, or
+ * 2. MSG_WAITALL is set, and it is possible to do the entire
+ * receive operation at once if we block (resid <= hiwat).
+ * 3. MSG_DONTWAIT is not set
+ * If MSG_WAITALL is set but resid is larger than the receive buffer,
+ * we have to do the receive in sections, and thus risk returning
+ * a short count if a timeout or signal occurs after we start.
+ */
+ if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
+ so->so_rcv.sb_cc < uio->uio_resid) &&
+ (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
+ ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
+ m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
+ KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
+ if (so->so_error) {
+ if (m)
+ goto dontblock;
+ error = so->so_error;
+ if ((flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ goto release;
+ }
+ if (so->so_state & SS_CANTRCVMORE) {
+ if (m)
+ goto dontblock;
+ else
+ goto release;
+ }
+ for (; m; m = m->m_next)
+ if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
+ m = so->so_rcv.sb_mb;
+ goto dontblock;
+ }
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ error = ENOTCONN;
+ goto release;
+ }
+ if (uio->uio_resid == 0)
+ goto release;
+ if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ sbunlock(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ splx(s);
+ if (error)
+ return (error);
+ goto restart;
+ }
+dontblock:
+ if (uio->uio_procp)
+ uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
+ nextrecord = m->m_nextpkt;
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
+ orig_resid = 0;
+ if (psa)
+ *psa = dup_sockaddr(mtod(m, struct sockaddr *),
+ mp0 == 0);
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ MFREE(m, so->so_rcv.sb_mb);
+ m = so->so_rcv.sb_mb;
+ }
+ }
+ while (m && m->m_type == MT_CONTROL && error == 0) {
+ if (flags & MSG_PEEK) {
+ if (controlp)
+ *controlp = m_copy(m, 0, m->m_len);
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ if (controlp) {
+ if (pr->pr_domain->dom_externalize &&
+ mtod(m, struct cmsghdr *)->cmsg_type ==
+ SCM_RIGHTS)
+ error = (*pr->pr_domain->dom_externalize)(m);
+ *controlp = m;
+ so->so_rcv.sb_mb = m->m_next;
+ m->m_next = 0;
+ m = so->so_rcv.sb_mb;
+ } else {
+ MFREE(m, so->so_rcv.sb_mb);
+ m = so->so_rcv.sb_mb;
+ }
+ }
+ if (controlp) {
+ orig_resid = 0;
+ controlp = &(*controlp)->m_next;
+ }
+ }
+ if (m) {
+ if ((flags & MSG_PEEK) == 0)
+ m->m_nextpkt = nextrecord;
+ type = m->m_type;
+ if (type == MT_OOBDATA)
+ flags |= MSG_OOB;
+ }
+ moff = 0;
+ offset = 0;
+ while (m && uio->uio_resid > 0 && error == 0) {
+ if (m->m_type == MT_OOBDATA) {
+ if (type != MT_OOBDATA)
+ break;
+ } else if (type == MT_OOBDATA)
+ break;
+ else
+ KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
+ ("receive 3"));
+ so->so_state &= ~SS_RCVATMARK;
+ len = uio->uio_resid;
+ if (so->so_oobmark && len > so->so_oobmark - offset)
+ len = so->so_oobmark - offset;
+ if (len > m->m_len - moff)
+ len = m->m_len - moff;
+ /*
+ * If mp is set, just pass back the mbufs.
+ * Otherwise copy them out via the uio, then free.
+	 * The sockbuf must be consistent here (sb_mb points to the current
+	 * mbuf and its m_nextpkt to the next record) when we drop priority;
+ * we must note any additions to the sockbuf when we
+ * block interrupts again.
+ */
+ if (mp == 0) {
+ splx(s);
+ error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
+ s = splnet();
+ if (error)
+ goto release;
+ } else
+ uio->uio_resid -= len;
+ if (len == m->m_len - moff) {
+ if (m->m_flags & M_EOR)
+ flags |= MSG_EOR;
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ moff = 0;
+ } else {
+ nextrecord = m->m_nextpkt;
+ sbfree(&so->so_rcv, m);
+ if (mp) {
+ *mp = m;
+ mp = &m->m_next;
+ so->so_rcv.sb_mb = m = m->m_next;
+ *mp = (struct mbuf *)0;
+ } else {
+ MFREE(m, so->so_rcv.sb_mb);
+ m = so->so_rcv.sb_mb;
+ }
+ if (m)
+ m->m_nextpkt = nextrecord;
+ }
+ } else {
+ if (flags & MSG_PEEK)
+ moff += len;
+ else {
+ if (mp)
+ *mp = m_copym(m, 0, len, M_WAIT);
+ m->m_data += len;
+ m->m_len -= len;
+ so->so_rcv.sb_cc -= len;
+ }
+ }
+ if (so->so_oobmark) {
+ if ((flags & MSG_PEEK) == 0) {
+ so->so_oobmark -= len;
+ if (so->so_oobmark == 0) {
+ so->so_state |= SS_RCVATMARK;
+ break;
+ }
+ } else {
+ offset += len;
+ if (offset == so->so_oobmark)
+ break;
+ }
+ }
+ if (flags & MSG_EOR)
+ break;
+ /*
+	 * If the MSG_WAITALL flag is set (for a non-atomic socket),
+	 * we must not quit until "uio->uio_resid == 0" or an error
+	 * terminates the receive. If a signal/timeout occurs, return
+	 * with a short count but without error.
+ * Keep sockbuf locked against other readers.
+ */
+ while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
+ !sosendallatonce(so) && !nextrecord) {
+ if (so->so_error || so->so_state & SS_CANTRCVMORE)
+ break;
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ sbunlock(&so->so_rcv);
+ splx(s);
+ return (0);
+ }
+ m = so->so_rcv.sb_mb;
+ if (m)
+ nextrecord = m->m_nextpkt;
+ }
+ }
+
+ if (m && pr->pr_flags & PR_ATOMIC) {
+ flags |= MSG_TRUNC;
+ if ((flags & MSG_PEEK) == 0)
+ (void) sbdroprecord(&so->so_rcv);
+ }
+ if ((flags & MSG_PEEK) == 0) {
+ if (m == 0)
+ so->so_rcv.sb_mb = nextrecord;
+ if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ }
+ if (orig_resid == uio->uio_resid && orig_resid &&
+ (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
+ sbunlock(&so->so_rcv);
+ splx(s);
+ goto restart;
+ }
+
+ if (flagsp)
+ *flagsp |= flags;
+release:
+ sbunlock(&so->so_rcv);
+ splx(s);
+ return (error);
+}
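
Seen from user space, the blocking rules above are what SO_RCVLOWAT and
MSG_WAITALL control. A minimal sketch, assuming fd is an already-connected
stream socket (illustrative, not part of this patch):

	#include <sys/types.h>
	#include <sys/socket.h>

	/*
	 * Block until 'want' bytes arrive: SO_RCVLOWAT raises the threshold
	 * soreceive() compares sb_cc against, and MSG_WAITALL keeps it
	 * looping until the full request is satisfied (or EOF/error).
	 */
	static ssize_t
	recv_exactly(int fd, void *buf, size_t want)
	{
		int lowat = (int)want;

		(void)setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat,
		    sizeof lowat);
		return (recv(fd, buf, want, MSG_WAITALL));
	}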
+
+int
+soshutdown(so, how)
+ register struct socket *so;
+ register int how;
+{
+ register struct protosw *pr = so->so_proto;
+
+ how++;
+ if (how & FREAD)
+ sorflush(so);
+ if (how & FWRITE)
+ return ((*pr->pr_usrreqs->pru_shutdown)(so));
+ return (0);
+}
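
The how++ above works because the shutdown(2) argument is 0, 1 or 2
(read, write, both), so adding one turns it into a bitmask over FREAD (1)
and FWRITE (2). A user-space sketch (the SHUT_* spellings are the modern
names for 0/1/2):

	#include <sys/socket.h>

	/*
	 * how 0 -> how+1 == 1 == FREAD : sorflush() the receive side
	 * how 1 -> how+1 == 2 == FWRITE: invoke the protocol's pru_shutdown
	 * how 2 -> how+1 == 3 == both of the above
	 */
	static int
	half_close(int fd)
	{
		return (shutdown(fd, 1));	/* 1 (SHUT_WR): done sending */
	}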
+
+void
+sorflush(so)
+ register struct socket *so;
+{
+ register struct sockbuf *sb = &so->so_rcv;
+ register struct protosw *pr = so->so_proto;
+ register int s;
+ struct sockbuf asb;
+
+ sb->sb_flags |= SB_NOINTR;
+ (void) sblock(sb, M_WAITOK);
+ s = splimp();
+ socantrcvmore(so);
+ sbunlock(sb);
+ asb = *sb;
+ bzero((caddr_t)sb, sizeof (*sb));
+ splx(s);
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
+ (*pr->pr_domain->dom_dispose)(asb.sb_mb);
+ sbrelease(&asb);
+}
+
+/*
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in
+ * an additional variant to handle the case where the option value needs
+ * to be some kind of integer, but not a specific size.
+ * In addition to their use here, these functions are also called by the
+ * protocol-level pr_ctloutput() routines.
+ */
+int
+sooptcopyin(sopt, buf, len, minlen)
+ struct sockopt *sopt;
+ void *buf;
+ size_t len;
+ size_t minlen;
+{
+ size_t valsize;
+
+ /*
+ * If the user gives us more than we wanted, we ignore it,
+ * but if we don't get the minimum length the caller
+ * wants, we return EINVAL. On success, sopt->sopt_valsize
+ * is set to however much we actually retrieved.
+ */
+ if ((valsize = sopt->sopt_valsize) < minlen)
+ return EINVAL;
+ if (valsize > len)
+ sopt->sopt_valsize = valsize = len;
+
+ if (sopt->sopt_p != 0)
+ return (copyin(sopt->sopt_val, buf, valsize));
+
+ bcopy(sopt->sopt_val, buf, valsize);
+ return 0;
+}
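
A sketch of how a protocol-level pr_ctloutput() routine might consume an
integer option through sooptcopyin(); MYPROTO_FOO and the myproto_* names
are invented for illustration and do not appear in this patch:

	static int myproto_foo;		/* hypothetical protocol knob */

	static int
	myproto_ctloutput(struct socket *so, struct sockopt *sopt)
	{
		int error, optval;

		if (sopt->sopt_dir != SOPT_SET ||
		    sopt->sopt_name != MYPROTO_FOO)
			return (ENOPROTOOPT);
		/* Need at least an int; extra user bytes are ignored. */
		error = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);
		if (error)
			return (error);
		myproto_foo = optval;
		return (0);
	}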
+
+int
+sosetopt(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+ short val;
+
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto && so->so_proto->pr_ctloutput)
+ return ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ error = ENOPROTOOPT;
+ } else {
+ switch (sopt->sopt_name) {
+ case SO_LINGER:
+ error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
+ if (error)
+ goto bad;
+
+ so->so_linger = l.l_linger;
+ if (l.l_onoff)
+ so->so_options |= SO_LINGER;
+ else
+ so->so_options &= ~SO_LINGER;
+ break;
+
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_DONTROUTE:
+ case SO_USELOOPBACK:
+ case SO_BROADCAST:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+ if (optval)
+ so->so_options |= sopt->sopt_name;
+ else
+ so->so_options &= ~sopt->sopt_name;
+ break;
+
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ /*
+ * Values < 1 make no sense for any of these
+ * options, so disallow them.
+ */
+ if (optval < 1) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ switch (sopt->sopt_name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
+ &so->so_snd : &so->so_rcv,
+ (u_long) optval) == 0) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ break;
+
+ /*
+ * Make sure the low-water is never greater than
+ * the high-water.
+ */
+ case SO_SNDLOWAT:
+ so->so_snd.sb_lowat =
+ (optval > so->so_snd.sb_hiwat) ?
+ so->so_snd.sb_hiwat : optval;
+ break;
+ case SO_RCVLOWAT:
+ so->so_rcv.sb_lowat =
+ (optval > so->so_rcv.sb_hiwat) ?
+ so->so_rcv.sb_hiwat : optval;
+ break;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ error = sooptcopyin(sopt, &tv, sizeof tv,
+ sizeof tv);
+ if (error)
+ goto bad;
+
+ if (tv.tv_sec > SHRT_MAX / hz - hz) {
+ error = EDOM;
+ goto bad;
+ }
+ val = tv.tv_sec * hz + tv.tv_usec / tick;
+
+ switch (sopt->sopt_name) {
+ case SO_SNDTIMEO:
+ so->so_snd.sb_timeo = val;
+ break;
+ case SO_RCVTIMEO:
+ so->so_rcv.sb_timeo = val;
+ break;
+ }
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
+ (void) ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ }
+ }
+bad:
+ return (error);
+}
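
Exercising the SOL_SOCKET cases above from user space; a minimal sketch,
assuming fd is an open socket:

	#include <sys/socket.h>
	#include <string.h>

	static void
	tune_socket(int fd)
	{
		struct linger l;
		int bufsz = 64 * 1024;	/* < 1 would earn EINVAL above */

		memset(&l, 0, sizeof l);
		l.l_onoff = 1;		/* linger on close... */
		l.l_linger = 5;		/* ...for up to 5 seconds */
		(void)setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof l);
		(void)setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsz,
		    sizeof bufsz);
	}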
+
+/* Helper routine for getsockopt */
+int
+sooptcopyout(sopt, buf, len)
+ struct sockopt *sopt;
+ void *buf;
+ size_t len;
+{
+ int error;
+ size_t valsize;
+
+ error = 0;
+
+ /*
+ * Documented get behavior is that we always return a value,
+ * possibly truncated to fit in the user's buffer.
+ * Traditional behavior is that we always tell the user
+ * precisely how much we copied, rather than something useful
+ * like the total amount we had available for her.
+ * Note that this interface is not idempotent; the entire answer must
+	 * be generated ahead of time.
+ */
+ valsize = min(len, sopt->sopt_valsize);
+ sopt->sopt_valsize = valsize;
+ if (sopt->sopt_val != 0) {
+ if (sopt->sopt_p != 0)
+ error = copyout(buf, sopt->sopt_val, valsize);
+ else
+ bcopy(buf, sopt->sopt_val, valsize);
+ }
+ return error;
+}
+
+int
+sogetopt(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto && so->so_proto->pr_ctloutput) {
+ return ((*so->so_proto->pr_ctloutput)
+ (so, sopt));
+ } else
+ return (ENOPROTOOPT);
+ } else {
+ switch (sopt->sopt_name) {
+ case SO_LINGER:
+ l.l_onoff = so->so_options & SO_LINGER;
+ l.l_linger = so->so_linger;
+ error = sooptcopyout(sopt, &l, sizeof l);
+ break;
+
+ case SO_USELOOPBACK:
+ case SO_DONTROUTE:
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_BROADCAST:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ optval = so->so_options & sopt->sopt_name;
+integer:
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case SO_TYPE:
+ optval = so->so_type;
+ goto integer;
+
+ case SO_ERROR:
+ optval = so->so_error;
+ so->so_error = 0;
+ goto integer;
+
+ case SO_SNDBUF:
+ optval = so->so_snd.sb_hiwat;
+ goto integer;
+
+ case SO_RCVBUF:
+ optval = so->so_rcv.sb_hiwat;
+ goto integer;
+
+ case SO_SNDLOWAT:
+ optval = so->so_snd.sb_lowat;
+ goto integer;
+
+ case SO_RCVLOWAT:
+ optval = so->so_rcv.sb_lowat;
+ goto integer;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ optval = (sopt->sopt_name == SO_SNDTIMEO ?
+ so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
+
+ tv.tv_sec = optval / hz;
+ tv.tv_usec = (optval % hz) * tick;
+ error = sooptcopyout(sopt, &tv, sizeof tv);
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ return (error);
+ }
+}
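
Note that SO_ERROR above reads and clears so_error; that is what makes the
classic non-blocking connect() idiom work. A sketch, assuming poll() or
select() has just reported the socket writable (socklen_t is the modern
type for the length argument):

	#include <sys/socket.h>

	static int
	connect_result(int fd)
	{
		int err = 0;
		socklen_t len = sizeof err;

		if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
			return (-1);
		return (err);	/* 0 on success, the deferred errno otherwise */
	}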
+
+void
+sohasoutofband(so)
+ register struct socket *so;
+{
+ if (so->so_sigio != NULL)
+ pgsigio(so->so_sigio, SIGURG, 0);
+ selwakeup(&so->so_rcv.sb_sel);
+}
+
+int
+sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
+{
+ int revents = 0;
+ int s = splnet();
+
+ if (events & (POLLIN | POLLRDNORM))
+ if (soreadable(so))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (sowriteable(so))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & (POLLPRI | POLLRDBAND))
+ if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
+ revents |= events & (POLLPRI | POLLRDBAND);
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
+ selrecord(p, &so->so_rcv.sb_sel);
+ so->so_rcv.sb_flags |= SB_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(p, &so->so_snd.sb_sel);
+ so->so_snd.sb_flags |= SB_SEL;
+ }
+ }
+
+ splx(s);
+ return (revents);
+}
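
The events sopoll() reports map directly onto a user-level poll() call; a
minimal sketch:

	#include <poll.h>

	static int
	wait_readable(int fd, int timeout_ms)
	{
		struct pollfd pfd;

		pfd.fd = fd;
		pfd.events = POLLIN | POLLPRI;	/* data, or the OOB mark */
		pfd.revents = 0;
		if (poll(&pfd, 1, timeout_ms) <= 0)
			return (0);		/* timed out or failed */
		return (pfd.revents & (POLLIN | POLLPRI));
	}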
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
new file mode 100644
index 0000000..e718c62
--- /dev/null
+++ b/sys/kern/uipc_socket2.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+
+/*
+ * Primitive routines for operating on sockets and socket buffers
+ */
+
+u_long sb_max = SB_MAX; /* XXX should be static */
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+/*
+ * Procedures to manipulate state flags of socket
+ * and do appropriate wakeups. Normal sequence from the
+ * active (originating) side is that soisconnecting() is
+ * called during processing of connect() call,
+ * resulting in an eventual call to soisconnected() if/when the
+ * connection is established. When the connection is torn down
+ * soisdisconnecting() is called during processing of disconnect() call,
+ * and soisdisconnected() is called when the connection to the peer
+ * is totally severed. The semantics of these routines are such that
+ * connectionless protocols can call soisconnected() and soisdisconnected()
+ * only, bypassing the in-progress calls when setting up a ``connection''
+ * takes no time.
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_q0 for connections in progress
+ * and so_q for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_q0 by calling sonewconn(). When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_q, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_q0 or so_q, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
+
+void
+soisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+}
+
+void
+soisconnected(so)
+ register struct socket *so;
+{
+ register struct socket *head = so->so_head;
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ if (head && (so->so_state & SS_INCOMP)) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+ }
+}
+
+void
+soisdisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+void
+soisdisconnected(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+/*
+ * Return a random connection that hasn't been serviced yet and
+ * is eligible for discard. There is a one in qlen chance that
+ * we will return a null, saying that there are no droppable
+ * requests. In this case, the protocol-specific code should drop
+ * the new request. This ensures fairness.
+ *
+ * This may be used in conjunction with protocol specific queue
+ * congestion routines.
+ */
+struct socket *
+sodropablereq(head)
+ register struct socket *head;
+{
+ register struct socket *so;
+ unsigned int i, j, qlen;
+ static int rnd;
+ static struct timeval old_runtime;
+ static unsigned int cur_cnt, old_cnt;
+ struct timeval tv;
+
+ getmicrouptime(&tv);
+ if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
+ old_runtime = tv;
+ old_cnt = cur_cnt / i;
+ cur_cnt = 0;
+ }
+
+ so = TAILQ_FIRST(&head->so_incomp);
+ if (!so)
+ return (so);
+
+ qlen = head->so_incqlen;
+ if (++cur_cnt > qlen || old_cnt > qlen) {
+ rnd = (314159 * rnd + 66329) & 0xffff;
+ j = ((qlen + 1) * rnd) >> 16;
+
+ while (j-- && so)
+ so = TAILQ_NEXT(so, so_list);
+ }
+
+ return (so);
+}
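
A stand-alone sketch of the selection arithmetic above: a 16-bit linear
congruential generator is scaled into [0, qlen], and the index qlen walks
off the end of the list, producing the roughly one-in-(qlen+1) NULL
("drop the newcomer instead") case. Same constants as sodropablereq();
unsigned state is used here to keep the overflow well defined:

	static unsigned int
	pick_victim(unsigned int qlen)
	{
		static unsigned int rnd;

		rnd = (314159 * rnd + 66329) & 0xffff;	/* 16-bit LCG */
		return (((qlen + 1) * rnd) >> 16);	/* in 0..qlen */
	}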
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called. If the
+ * connection is possible (subject to space constraints, etc.)
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
+ */
+struct socket *
+sonewconn(head, connstatus)
+ register struct socket *head;
+ int connstatus;
+{
+ register struct socket *so;
+
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
+ return ((struct socket *)0);
+ so = soalloc(0);
+ if (so == NULL)
+ return ((struct socket *)0);
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_timeo = head->so_timeo;
+ so->so_uid = head->so_uid;
+ (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
+
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ return ((struct socket *)0);
+ }
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ } else {
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ head->so_qlen++;
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup((caddr_t)&head->so_timeo);
+ so->so_state |= connstatus;
+ }
+ return (so);
+}
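
One consequence of the queue check above: the pending-connection count may
grow to 3/2 of the backlog passed to listen(2), so a server that calls
listen(fd, 128) can briefly hold about 192 queued sockets before
sonewconn() starts refusing. A trivial sketch:

	#include <sys/socket.h>

	static int
	start_listening(int fd)
	{
		/* so_qlimit = 128; sonewconn() tolerates so_qlen up to 192. */
		return (listen(fd, 128));
	}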
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it is normally applied to the socket by the protocol code
+ * when the user informs the system that no more data is to be sent
+ * (the PRU_SHUTDOWN case). Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+void
+socantsendmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTSENDMORE;
+ sowwakeup(so);
+}
+
+void
+socantrcvmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTRCVMORE;
+ sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(sb)
+ struct sockbuf *sb;
+{
+
+ sb->sb_flags |= SB_WAIT;
+ return (tsleep((caddr_t)&sb->sb_cc,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+/*
+ * Lock a sockbuf already known to be locked;
+ * return any error returned from sleep (EINTR).
+ */
+int
+sb_lock(sb)
+ register struct sockbuf *sb;
+{
+ int error;
+
+ while (sb->sb_flags & SB_LOCK) {
+ sb->sb_flags |= SB_WANT;
+ error = tsleep((caddr_t)&sb->sb_flags,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
+ "sblock", 0);
+ if (error)
+ return (error);
+ }
+ sb->sb_flags |= SB_LOCK;
+ return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+void
+sowakeup(so, sb)
+ register struct socket *so;
+ register struct sockbuf *sb;
+{
+ selwakeup(&sb->sb_sel);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup((caddr_t)&sb->sb_cc);
+ }
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(so->so_sigio, SIGIO, 0);
+ if (sb->sb_flags & SB_UPCALL)
+ (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data. Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field. Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ * name, then a record containing that name must be present before
+ * any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ * just additional data associated with the message), and there are
+ * ``rights'' to be received, then a record containing this data
+ * should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ * a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
+
+int
+soreserve(so, sndcc, rcvcc)
+ register struct socket *so;
+ u_long sndcc, rcvcc;
+{
+
+ if (sbreserve(&so->so_snd, sndcc) == 0)
+ goto bad;
+ if (sbreserve(&so->so_rcv, rcvcc) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ return (0);
+bad2:
+ sbrelease(&so->so_snd);
+bad:
+ return (ENOBUFS);
+}
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ */
+int
+sbreserve(sb, cc)
+ struct sockbuf *sb;
+ u_long cc;
+{
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ return (0);
+ sb->sb_hiwat = cc;
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
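
A worked example of the two limits above, using SB_MAX = 256 KB and the
era-typical values MSIZE = 128 and MCLBYTES = 2048 (assumptions for
illustration, not taken from this patch):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long sb_max = 256UL * 1024;
		unsigned long msize = 128, mclbytes = 2048;
		unsigned long cc = 64UL * 1024, mbmax;

		/* Largest cc sbreserve() accepts: 246723 bytes here. */
		printf("max cc: %lu\n",
		    sb_max * mclbytes / (msize + mclbytes));
		/* A 64 KB reservation: mbmax = min(cc * 8, sb_max). */
		mbmax = cc * 8 < sb_max ? cc * 8 : sb_max;
		printf("mbmax: %lu\n", mbmax);	/* 262144 */
		return (0);
	}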
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease(sb)
+ struct sockbuf *sb;
+{
+
+ sbflush(sb);
+ sb->sb_hiwat = sb->sb_mbmax = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added. sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used. To place
+ * access rights and data in a socket receive buffer, sbappendrights()
+ * should be used. In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement. Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and then removing the data from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb. The additional space associated with
+ * the mbuf chain is recorded in sb. Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(sb, m)
+ struct sockbuf *sb;
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == 0)
+ return;
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ }
+ sbcompress(sb, m, n);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+ register struct mbuf *n = 0;
+ register u_long len = 0, mbcnt = 0;
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+void
+sbappendrecord(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+void
+sbinsertoob(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+ register struct mbuf **mp;
+
+ if (m0 == 0)
+ return;
+ for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
+ again:
+ switch (m->m_type) {
+
+ case MT_OOBDATA:
+ continue; /* WANT next train */
+
+ case MT_CONTROL:
+ m = m->m_next;
+ if (m)
+ goto again; /* inspect THIS train further */
+ }
+ break;
+ }
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ m0->m_nextpkt = *mp;
+ *mp = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket. If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+int
+sbappendaddr(sb, asa, m0, control)
+ register struct sockbuf *sb;
+ struct sockaddr *asa;
+ struct mbuf *m0, *control;
+{
+ register struct mbuf *m, *n;
+ int space = asa->sa_len;
+
+	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddr");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ for (n = control; n; n = n->m_next) {
+ space += n->m_len;
+ if (n->m_next == 0) /* keep pointer to last control buf */
+ break;
+ }
+ if (space > sbspace(sb))
+ return (0);
+ if (asa->sa_len > MLEN)
+ return (0);
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n; n = n->m_next)
+ sballoc(sb, n);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = m;
+ } else
+ sb->sb_mb = m;
+ return (1);
+}
+
+int
+sbappendcontrol(sb, m0, control)
+ struct sockbuf *sb;
+ struct mbuf *control, *m0;
+{
+ register struct mbuf *m, *n;
+ int space = 0;
+
+ if (control == 0)
+ panic("sbappendcontrol");
+ for (m = control; ; m = m->m_next) {
+ space += m->m_len;
+ if (m->m_next == 0)
+ break;
+ }
+ n = m; /* save pointer to last control buffer */
+ for (m = m0; m; m = m->m_next)
+ space += m->m_len;
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+ for (m = control; m; m = m->m_next)
+ sballoc(sb, m);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = control;
+ } else
+ sb->sb_mb = control;
+ return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n. If n
+ * is null, the buffer is presumed empty.
+ */
+void
+sbcompress(sb, m, n)
+ register struct sockbuf *sb;
+ register struct mbuf *m, *n;
+{
+ register int eor = 0;
+ register struct mbuf *o;
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
+ (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ if (n)
+ n->m_flags |= eor;
+ else
+ printf("semi-panic: sbcompress\n");
+ }
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+void
+sbflush(sb)
+ register struct sockbuf *sb;
+{
+
+ if (sb->sb_flags & SB_LOCK)
+ panic("sbflush: locked");
+ while (sb->sb_mbcnt && sb->sb_cc)
+ sbdrop(sb, (int)sb->sb_cc);
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop(sb, len)
+ register struct sockbuf *sb;
+ register int len;
+{
+ register struct mbuf *m, *mn;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+void
+sbdroprecord(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m, *mn;
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ } while (m);
+ }
+}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ /* XXX check size? */
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ size += sizeof(*cp);
+ m->m_len = size;
+ cp->cmsg_len = size;
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
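
sbcreatecontrol() is the kernel-side constructor; user code builds the
matching layout with the CMSG_* macros. A sketch that passes one file
descriptor over a Unix-domain socket (assumes the CMSG_SPACE/CMSG_LEN
macros, which are newer than the 4.3BSD interfaces handled elsewhere in
this patch):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <string.h>

	static int
	send_fd(int sock, int fd_to_pass)
	{
		struct msghdr msg;
		struct iovec iov;
		struct cmsghdr *cm;
		char cbuf[CMSG_SPACE(sizeof(int))];
		char data = 0;

		memset(&msg, 0, sizeof msg);
		iov.iov_base = &data;		/* rule 3: a data record */
		iov.iov_len = 1;
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof cbuf;
		cm = CMSG_FIRSTHDR(&msg);
		cm->cmsg_len = CMSG_LEN(sizeof(int));
		cm->cmsg_level = SOL_SOCKET;
		cm->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
		return (sendmsg(sock, &msg, 0) < 0 ? -1 : 0);
	}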
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, struct proc *p)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+dup_sockaddr(sa, canwait)
+ struct sockaddr *sa;
+ int canwait;
+{
+ struct sockaddr *sa2;
+
+ MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
+ canwait ? M_WAITOK : M_NOWAIT);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information
+ * in the kernel-format socket structure pointed to by so. This is done
+ * to reduce the spew of irrelevant information over this interface,
+ * to isolate user code from changes in the kernel structure, and
+ * potentially to provide information-hiding if we decide that
+ * some of this information should be hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_uid;
+}
+
+/*
+ * This does the same for sockbufs. Note that the xsockbuf structure,
+ * since it is always embedded in a socket, does not include a self
+ * pointer nor a length. We make this entry point public in case
+ * some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");
+
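
Reading one of the new kern.ipc knobs back from user space (a sketch;
note that sb_max is exported via SYSCTL_INT, so an int-sized read is
assumed here):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int val;
		size_t len = sizeof val;

		if (sysctlbyname("kern.ipc.maxsockbuf", &val, &len,
		    NULL, 0) == 0)
			printf("kern.ipc.maxsockbuf: %d\n", val);
		return (0);
	}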
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
new file mode 100644
index 0000000..bd5149f
--- /dev/null
+++ b/sys/kern/uipc_syscalls.c
@@ -0,0 +1,1701 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * sendfile(2) and related extensions:
+ * Copyright (c) 1998, David Greenman. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
+ * $Id: uipc_syscalls.c,v 1.50 1999/01/21 08:29:04 dillon Exp $
+ */
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <machine/limits.h>
+
+static void sf_buf_init(void *arg);
+SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
+static struct sf_buf *sf_buf_alloc(void);
+static void sf_buf_ref(caddr_t addr, u_int size);
+static void sf_buf_free(caddr_t addr, u_int size);
+
+static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
+static int recvit __P((struct proc *p, int s, struct msghdr *mp,
+ caddr_t namelenp));
+
+static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
+static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
+ int compat));
+static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
+ int compat));
+
+static SLIST_HEAD(, sf_buf) sf_freelist;
+static vm_offset_t sf_base;
+static struct sf_buf *sf_bufs;
+static int sf_buf_alloc_want;
+
+/*
+ * System call interface to the socket abstraction.
+ */
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#define COMPAT_OLDSOCK
+#endif
+
+extern struct fileops socketops;
+
+int
+socket(p, uap)
+ struct proc *p;
+ register struct socket_args /* {
+ int domain;
+ int type;
+ int protocol;
+ } */ *uap;
+{
+ struct filedesc *fdp = p->p_fd;
+ struct socket *so;
+ struct file *fp;
+ int fd, error;
+
+ error = falloc(p, &fp, &fd);
+ if (error)
+ return (error);
+ fp->f_flag = FREAD|FWRITE;
+ fp->f_type = DTYPE_SOCKET;
+ fp->f_ops = &socketops;
+ error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
+ if (error) {
+ fdp->fd_ofiles[fd] = 0;
+ ffree(fp);
+ } else {
+ fp->f_data = (caddr_t)so;
+ p->p_retval[0] = fd;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+bind(p, uap)
+ struct proc *p;
+ register struct bind_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct file *fp;
+ struct sockaddr *sa;
+ int error;
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error)
+ return (error);
+ error = sobind((struct socket *)fp->f_data, sa, p);
+ FREE(sa, M_SONAME);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+listen(p, uap)
+ struct proc *p;
+ register struct listen_args /* {
+ int s;
+ int backlog;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ return (solisten((struct socket *)fp->f_data, uap->backlog, p));
+}
+
+static int
+accept1(p, uap, compat)
+ struct proc *p;
+ register struct accept_args /* {
+ int s;
+ caddr_t name;
+ int *anamelen;
+ } */ *uap;
+ int compat;
+{
+ struct file *fp;
+ struct sockaddr *sa;
+ int namelen, error, s;
+ struct socket *head, *so;
+ int fd;
+ short fflag; /* type must match fp->f_flag */
+
+ if (uap->name) {
+ error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
+ sizeof (namelen));
+		if (error)
+			return (error);
+ }
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ s = splnet();
+ head = (struct socket *)fp->f_data;
+ if ((head->so_options & SO_ACCEPTCONN) == 0) {
+ splx(s);
+ return (EINVAL);
+ }
+ if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ while (head->so_comp.tqh_first == NULL && head->so_error == 0) {
+ if (head->so_state & SS_CANTRCVMORE) {
+ head->so_error = ECONNABORTED;
+ break;
+ }
+ error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
+ "accept", 0);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ splx(s);
+ return (error);
+ }
+
+ /*
+ * At this point we know that there is at least one connection
+ * ready to be accepted. Remove it from the queue prior to
+ * allocating the file descriptor for it since falloc() may
+ * block allowing another process to accept the connection
+ * instead.
+ */
+ so = head->so_comp.tqh_first;
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+
+ fflag = fp->f_flag;
+ error = falloc(p, &fp, &fd);
+ if (error) {
+ /*
+ * Probably ran out of file descriptors. Put the
+ * unaccepted connection back onto the queue and
+ * do another wakeup so some other process might
+ * have a chance at it.
+ */
+ TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ wakeup_one(&head->so_timeo);
+ splx(s);
+ return (error);
+ } else
+ p->p_retval[0] = fd;
+
+ so->so_state &= ~SS_COMP;
+ so->so_head = NULL;
+ if (head->so_sigio != NULL)
+ fsetown(fgetown(head->so_sigio), &so->so_sigio);
+
+ fp->f_type = DTYPE_SOCKET;
+ fp->f_flag = fflag;
+ fp->f_ops = &socketops;
+ fp->f_data = (caddr_t)so;
+ sa = 0;
+ (void) soaccept(so, &sa);
+ if (sa == 0) {
+ namelen = 0;
+ if (uap->name)
+ goto gotnoname;
+ return 0;
+ }
+ if (uap->name) {
+ /* check sa_len before it is destroyed */
+ if (namelen > sa->sa_len)
+ namelen = sa->sa_len;
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family =
+ sa->sa_family;
+#endif
+ error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
+ if (!error)
+gotnoname:
+ error = copyout((caddr_t)&namelen,
+ (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
+ }
+ FREE(sa, M_SONAME);
+ splx(s);
+ return (error);
+}
+
+int
+accept(p, uap)
+ struct proc *p;
+ struct accept_args *uap;
+{
+
+ return (accept1(p, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+oaccept(p, uap)
+ struct proc *p;
+ struct accept_args *uap;
+{
+
+ return (accept1(p, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/* ARGSUSED */
+int
+connect(p, uap)
+ struct proc *p;
+ register struct connect_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct file *fp;
+ register struct socket *so;
+ struct sockaddr *sa;
+ int error, s;
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ so = (struct socket *)fp->f_data;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
+ return (EALREADY);
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error)
+ return (error);
+ error = soconnect(so, sa, p);
+ if (error)
+ goto bad;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
+ FREE(sa, M_SONAME);
+ return (EINPROGRESS);
+ }
+ s = splnet();
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
+ "connec", 0);
+ if (error)
+ break;
+ }
+ if (error == 0) {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ splx(s);
+bad:
+ so->so_state &= ~SS_ISCONNECTING;
+ FREE(sa, M_SONAME);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+int
+socketpair(p, uap)
+ struct proc *p;
+ register struct socketpair_args /* {
+ int domain;
+ int type;
+ int protocol;
+ int *rsv;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct file *fp1, *fp2;
+ struct socket *so1, *so2;
+ int fd, error, sv[2];
+
+ error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
+ if (error)
+ return (error);
+ error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
+ if (error)
+ goto free1;
+ error = falloc(p, &fp1, &fd);
+ if (error)
+ goto free2;
+ sv[0] = fd;
+ fp1->f_flag = FREAD|FWRITE;
+ fp1->f_type = DTYPE_SOCKET;
+ fp1->f_ops = &socketops;
+ fp1->f_data = (caddr_t)so1;
+ error = falloc(p, &fp2, &fd);
+ if (error)
+ goto free3;
+ fp2->f_flag = FREAD|FWRITE;
+ fp2->f_type = DTYPE_SOCKET;
+ fp2->f_ops = &socketops;
+ fp2->f_data = (caddr_t)so2;
+ sv[1] = fd;
+ error = soconnect2(so1, so2);
+ if (error)
+ goto free4;
+ if (uap->type == SOCK_DGRAM) {
+ /*
+ * Datagram socket connection is asymmetric.
+ */
+ error = soconnect2(so2, so1);
+ if (error)
+ goto free4;
+ }
+ error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
+ return (error);
+free4:
+ ffree(fp2);
+ fdp->fd_ofiles[sv[1]] = 0;
+free3:
+ ffree(fp1);
+ fdp->fd_ofiles[sv[0]] = 0;
+free2:
+ (void)soclose(so2);
+free1:
+ (void)soclose(so1);
+ return (error);
+}
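
The asymmetric second soconnect2() above is what makes both directions of
a datagram pair usable; from user space the asymmetry is invisible. A
minimal sketch:

	#include <sys/socket.h>
	#include <unistd.h>

	static void
	pair_demo(void)
	{
		int sv[2];
		char buf[4];

		if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) != 0)
			return;
		(void)write(sv[0], "ping", 4);	/* either direction works */
		(void)read(sv[1], buf, sizeof buf);
		(void)close(sv[0]);
		(void)close(sv[1]);
	}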
+
+static int
+sendit(p, s, mp, flags)
+ register struct proc *p;
+ int s;
+ register struct msghdr *mp;
+ int flags;
+{
+ struct file *fp;
+ struct uio auio;
+ register struct iovec *iov;
+ register int i;
+ struct mbuf *control;
+ struct sockaddr *to;
+ int len, error;
+ struct socket *so;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+#endif
+
+ error = getsock(p->p_fd, s, &fp);
+ if (error)
+ return (error);
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_procp = p;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0)
+ return (EINVAL);
+ }
+ if (mp->msg_name) {
+ error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
+ if (error)
+ return (error);
+ } else
+ to = 0;
+ if (mp->msg_control) {
+ if (mp->msg_controllen < sizeof(struct cmsghdr)
+#ifdef COMPAT_OLDSOCK
+ && mp->msg_flags != MSG_COMPAT
+#endif
+ ) {
+ error = EINVAL;
+ goto bad;
+ }
+ error = sockargs(&control, mp->msg_control,
+ mp->msg_controllen, MT_CONTROL);
+ if (error)
+ goto bad;
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags == MSG_COMPAT) {
+ register struct cmsghdr *cm;
+
+ M_PREPEND(control, sizeof(*cm), M_WAIT);
+ if (control == 0) {
+ error = ENOBUFS;
+ goto bad;
+ } else {
+ cm = mtod(control, struct cmsghdr *);
+ cm->cmsg_len = control->m_len;
+ cm->cmsg_level = SOL_SOCKET;
+ cm->cmsg_type = SCM_RIGHTS;
+ }
+ }
+#endif
+ } else
+ control = 0;
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_GENIO)) {
+ int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
+
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ }
+#endif
+ len = auio.uio_resid;
+ so = (struct socket *)fp->f_data;
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
+ flags, p);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (error == EPIPE)
+ psignal(p, SIGPIPE);
+ }
+ if (error == 0)
+ p->p_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0)
+ ktrgenio(p->p_tracep, s, UIO_WRITE,
+ ktriov, p->p_retval[0], error);
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+bad:
+ if (to)
+ FREE(to, M_SONAME);
+ return (error);
+}
+
+int
+sendto(p, uap)
+ struct proc *p;
+ register struct sendto_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t to;
+ int tolen;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = uap->to;
+ msg.msg_namelen = uap->tolen;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = 0;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ return (sendit(p, uap->s, &msg, uap->flags));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+osend(p, uap)
+ struct proc *p;
+ register struct osend_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = 0;
+ return (sendit(p, uap->s, &msg, uap->flags));
+}
+
+int
+osendmsg(p, uap)
+ struct proc *p;
+ register struct osendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+ error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ msg.msg_flags = MSG_COMPAT;
+ msg.msg_iov = iov;
+ error = sendit(p, uap->s, &msg, uap->flags);
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+sendmsg(p, uap)
+ struct proc *p;
+ register struct sendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+ if (msg.msg_iovlen &&
+ (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
+ goto done;
+ msg.msg_iov = iov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ error = sendit(p, uap->s, &msg, uap->flags);
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
+
+static int
+recvit(p, s, mp, namelenp)
+ register struct proc *p;
+ int s;
+ register struct msghdr *mp;
+ caddr_t namelenp;
+{
+ struct file *fp;
+ struct uio auio;
+ register struct iovec *iov;
+ register int i;
+ int len, error;
+ struct mbuf *m, *control = 0;
+ caddr_t ctlbuf;
+ struct socket *so;
+ struct sockaddr *fromsa = 0;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+#endif
+
+ error = getsock(p->p_fd, s, &fp);
+ if (error)
+ return (error);
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_procp = p;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0)
+ return (EINVAL);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_GENIO)) {
+ int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
+
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ }
+#endif
+ len = auio.uio_resid;
+ so = (struct socket *)fp->f_data;
+ error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
+ (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
+ &mp->msg_flags);
+ if (error) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0)
+ ktrgenio(p->p_tracep, s, UIO_READ,
+ ktriov, len - auio.uio_resid, error);
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ if (error)
+ goto out;
+ p->p_retval[0] = len - auio.uio_resid;
+ if (mp->msg_name) {
+ len = mp->msg_namelen;
+ if (len <= 0 || fromsa == 0)
+ len = 0;
+ else {
+#ifndef MIN
+#define MIN(a,b) ((a)>(b)?(b):(a))
+#endif
+ /* save sa_len before it is destroyed by MSG_COMPAT */
+ len = MIN(len, fromsa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ ((struct osockaddr *)fromsa)->sa_family =
+ fromsa->sa_family;
+#endif
+ error = copyout(fromsa,
+ (caddr_t)mp->msg_name, (unsigned)len);
+ if (error)
+ goto out;
+ }
+ mp->msg_namelen = len;
+ if (namelenp &&
+ (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ error = 0; /* old recvfrom didn't check */
+ else
+#endif
+ goto out;
+ }
+ }
+ if (mp->msg_control) {
+#ifdef COMPAT_OLDSOCK
+ /*
+ * We assume that old recvmsg calls won't receive access
+ * rights and other control info, esp. as control info
+ * is always optional and those options didn't exist in 4.3.
+ * If we receive rights, trim the cmsghdr; anything else
+ * is tossed.
+ */
+ if (control && mp->msg_flags & MSG_COMPAT) {
+ if (mtod(control, struct cmsghdr *)->cmsg_level !=
+ SOL_SOCKET ||
+ mtod(control, struct cmsghdr *)->cmsg_type !=
+ SCM_RIGHTS) {
+ mp->msg_controllen = 0;
+ goto out;
+ }
+ control->m_len -= sizeof (struct cmsghdr);
+ control->m_data += sizeof (struct cmsghdr);
+ }
+#endif
+ len = mp->msg_controllen;
+ m = control;
+ mp->msg_controllen = 0;
+ ctlbuf = (caddr_t) mp->msg_control;
+
+ while (m && len > 0) {
+ unsigned int tocopy;
+
+ if (len >= m->m_len)
+ tocopy = m->m_len;
+ else {
+ mp->msg_flags |= MSG_CTRUNC;
+ tocopy = len;
+ }
+
+			error = copyout(mtod(m, caddr_t), ctlbuf, tocopy);
+			if (error)
+				goto out;
+
+ ctlbuf += tocopy;
+ len -= tocopy;
+ m = m->m_next;
+ }
+ mp->msg_controllen = ctlbuf - mp->msg_control;
+ }
+out:
+ if (fromsa)
+ FREE(fromsa, M_SONAME);
+ if (control)
+ m_freem(control);
+ return (error);
+}
+
+int
+recvfrom(p, uap)
+ struct proc *p;
+ register struct recvfrom_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t from;
+ int *fromlenaddr;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ if (uap->fromlenaddr) {
+ error = copyin((caddr_t)uap->fromlenaddr,
+ (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
+ if (error)
+ return (error);
+ } else
+ msg.msg_namelen = 0;
+ msg.msg_name = uap->from;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+orecvfrom(p, uap)
+ struct proc *p;
+ struct recvfrom_args *uap;
+{
+
+ uap->flags |= MSG_COMPAT;
+ return (recvfrom(p, uap));
+}
+#endif
+
+
+#ifdef COMPAT_OLDSOCK
+int
+orecv(p, uap)
+ struct proc *p;
+ register struct orecv_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(p, uap->s, &msg, (caddr_t)0));
+}
+
+/*
+ * Old recvmsg. This code takes advantage of the fact that the old msghdr
+ * overlays the new one, missing only the flags, and with the (old) access
+ * rights where the control fields are now.
+ */
+int
+orecvmsg(p, uap)
+ struct proc *p;
+ register struct orecvmsg_args /* {
+ int s;
+ struct omsghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
+ sizeof (struct omsghdr));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+ msg.msg_flags = uap->flags | MSG_COMPAT;
+ error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ msg.msg_iov = iov;
+ error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
+
+ if (msg.msg_controllen && error == 0)
+ error = copyout((caddr_t)&msg.msg_controllen,
+ (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
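+
+/*
+ * Illustrative sketch, assuming the historical 4.3BSD header layouts:
+ * the overlay trick above works because omsghdr shares its leading
+ * members with the modern msghdr:
+ *
+ *	struct omsghdr {			corresponding msghdr member:
+ *		caddr_t	msg_name;		msg_name
+ *		int	msg_namelen;		msg_namelen
+ *		struct	iovec *msg_iov;		msg_iov
+ *		int	msg_iovlen;		msg_iovlen
+ *		caddr_t	msg_accrights;		msg_control
+ *		int	msg_accrightslen;	msg_controllen
+ *	};					(msg_flags has no counterpart)
+ *
+ * Copying sizeof(struct omsghdr) bytes from the user thus lands the old
+ * access-rights fields on msg_control/msg_controllen; msg_flags alone
+ * must be filled in by hand, which is why it is set explicitly above
+ * before calling recvit().
+ */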
+#endif
+
+int
+recvmsg(p, uap)
+ struct proc *p;
+ register struct recvmsg_args /* {
+ int s;
+ struct msghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
+ register int error;
+
+ error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = uap->flags &~ MSG_COMPAT;
+#else
+ msg.msg_flags = uap->flags;
+#endif
+ uiov = msg.msg_iov;
+ msg.msg_iov = iov;
+ error = copyin((caddr_t)uiov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ error = recvit(p, uap->s, &msg, (caddr_t)0);
+ if (!error) {
+ msg.msg_iov = uiov;
+ error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
+ }
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+shutdown(p, uap)
+ struct proc *p;
+ register struct shutdown_args /* {
+ int s;
+ int how;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ return (soshutdown((struct socket *)fp->f_data, uap->how));
+}
+
+/* ARGSUSED */
+int
+setsockopt(p, uap)
+ struct proc *p;
+ register struct setsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int valsize;
+ } */ *uap;
+{
+ struct file *fp;
+ struct sockopt sopt;
+ int error;
+
+ if (uap->val == 0 && uap->valsize != 0)
+ return (EFAULT);
+ if (uap->valsize < 0)
+ return (EINVAL);
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = uap->level;
+ sopt.sopt_name = uap->name;
+ sopt.sopt_val = uap->val;
+ sopt.sopt_valsize = uap->valsize;
+ sopt.sopt_p = p;
+
+ return (sosetopt((struct socket *)fp->f_data, &sopt));
+}
+
+/* ARGSUSED */
+int
+getsockopt(p, uap)
+ struct proc *p;
+ register struct getsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int *avalsize;
+ } */ *uap;
+{
+ int valsize, error;
+ struct file *fp;
+ struct sockopt sopt;
+
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ return (error);
+ if (uap->val) {
+ error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
+ sizeof (valsize));
+ if (error)
+ return (error);
+ if (valsize < 0)
+ return (EINVAL);
+ } else
+ valsize = 0;
+
+ sopt.sopt_dir = SOPT_GET;
+ sopt.sopt_level = uap->level;
+ sopt.sopt_name = uap->name;
+ sopt.sopt_val = uap->val;
+ sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
+ sopt.sopt_p = p;
+
+ error = sogetopt((struct socket *)fp->f_data, &sopt);
+ if (error == 0) {
+ valsize = sopt.sopt_valsize;
+ error = copyout((caddr_t)&valsize,
+ (caddr_t)uap->avalsize, sizeof (valsize));
+ }
+ return (error);
+}
+
+/*
+ * Get socket name.
+ */
+/* ARGSUSED */
+static int
+getsockname1(p, uap, compat)
+ struct proc *p;
+ register struct getsockname_args /* {
+ int fdes;
+ caddr_t asa;
+ int *alen;
+ } */ *uap;
+ int compat;
+{
+ struct file *fp;
+ register struct socket *so;
+ struct sockaddr *sa;
+ int len, error;
+
+ error = getsock(p->p_fd, uap->fdes, &fp);
+ if (error)
+ return (error);
+ error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
+ if (error)
+ return (error);
+ so = (struct socket *)fp->f_data;
+ sa = 0;
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
+ if (error)
+ goto bad;
+ if (sa == 0) {
+ len = 0;
+ goto gotnothing;
+ }
+
+ len = MIN(len, sa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
+ if (error == 0)
+gotnothing:
+ error = copyout((caddr_t)&len, (caddr_t)uap->alen,
+ sizeof (len));
+bad:
+ if (sa)
+ FREE(sa, M_SONAME);
+ return (error);
+}
+
+int
+getsockname(p, uap)
+ struct proc *p;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(p, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetsockname(p, uap)
+ struct proc *p;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(p, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/*
+ * Get name of peer for connected socket.
+ */
+/* ARGSUSED */
+static int
+getpeername1(p, uap, compat)
+ struct proc *p;
+ register struct getpeername_args /* {
+ int fdes;
+ caddr_t asa;
+ int *alen;
+ } */ *uap;
+ int compat;
+{
+ struct file *fp;
+ register struct socket *so;
+ struct sockaddr *sa;
+ int len, error;
+
+ error = getsock(p->p_fd, uap->fdes, &fp);
+ if (error)
+ return (error);
+ so = (struct socket *)fp->f_data;
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
+ return (ENOTCONN);
+ error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
+ if (error)
+ return (error);
+ sa = 0;
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
+ if (error)
+ goto bad;
+ if (sa == 0) {
+ len = 0;
+ goto gotnothing;
+ }
+ len = MIN(len, sa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family =
+ sa->sa_family;
+#endif
+ error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
+ if (error)
+ goto bad;
+gotnothing:
+ error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
+bad:
+	if (sa)
+		FREE(sa, M_SONAME);
+ return (error);
+}
+
+int
+getpeername(p, uap)
+ struct proc *p;
+ struct getpeername_args *uap;
+{
+
+ return (getpeername1(p, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetpeername(p, uap)
+ struct proc *p;
+ struct ogetpeername_args *uap;
+{
+
+ /* XXX uap should have type `getpeername_args *' to begin with. */
+ return (getpeername1(p, (struct getpeername_args *)uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+int
+sockargs(mp, buf, buflen, type)
+ struct mbuf **mp;
+ caddr_t buf;
+ int buflen, type;
+{
+ register struct sockaddr *sa;
+ register struct mbuf *m;
+ int error;
+
+ if ((u_int)buflen > MLEN) {
+#ifdef COMPAT_OLDSOCK
+ if (type == MT_SONAME && (u_int)buflen <= 112)
+ buflen = MLEN; /* unix domain compat. hack */
+ else
+#endif
+ return (EINVAL);
+ }
+ m = m_get(M_WAIT, type);
+ if (m == NULL)
+ return (ENOBUFS);
+ m->m_len = buflen;
+ error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
+ if (error)
+ (void) m_free(m);
+ else {
+ *mp = m;
+ if (type == MT_SONAME) {
+ sa = mtod(m, struct sockaddr *);
+
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = buflen;
+ }
+ }
+ return (error);
+}
+
+int
+getsockaddr(namp, uaddr, len)
+ struct sockaddr **namp;
+ caddr_t uaddr;
+ size_t len;
+{
+ struct sockaddr *sa;
+ int error;
+
+ if (len > SOCK_MAXADDRLEN)
+ return ENAMETOOLONG;
+ MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
+ error = copyin(uaddr, sa, len);
+ if (error) {
+ FREE(sa, M_SONAME);
+ } else {
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = len;
+ *namp = sa;
+ }
+ return error;
+}
+
+int
+getsock(fdp, fdes, fpp)
+ struct filedesc *fdp;
+ int fdes;
+ struct file **fpp;
+{
+ register struct file *fp;
+
+ if ((unsigned)fdes >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fdes]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_SOCKET)
+ return (ENOTSOCK);
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
+ * XXX - The sf_buf functions are currently private to sendfile(2), so have
+ * been made static, but may be useful in the future for doing zero-copy in
+ * other parts of the networking code.
+ */
+static void
+sf_buf_init(void *arg)
+{
+ int i;
+
+ SLIST_INIT(&sf_freelist);
+ sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
+ sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
+ bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
+ for (i = 0; i < nsfbufs; i++) {
+ sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
+ }
+}
+
+/*
+ * Get an sf_buf from the freelist. Will block if none are available.
+ */
+static struct sf_buf *
+sf_buf_alloc()
+{
+ struct sf_buf *sf;
+ int s;
+
+ s = splimp();
+ while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
+ sf_buf_alloc_want = 1;
+ tsleep(&sf_freelist, PVM, "sfbufa", 0);
+ }
+ SLIST_REMOVE_HEAD(&sf_freelist, free_list);
+ splx(s);
+ sf->refcnt = 1;
+ return (sf);
+}
+
+#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
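+/*
+ * dtosf() inverts the mapping set up in sf_buf_init(), where
+ * sf_bufs[i].kva == sf_base + i * PAGE_SIZE.  Subtracting sf_base from
+ * an address inside the pool and shifting right by PAGE_SHIFT recovers
+ * the index; e.g. with 4K pages, an address of sf_base + 0x3010 lies in
+ * page 3 and yields &sf_bufs[3].
+ */
+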
+static void
+sf_buf_ref(caddr_t addr, u_int size)
+{
+ struct sf_buf *sf;
+
+ sf = dtosf(addr);
+ if (sf->refcnt == 0)
+ panic("sf_buf_ref: referencing a free sf_buf");
+ sf->refcnt++;
+}
+
+/*
+ * Lose a reference to an sf_buf. When none left, detach mapped page
+ * and release resources back to the system.
+ *
+ * Must be called at splimp.
+ */
+static void
+sf_buf_free(caddr_t addr, u_int size)
+{
+ struct sf_buf *sf;
+ struct vm_page *m;
+ int s;
+
+ sf = dtosf(addr);
+ if (sf->refcnt == 0)
+ panic("sf_buf_free: freeing free sf_buf");
+ sf->refcnt--;
+ if (sf->refcnt == 0) {
+ pmap_qremove((vm_offset_t)addr, 1);
+ m = sf->m;
+ s = splvm();
+ vm_page_unwire(m, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (m->wire_count == 0 && m->object == NULL)
+ vm_page_free(m);
+ splx(s);
+ sf->m = NULL;
+ SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
+ if (sf_buf_alloc_want) {
+ sf_buf_alloc_want = 0;
+ wakeup(&sf_freelist);
+ }
+ }
+}
+
+/*
+ * sendfile(2).
+ * int sendfile(int fd, int s, off_t offset, size_t nbytes,
+ * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
+ *
+ * Send a file specified by 'fd' and starting at 'offset' to a socket
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if
+ * nbytes == 0. Optionally add a header and/or trailer to the socket
+ * output. If specified, write the total number of bytes sent into *sbytes.
+ */
+int
+sendfile(struct proc *p, struct sendfile_args *uap)
+{
+ struct file *fp;
+ struct filedesc *fdp = p->p_fd;
+ struct vnode *vp;
+ struct vm_object *obj;
+ struct socket *so;
+ struct mbuf *m;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ struct writev_args nuap;
+ struct sf_hdtr hdtr;
+ off_t off, xfsize, sbytes = 0;
+ int error = 0, s;
+
+ /*
+ * Do argument checking. Must be a regular file in, stream
+ * type and connected socket out, positive offset.
+ */
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ goto done;
+ }
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EINVAL;
+ goto done;
+ }
+ vp = (struct vnode *)fp->f_data;
+ obj = vp->v_object;
+ if (vp->v_type != VREG || obj == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ goto done;
+ so = (struct socket *)fp->f_data;
+ if (so->so_type != SOCK_STREAM) {
+ error = EINVAL;
+ goto done;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+ if (uap->offset < 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * If specified, get the pointer to the sf_hdtr struct for
+ * any headers/trailers.
+ */
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error)
+ goto done;
+ /*
+ * Send any headers. Wimp out and use writev(2).
+ */
+ if (hdtr.headers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.headers;
+ nuap.iovcnt = hdtr.hdr_cnt;
+ error = writev(p, &nuap);
+ if (error)
+ goto done;
+ sbytes += p->p_retval[0];
+ }
+ }
+
+ /*
+ * Protect against multiple writers to the socket.
+ */
+ (void) sblock(&so->so_snd, M_WAITOK);
+
+ /*
+ * Loop through the pages in the file, starting with the requested
+ * offset. Get a file page (do I/O if necessary), map the file page
+ * into an sf_buf, attach an mbuf header to the sf_buf, and queue
+ * it on the socket.
+ */
+ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+
+ pindex = OFF_TO_IDX(off);
+retry_lookup:
+ /*
+ * Calculate the amount to transfer. Not to exceed a page,
+ * the EOF, or the passed in nbytes.
+ */
+ xfsize = obj->un_pager.vnp.vnp_size - off;
+ if (xfsize > PAGE_SIZE)
+ xfsize = PAGE_SIZE;
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ if (PAGE_SIZE - pgoff < xfsize)
+ xfsize = PAGE_SIZE - pgoff;
+ if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
+ xfsize = uap->nbytes - sbytes;
+ if (xfsize <= 0)
+ break;
+ /*
+ * Optimize the non-blocking case by looking at the socket space
+ * before going to the extra work of constituting the sf_buf.
+ */
+ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
+ if (so->so_state & SS_CANTSENDMORE)
+ error = EPIPE;
+ else
+ error = EAGAIN;
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ /*
+ * Attempt to look up the page. If the page doesn't exist or the
+ * part we're interested in isn't valid, then read it from disk.
+ * If some other part of the kernel has this page (i.e. it's busy),
+		 * then disk I/O may be occurring on it, so wait and retry.
+ */
+ pg = vm_page_lookup(obj, pindex);
+ if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
+ !vm_page_is_valid(pg, pgoff, xfsize))) {
+ struct uio auio;
+ struct iovec aiov;
+ int bsize;
+
+ if (pg == NULL) {
+ pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
+ if (pg == NULL) {
+ VM_WAIT;
+ goto retry_lookup;
+ }
+ /*
+ * don't just clear PG_BUSY manually -
+ * vm_page_alloc() should be considered opaque,
+ * use the VM routine provided to clear
+ * PG_BUSY.
+ */
+ vm_page_wakeup(pg);
+ }
+ /*
+ * Ensure that our page is still around when the I/O completes.
+ */
+ vm_page_io_start(pg);
+ vm_page_wire(pg);
+ /*
+ * Get the page from backing store.
+ */
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = 0;
+ aiov.iov_len = MAXBSIZE;
+ auio.uio_resid = MAXBSIZE;
+ auio.uio_offset = trunc_page(off);
+ auio.uio_segflg = UIO_NOCOPY;
+ auio.uio_rw = UIO_READ;
+ auio.uio_procp = p;
+ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
+ error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
+ p->p_ucred);
+ VOP_UNLOCK(vp, 0, p);
+ vm_page_flag_clear(pg, PG_ZERO);
+ vm_page_io_finish(pg);
+ if (error) {
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about this page.
+ * If not and it is not valid, then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ pg->busy == 0 && !(pg->flags & PG_BUSY) &&
+ pg->hold_count == 0)
+ vm_page_free(pg);
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ } else {
+ if (vm_page_sleep_busy(pg, TRUE, "sfpbsy"))
+ goto retry_lookup;
+
+ /*
+ * Protect from having the page ripped out from
+ * beneath us.
+ */
+ vm_page_wire(pg);
+ }
+ /*
+ * Allocate a kernel virtual page and insert the physical page
+ * into it.
+ */
+ sf = sf_buf_alloc();
+ sf->m = pg;
+ pmap_qenter(sf->kva, &pg, 1);
+ /*
+ * Get an mbuf header and set it up as having external storage.
+ */
+ MGETHDR(m, M_WAIT, MT_DATA);
+ m->m_ext.ext_free = sf_buf_free;
+ m->m_ext.ext_ref = sf_buf_ref;
+ m->m_ext.ext_buf = (void *)sf->kva;
+ m->m_ext.ext_size = PAGE_SIZE;
+ m->m_data = (char *) sf->kva + pgoff;
+ m->m_flags |= M_EXT;
+ m->m_pkthdr.len = m->m_len = xfsize;
+ /*
+ * Add the buffer to the socket buffer chain.
+ */
+ s = splnet();
+retry_space:
+ /*
+ * Make sure that the socket is still able to take more data.
+ * CANTSENDMORE being true usually means that the connection
+ * was closed. so_error is true when an error was sensed after
+ * a previous send.
+ * The state is checked after the page mapping and buffer
+ * allocation above since those operations may block and make
+ * any socket checks stale. From this point forward, nothing
+ * blocks before the pru_send (or more accurately, any blocking
+ * results in a loop back to here to re-check).
+ */
+ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ } else {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ /*
+ * Wait for socket space to become available. We do this just
+ * after checking the connection state above in order to avoid
+ * a race condition with sbwait().
+ */
+ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
+ if (so->so_state & SS_NBIO) {
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ error = EAGAIN;
+ goto done;
+ }
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
+ */
+ if (error) {
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ goto retry_space;
+ }
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
+ splx(s);
+ if (error) {
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ }
+ sbunlock(&so->so_snd);
+
+ /*
+ * Send trailers. Wimp out and use writev(2).
+ */
+ if (uap->hdtr != NULL && hdtr.trailers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.trailers;
+ nuap.iovcnt = hdtr.trl_cnt;
+ error = writev(p, &nuap);
+ if (error)
+ goto done;
+ sbytes += p->p_retval[0];
+ }
+
+done:
+ if (uap->sbytes != NULL) {
+ copyout(&sbytes, uap->sbytes, sizeof(off_t));
+ }
+ return (error);
+}
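+
+/*
+ * Usage sketch (userland, illustrative only): a caller might send an
+ * entire file over a connected stream socket with:
+ *
+ *	off_t sbytes;
+ *	int fd = open("/some/file", O_RDONLY);
+ *
+ *	if (sendfile(fd, s, 0, 0, NULL, &sbytes, 0) == -1)
+ *		err(1, "sendfile");
+ *	printf("sent %qd bytes\n", (quad_t)sbytes);
+ *
+ * nbytes == 0 requests the remainder of the file, and hdtr may point to
+ * a struct sf_hdtr carrying iovecs to be sent before and after the file
+ * data.
+ */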
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
new file mode 100644
index 0000000..abdb71e
--- /dev/null
+++ b/sys/kern/uipc_usrreq.c
@@ -0,0 +1,1186 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
+ * $Id: uipc_usrreq.c,v 1.38 1999/01/21 08:29:04 dillon Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h> /* XXX must be before <sys/file.h> */
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/vnode.h>
+
+#include <vm/vm_zone.h>
+
+struct vm_zone *unp_zone;
+static unp_gen_t unp_gencnt;
+static u_int unp_count;
+
+static struct unp_head unp_shead, unp_dhead;
+
+/*
+ * Unix communications domain.
+ *
+ * TODO:
+ * SEQPACKET, RDM
+ * rethink name space problems
+ * need a proper out-of-band
+ * lock pushdown
+ */
+static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
+static ino_t unp_ino; /* prototype for fake inode numbers */
+
+static int unp_attach __P((struct socket *));
+static void unp_detach __P((struct unpcb *));
+static int unp_bind __P((struct unpcb *,struct sockaddr *, struct proc *));
+static int unp_connect __P((struct socket *,struct sockaddr *,
+ struct proc *));
+static void unp_disconnect __P((struct unpcb *));
+static void unp_shutdown __P((struct unpcb *));
+static void unp_drop __P((struct unpcb *, int));
+static void unp_gc __P((void));
+static void unp_scan __P((struct mbuf *, void (*)(struct file *)));
+static void unp_mark __P((struct file *));
+static void unp_discard __P((struct file *));
+static int unp_internalize __P((struct mbuf *, struct proc *));
+
+static int
+uipc_abort(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ unp_drop(unp, ECONNABORTED);
+ return 0;
+}
+
+static int
+uipc_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ /*
+ * Pass back name of connected socket,
+ * if it was bound and we are still connected
+ * (our peer may have closed already!).
+ */
+ if (unp->unp_conn && unp->unp_conn->unp_addr) {
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
+ 1);
+ } else {
+ *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
+ }
+ return 0;
+}
+
+static int
+uipc_attach(struct socket *so, int proto, struct proc *p)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp != 0)
+ return EISCONN;
+ return unp_attach(so);
+}
+
+static int
+uipc_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ return unp_bind(unp, nam, p);
+}
+
+static int
+uipc_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+	return unp_connect(so, nam, p);
+}
+
+static int
+uipc_connect2(struct socket *so1, struct socket *so2)
+{
+ struct unpcb *unp = sotounpcb(so1);
+
+ if (unp == 0)
+ return EINVAL;
+
+ return unp_connect2(so1, so2);
+}
+
+/* control is EOPNOTSUPP */
+
+static int
+uipc_detach(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+
+ unp_detach(unp);
+ return 0;
+}
+
+static int
+uipc_disconnect(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ unp_disconnect(unp);
+ return 0;
+}
+
+static int
+uipc_listen(struct socket *so, struct proc *p)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0 || unp->unp_vnode == 0)
+ return EINVAL;
+ return 0;
+}
+
+static int
+uipc_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ if (unp->unp_conn && unp->unp_conn->unp_addr)
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
+ 1);
+ return 0;
+}
+
+static int
+uipc_rcvd(struct socket *so, int flags)
+{
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+
+ if (unp == 0)
+ return EINVAL;
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ panic("uipc_rcvd DGRAM?");
+ /*NOTREACHED*/
+
+ case SOCK_STREAM:
+#define rcv (&so->so_rcv)
+#define snd (&so2->so_snd)
+ if (unp->unp_conn == 0)
+ break;
+ so2 = unp->unp_conn->unp_socket;
+ /*
+ * Adjust backpressure on sender
+ * and wakeup any waiting to write.
+ */
+ snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
+ unp->unp_mbcnt = rcv->sb_mbcnt;
+ snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
+ unp->unp_cc = rcv->sb_cc;
+ sowwakeup(so2);
+#undef snd
+#undef rcv
+ break;
+
+ default:
+ panic("uipc_rcvd unknown socktype");
+ }
+ return 0;
+}
+
+/* pru_rcvoob is EOPNOTSUPP */
+
+static int
+uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct proc *p)
+{
+ int error = 0;
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+
+ if (unp == 0) {
+ error = EINVAL;
+ goto release;
+ }
+ if (flags & PRUS_OOB) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+
+ if (control && (error = unp_internalize(control, p)))
+ goto release;
+
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ {
+ struct sockaddr *from;
+
+ if (nam) {
+ if (unp->unp_conn) {
+ error = EISCONN;
+ break;
+ }
+ error = unp_connect(so, nam, p);
+ if (error)
+ break;
+ } else {
+ if (unp->unp_conn == 0) {
+ error = ENOTCONN;
+ break;
+ }
+ }
+ so2 = unp->unp_conn->unp_socket;
+ if (unp->unp_addr)
+ from = (struct sockaddr *)unp->unp_addr;
+ else
+ from = &sun_noname;
+ if (sbappendaddr(&so2->so_rcv, from, m, control)) {
+ sorwakeup(so2);
+ m = 0;
+ control = 0;
+ } else
+ error = ENOBUFS;
+ if (nam)
+ unp_disconnect(unp);
+ break;
+ }
+
+ case SOCK_STREAM:
+#define rcv (&so2->so_rcv)
+#define snd (&so->so_snd)
+ /* Connect if not connected yet. */
+ /*
+ * Note: A better implementation would complain
+ * if not equal to the peer's address.
+ */
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam) {
+ error = unp_connect(so, nam, p);
+ if (error)
+ break; /* XXX */
+ } else {
+ error = ENOTCONN;
+ break;
+ }
+ }
+
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+ if (unp->unp_conn == 0)
+ panic("uipc_send connected but no connection?");
+ so2 = unp->unp_conn->unp_socket;
+ /*
+ * Send to paired receive port, and then reduce
+ * send buffer hiwater marks to maintain backpressure.
+ * Wake up readers.
+ */
+ if (control) {
+ if (sbappendcontrol(rcv, m, control))
+ control = 0;
+ } else
+ sbappend(rcv, m);
+ snd->sb_mbmax -=
+ rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
+ unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
+ snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
+ unp->unp_conn->unp_cc = rcv->sb_cc;
+ sorwakeup(so2);
+ m = 0;
+#undef snd
+#undef rcv
+ break;
+
+ default:
+ panic("uipc_send unknown socktype");
+ }
+
+ /*
+ * SEND_EOF is equivalent to a SEND followed by
+ * a SHUTDOWN.
+ */
+ if (flags & PRUS_EOF) {
+ socantsendmore(so);
+ unp_shutdown(unp);
+ }
+
+release:
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ return error;
+}
+
+static int
+uipc_sense(struct socket *so, struct stat *sb)
+{
+ struct unpcb *unp = sotounpcb(so);
+ struct socket *so2;
+
+ if (unp == 0)
+ return EINVAL;
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
+ so2 = unp->unp_conn->unp_socket;
+ sb->st_blksize += so2->so_rcv.sb_cc;
+ }
+ sb->st_dev = NODEV;
+ if (unp->unp_ino == 0)
+ unp->unp_ino = unp_ino++;
+ sb->st_ino = unp->unp_ino;
+ return (0);
+}
+
+static int
+uipc_shutdown(struct socket *so)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ socantsendmore(so);
+ unp_shutdown(unp);
+ return 0;
+}
+
+static int
+uipc_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp = sotounpcb(so);
+
+ if (unp == 0)
+ return EINVAL;
+ if (unp->unp_addr)
+ *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1);
+ return 0;
+}
+
+struct pr_usrreqs uipc_usrreqs = {
+ uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
+ uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
+ uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
+ uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
+ sosend, soreceive, sopoll
+};
+
+/*
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering
+ * for stream sockets, although the effective combined buffering for a
+ * sender/receiver pair is only PIPSIZ, because of the backpressure
+ * accounting done in uipc_send() and uipc_rcvd().
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace. Their recvspace should
+ * be large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
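+
+/*
+ * Worked example: with the default PIPSIZ of 8192, a stream sender
+ * starts with 8192 bytes of send space.  uipc_send() above appends data
+ * directly to the peer's receive buffer and lowers the sender's sb_hiwat
+ * by the number of bytes now sitting unread there, so while the receiver
+ * holds 2048 unread bytes the sender sees only 8192 - 2048 = 6144 bytes
+ * of space; uipc_rcvd() raises sb_hiwat back as the data is consumed.
+ * This is why the combined buffering for the pair never exceeds PIPSIZ.
+ */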
+
+static int unp_rights; /* file descriptors in flight */
+
+SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "");
+SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "");
+SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "");
+SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
+static int
+unp_attach(so)
+ struct socket *so;
+{
+ register struct unpcb *unp;
+ int error;
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ switch (so->so_type) {
+
+ case SOCK_STREAM:
+ error = soreserve(so, unpst_sendspace, unpst_recvspace);
+ break;
+
+ case SOCK_DGRAM:
+ error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
+ break;
+
+ default:
+ panic("unp_attach");
+ }
+ if (error)
+ return (error);
+ }
+ unp = zalloc(unp_zone);
+ if (unp == NULL)
+ return (ENOBUFS);
+ bzero(unp, sizeof *unp);
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count++;
+ LIST_INIT(&unp->unp_refs);
+ unp->unp_socket = so;
+ LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
+ : &unp_shead, unp, unp_link);
+ so->so_pcb = (caddr_t)unp;
+ return (0);
+}
+
+static void
+unp_detach(unp)
+ register struct unpcb *unp;
+{
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ --unp_count;
+ if (unp->unp_vnode) {
+ unp->unp_vnode->v_socket = 0;
+ vrele(unp->unp_vnode);
+ unp->unp_vnode = 0;
+ }
+ if (unp->unp_conn)
+ unp_disconnect(unp);
+ while (unp->unp_refs.lh_first)
+ unp_drop(unp->unp_refs.lh_first, ECONNRESET);
+ soisdisconnected(unp->unp_socket);
+ unp->unp_socket->so_pcb = 0;
+ if (unp_rights) {
+ /*
+ * Normally the receive buffer is flushed later,
+ * in sofree, but if our receive buffer holds references
+ * to descriptors that are now garbage, we will dispose
+ * of those descriptor references after the garbage collector
+ * gets them (resulting in a "panic: closef: count < 0").
+ */
+ sorflush(unp->unp_socket);
+ unp_gc();
+ }
+ if (unp->unp_addr)
+ FREE(unp->unp_addr, M_SONAME);
+ zfree(unp_zone, unp);
+}
+
+static int
+unp_bind(unp, nam, p)
+ struct unpcb *unp;
+ struct sockaddr *nam;
+ struct proc *p;
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error, namelen;
+ struct nameidata nd;
+ char buf[SOCK_MAXADDRLEN];
+
+ if (unp->unp_vnode != NULL)
+ return (EINVAL);
+#define offsetof(s, e) ((char *)&((s *)0)->e - (char *)((s *)0))
+ namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+ if (namelen <= 0)
+ return EINVAL;
+ strncpy(buf, soun->sun_path, namelen);
+ buf[namelen] = 0; /* null-terminate the string */
+ NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
+ buf, p);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+ error = namei(&nd);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EADDRINUSE);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VSOCK;
+ vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask);
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+ vp->v_socket = unp->unp_socket;
+ unp->unp_vnode = vp;
+ unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1);
+ VOP_UNLOCK(vp, 0, p);
+ return (0);
+}
+
+static int
+unp_connect(so, nam, p)
+ struct socket *so;
+ struct sockaddr *nam;
+ struct proc *p;
+{
+ register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ register struct vnode *vp;
+ register struct socket *so2, *so3;
+ struct unpcb *unp2, *unp3;
+ int error, len;
+ struct nameidata nd;
+ char buf[SOCK_MAXADDRLEN];
+
+ len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
+ if (len <= 0)
+ return EINVAL;
+ strncpy(buf, soun->sun_path, len);
+ buf[len] = 0;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, p);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VSOCK) {
+ error = ENOTSOCK;
+ goto bad;
+ }
+ error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
+ if (error)
+ goto bad;
+ so2 = vp->v_socket;
+ if (so2 == 0) {
+ error = ECONNREFUSED;
+ goto bad;
+ }
+ if (so->so_type != so2->so_type) {
+ error = EPROTOTYPE;
+ goto bad;
+ }
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+ if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
+ (so3 = sonewconn(so2, 0)) == 0) {
+ error = ECONNREFUSED;
+ goto bad;
+ }
+ unp2 = sotounpcb(so2);
+ unp3 = sotounpcb(so3);
+ if (unp2->unp_addr)
+ unp3->unp_addr = (struct sockaddr_un *)
+ dup_sockaddr((struct sockaddr *)
+ unp2->unp_addr, 1);
+ so2 = so3;
+ }
+ error = unp_connect2(so, so2);
+bad:
+ vput(vp);
+ return (error);
+}
+
+int
+unp_connect2(so, so2)
+ register struct socket *so;
+ register struct socket *so2;
+{
+ register struct unpcb *unp = sotounpcb(so);
+ register struct unpcb *unp2;
+
+ if (so2->so_type != so->so_type)
+ return (EPROTOTYPE);
+ unp2 = sotounpcb(so2);
+ unp->unp_conn = unp2;
+ switch (so->so_type) {
+
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+ soisconnected(so);
+ break;
+
+ case SOCK_STREAM:
+ unp2->unp_conn = unp;
+ soisconnected(so);
+ soisconnected(so2);
+ break;
+
+ default:
+ panic("unp_connect2");
+ }
+ return (0);
+}
+
+static void
+unp_disconnect(unp)
+ struct unpcb *unp;
+{
+ register struct unpcb *unp2 = unp->unp_conn;
+
+ if (unp2 == 0)
+ return;
+ unp->unp_conn = 0;
+ switch (unp->unp_socket->so_type) {
+
+ case SOCK_DGRAM:
+ LIST_REMOVE(unp, unp_reflink);
+ unp->unp_socket->so_state &= ~SS_ISCONNECTED;
+ break;
+
+ case SOCK_STREAM:
+ soisdisconnected(unp->unp_socket);
+ unp2->unp_conn = 0;
+ soisdisconnected(unp2->unp_socket);
+ break;
+ }
+}
+
+#ifdef notdef
+void
+unp_abort(unp)
+ struct unpcb *unp;
+{
+
+ unp_detach(unp);
+}
+#endif
+
+static int
+unp_pcblist SYSCTL_HANDLER_ARGS
+{
+ int error, i, n;
+ struct unpcb *unp, **unp_list;
+ unp_gen_t gencnt;
+ struct xunpgen xug;
+ struct unp_head *head;
+
+ head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
+
+ /*
+ * The process of preparing the PCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = unp_count;
+ req->oldidx = 2 * (sizeof xug)
+ + (n + n/8) * sizeof(struct xunpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ gencnt = unp_gencnt;
+ n = unp_count;
+
+ xug.xug_len = sizeof xug;
+ xug.xug_count = n;
+ xug.xug_gen = gencnt;
+ xug.xug_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xug, sizeof xug);
+ if (error)
+ return error;
+
+ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
+ if (unp_list == 0)
+ return ENOMEM;
+
+ for (unp = head->lh_first, i = 0; unp && i < n;
+ unp = unp->unp_link.le_next) {
+ if (unp->unp_gencnt <= gencnt)
+ unp_list[i++] = unp;
+ }
+	n = i;		/* in case some were removed while we slept in malloc */
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ unp = unp_list[i];
+ if (unp->unp_gencnt <= gencnt) {
+ struct xunpcb xu;
+ xu.xu_len = sizeof xu;
+ xu.xu_unpp = unp;
+ /*
+ * XXX - need more locking here to protect against
+ * connect/disconnect races for SMP.
+ */
+ if (unp->unp_addr)
+ bcopy(unp->unp_addr, &xu.xu_addr,
+ unp->unp_addr->sun_len);
+ if (unp->unp_conn && unp->unp_conn->unp_addr)
+ bcopy(unp->unp_conn->unp_addr,
+ &xu.xu_caddr,
+ unp->unp_conn->unp_addr->sun_len);
+ bcopy(unp, &xu.xu_unp, sizeof *unp);
+ sotoxsocket(unp->unp_socket, &xu.xu_socket);
+ error = SYSCTL_OUT(req, &xu, sizeof xu);
+ }
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ xug.xug_gen = unp_gencnt;
+ xug.xug_sogen = so_gencnt;
+ xug.xug_count = unp_count;
+ error = SYSCTL_OUT(req, &xug, sizeof xug);
+ }
+ free(unp_list, M_TEMP);
+ return error;
+}
+
+SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
+ (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local datagram sockets");
+SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
+ (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local stream sockets");
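+
+/*
+ * Usage sketch (userland, illustrative only; assumes the sysctlbyname(3)
+ * interface): consumers are expected to use the usual two-pass sysctl
+ * idiom, sizing the buffer first and then fetching the data:
+ *
+ *	size_t len;
+ *	void *buf;
+ *
+ *	if (sysctlbyname("net.local.stream.pcblist", NULL, &len, NULL, 0))
+ *		err(1, "sysctl size");
+ *	buf = malloc(len);
+ *	if (sysctlbyname("net.local.stream.pcblist", buf, &len, NULL, 0))
+ *		err(1, "sysctl data");
+ *
+ * The leading and trailing xunpgen records let the reader detect that
+ * the PCB list changed while it was being copied out.
+ */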
+
+static void
+unp_shutdown(unp)
+ struct unpcb *unp;
+{
+ struct socket *so;
+
+ if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
+ (so = unp->unp_conn->unp_socket))
+ socantrcvmore(so);
+}
+
+static void
+unp_drop(unp, errno)
+ struct unpcb *unp;
+ int errno;
+{
+ struct socket *so = unp->unp_socket;
+
+ so->so_error = errno;
+ unp_disconnect(unp);
+ if (so->so_head) {
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count--;
+ so->so_pcb = (caddr_t) 0;
+ if (unp->unp_addr)
+ FREE(unp->unp_addr, M_SONAME);
+ zfree(unp_zone, unp);
+ sofree(so);
+ }
+}
+
+#ifdef notdef
+void
+unp_drain()
+{
+
+}
+#endif
+
+int
+unp_externalize(rights)
+ struct mbuf *rights;
+{
+ struct proc *p = curproc; /* XXX */
+ register int i;
+ register struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
+ register struct file **rp = (struct file **)(cm + 1);
+ register struct file *fp;
+ int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int);
+ int f;
+
+ /*
+	 * If the new FDs will not fit, then we free them all.
+ */
+ if (!fdavail(p, newfds)) {
+ for (i = 0; i < newfds; i++) {
+ fp = *rp;
+ unp_discard(fp);
+ *rp++ = 0;
+ }
+ return (EMSGSIZE);
+ }
+ /*
+ * now change each pointer to an fd in the global table to
+ * an integer that is the index to the local fd table entry
+ * that we set up to point to the global one we are transferring.
+ * XXX this assumes a pointer and int are the same size...!
+ */
+ for (i = 0; i < newfds; i++) {
+ if (fdalloc(p, 0, &f))
+ panic("unp_externalize");
+ fp = *rp;
+ p->p_fd->fd_ofiles[f] = fp;
+ fp->f_msgcount--;
+ unp_rights--;
+ *(int *)rp++ = f;
+ }
+ return (0);
+}
+
+void
+unp_init(void)
+{
+ unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0);
+ if (unp_zone == 0)
+ panic("unp_init");
+ LIST_INIT(&unp_dhead);
+ LIST_INIT(&unp_shead);
+}
+
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+
+static int
+unp_internalize(control, p)
+ struct mbuf *control;
+ struct proc *p;
+{
+ struct filedesc *fdp = p->p_fd;
+ register struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ register struct file **rp;
+ register struct file *fp;
+ register int i, fd;
+ register struct cmsgcred *cmcred;
+ int oldfds;
+
+ if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
+ cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len)
+ return (EINVAL);
+
+ /*
+ * Fill in credential information.
+ */
+ if (cm->cmsg_type == SCM_CREDS) {
+ cmcred = (struct cmsgcred *)(cm + 1);
+ cmcred->cmcred_pid = p->p_pid;
+ cmcred->cmcred_uid = p->p_cred->p_ruid;
+ cmcred->cmcred_gid = p->p_cred->p_rgid;
+ cmcred->cmcred_euid = p->p_ucred->cr_uid;
+ cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
+ CMGROUP_MAX);
+ for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
+ return(0);
+ }
+
+ oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int);
+ /*
+	 * Check that all the FDs passed in refer to legal, open files.
+	 * If not, reject the entire operation.
+ */
+ rp = (struct file **)(cm + 1);
+ for (i = 0; i < oldfds; i++) {
+ fd = *(int *)rp++;
+ if ((unsigned)fd >= fdp->fd_nfiles ||
+ fdp->fd_ofiles[fd] == NULL)
+ return (EBADF);
+ }
+ /*
+ * Now replace the integer FDs with pointers to
+	 * the associated global file table entry.
+ * XXX this assumes a pointer and an int are the same size!
+ */
+ rp = (struct file **)(cm + 1);
+ for (i = 0; i < oldfds; i++) {
+ fp = fdp->fd_ofiles[*(int *)rp];
+ *rp++ = fp;
+ fp->f_count++;
+ fp->f_msgcount++;
+ unp_rights++;
+ }
+ return (0);
+}
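+
+/*
+ * Layout sketch: a SCM_RIGHTS control message is a struct cmsghdr
+ * followed immediately by its payload, e.g. for three descriptors:
+ *
+ *	[ cmsg_len | cmsg_level = SOL_SOCKET | cmsg_type = SCM_RIGHTS ]
+ *	[ fd0 ][ fd1 ][ fd2 ]
+ *
+ * unp_internalize() rewrites each int fd in place with the struct file
+ * pointer it names (bumping f_count and f_msgcount), and
+ * unp_externalize() reverses this on the receiving side by allocating
+ * fresh descriptors with fdalloc().  Both directions rely on the
+ * assumption, flagged XXX above, that an int and a pointer are the
+ * same size.
+ */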
+
+static int unp_defer, unp_gcing;
+
+static void
+unp_gc()
+{
+ register struct file *fp, *nextfp;
+ register struct socket *so;
+ struct file **extra_ref, **fpp;
+ int nunref, i;
+
+ if (unp_gcing)
+ return;
+ unp_gcing = 1;
+ unp_defer = 0;
+ /*
+	 * Before going through all this, set all FDs to
+	 * be NOT deferred and NOT externally accessible.
+ */
+ for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next)
+ fp->f_flag &= ~(FMARK|FDEFER);
+ do {
+ for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) {
+ /*
+ * If the file is not open, skip it
+ */
+ if (fp->f_count == 0)
+ continue;
+ /*
+ * If we already marked it as 'defer' in a
+			 * previous pass, then try to process it this time
+			 * and un-mark it.
+ */
+ if (fp->f_flag & FDEFER) {
+ fp->f_flag &= ~FDEFER;
+ unp_defer--;
+ } else {
+ /*
+				 * If it's not deferred, then check if it's
+				 * already marked; if so, skip it.
+ */
+ if (fp->f_flag & FMARK)
+ continue;
+ /*
+ * If all references are from messages
+				 * in transit, then skip it; it's not
+ * externally accessible.
+ */
+ if (fp->f_count == fp->f_msgcount)
+ continue;
+ /*
+ * If it got this far then it must be
+ * externally accessible.
+ */
+ fp->f_flag |= FMARK;
+ }
+ /*
+			 * Either it was deferred, or it is externally
+ * accessible and not already marked so.
+ * Now check if it is possibly one of OUR sockets.
+ */
+ if (fp->f_type != DTYPE_SOCKET ||
+ (so = (struct socket *)fp->f_data) == 0)
+ continue;
+ if (so->so_proto->pr_domain != &localdomain ||
+ (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+ continue;
+#ifdef notdef
+ if (so->so_rcv.sb_flags & SB_LOCK) {
+ /*
+ * This is problematical; it's not clear
+ * we need to wait for the sockbuf to be
+ * unlocked (on a uniprocessor, at least),
+ * and it's also not clear what to do
+ * if sbwait returns an error due to receipt
+ * of a signal. If sbwait does return
+ * an error, we'll go into an infinite
+ * loop. Delete all of this for now.
+ */
+ (void) sbwait(&so->so_rcv);
+ goto restart;
+ }
+#endif
+ /*
+ * So, Ok, it's one of our sockets and it IS externally
+			 * accessible (or was deferred). Now we look
+ * to see if we hold any file descriptors in its
+ * message buffers. Follow those links and mark them
+ * as accessible too.
+ */
+ unp_scan(so->so_rcv.sb_mb, unp_mark);
+ }
+ } while (unp_defer);
+ /*
+ * We grab an extra reference to each of the file table entries
+ * that are not otherwise accessible and then free the rights
+ * that are stored in messages on them.
+ *
+	 * The bug in the original code is a little tricky, so I'll describe
+ * what's wrong with it here.
+ *
+ * It is incorrect to simply unp_discard each entry for f_msgcount
+ * times -- consider the case of sockets A and B that contain
+ * references to each other. On a last close of some other socket,
+ * we trigger a gc since the number of outstanding rights (unp_rights)
+	 * is non-zero. If during the sweep phase the gc code calls unp_discard,
+ * we end up doing a (full) closef on the descriptor. A closef on A
+ * results in the following chain. Closef calls soo_close, which
+ * calls soclose. Soclose calls first (through the switch
+ * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
+ * returns because the previous instance had set unp_gcing, and
+ * we return all the way back to soclose, which marks the socket
+ * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
+ * to free up the rights that are queued in messages on the socket A,
+ * i.e., the reference on B. The sorflush calls via the dom_dispose
+ * switch unp_dispose, which unp_scans with unp_discard. This second
+ * instance of unp_discard just calls closef on B.
+ *
+ * Well, a similar chain occurs on B, resulting in a sorflush on B,
+ * which results in another closef on A. Unfortunately, A is already
+ * being closed, and the descriptor has already been marked with
+ * SS_NOFDREF, and soclose panics at this point.
+ *
+ * Here, we first take an extra reference to each inaccessible
+ * descriptor. Then, we call sorflush ourself, since we know
+ * it is a Unix domain socket anyhow. After we destroy all the
+ * rights carried in messages, we do a last closef to get rid
+ * of our extra reference. This is the last close, and the
+ * unp_detach etc will shut down the socket.
+ *
+ * 91/09/19, bsy@cs.cmu.edu
+ */
+ extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
+ for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0;
+ fp = nextfp) {
+ nextfp = fp->f_list.le_next;
+ /*
+ * If it's not open, skip it
+ */
+ if (fp->f_count == 0)
+ continue;
+ /*
+ * If all refs are from msgs, and it's not marked accessible
+ * then it must be referenced from some unreachable cycle
+ * of (shut-down) FDs, so include it in our
+ * list of FDs to remove
+ */
+ if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
+ *fpp++ = fp;
+ nunref++;
+ fp->f_count++;
+ }
+ }
+ /*
+ * for each FD on our hit list, do the following two things
+ */
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
+ struct file *tfp = *fpp;
+ if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
+ sorflush((struct socket *)(tfp->f_data));
+ }
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
+ closef(*fpp, (struct proc *) NULL);
+ free((caddr_t)extra_ref, M_FILE);
+ unp_gcing = 0;
+}
+
+void
+unp_dispose(m)
+ struct mbuf *m;
+{
+
+ if (m)
+ unp_scan(m, unp_discard);
+}
+
+static void
+unp_scan(m0, op)
+ register struct mbuf *m0;
+ void (*op) __P((struct file *));
+{
+ register struct mbuf *m;
+ register struct file **rp;
+ register struct cmsghdr *cm;
+ register int i;
+ int qfds;
+
+ while (m0) {
+ for (m = m0; m; m = m->m_next)
+ if (m->m_type == MT_CONTROL &&
+ m->m_len >= sizeof(*cm)) {
+ cm = mtod(m, struct cmsghdr *);
+ if (cm->cmsg_level != SOL_SOCKET ||
+ cm->cmsg_type != SCM_RIGHTS)
+ continue;
+ qfds = (cm->cmsg_len - sizeof *cm)
+ / sizeof (struct file *);
+ rp = (struct file **)(cm + 1);
+ for (i = 0; i < qfds; i++)
+ (*op)(*rp++);
+ break; /* XXX, but saves time */
+ }
+ m0 = m0->m_act;
+ }
+}
+
+static void
+unp_mark(fp)
+ struct file *fp;
+{
+
+ if (fp->f_flag & FMARK)
+ return;
+ unp_defer++;
+ fp->f_flag |= (FMARK|FDEFER);
+}
+
+static void
+unp_discard(fp)
+ struct file *fp;
+{
+
+ fp->f_msgcount--;
+ unp_rights--;
+ (void) closef(fp, (struct proc *)NULL);
+}
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
new file mode 100644
index 0000000..c1af873
--- /dev/null
+++ b/sys/kern/vfs_aio.c
@@ -0,0 +1,2046 @@
+/*
+ * Copyright (c) 1997 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. John S. Dyson's name may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * DISCLAIMER: This code isn't warranted to do anything useful. Anything
+ * bad that happens because of using this software isn't the responsibility
+ * of the author. This software is distributed AS-IS.
+ *
+ * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $
+ */
+
+/*
+ * This file contains support for the POSIX 1003.1B AIO/LIO facility.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/lock.h>
+#include <sys/unistd.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_zone.h>
+#include <sys/aio.h>
+#include <sys/shm.h>
+
+#include <machine/cpu.h>
+#include <machine/limits.h>
+
+static long jobrefid;
+
+#define JOBST_NULL 0x0
+#define JOBST_JOBQPROC 0x1
+#define JOBST_JOBQGLOBAL 0x2
+#define JOBST_JOBRUNNING 0x3
+#define JOBST_JOBFINISHED 0x4
+#define JOBST_JOBQBUF 0x5
+#define JOBST_JOBBFINISHED 0x6
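+
+/*
+ * Job lifecycle, roughly (inferred from the code below): a job starts at
+ * JOBST_NULL on the free list, moves to JOBST_JOBQGLOBAL or
+ * JOBST_JOBQPROC when queued for an AIO daemon, to JOBST_JOBRUNNING
+ * while a daemon services it, and to JOBST_JOBFINISHED when complete.
+ * Jobs routed through physio instead pass from JOBST_JOBQBUF to
+ * JOBST_JOBBFINISHED.  aio_free_entry() returns a job to JOBST_NULL.
+ */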
+
+#ifndef MAX_AIO_PER_PROC
+#define MAX_AIO_PER_PROC 32
+#endif
+
+#ifndef MAX_AIO_QUEUE_PER_PROC
+#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef MAX_AIO_PROCS
+#define MAX_AIO_PROCS 32
+#endif
+
+#ifndef MAX_AIO_QUEUE
+#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef TARGET_AIO_PROCS
+#define TARGET_AIO_PROCS 0
+#endif
+
+#ifndef MAX_BUF_AIO
+#define MAX_BUF_AIO 16
+#endif
+
+#ifndef AIOD_TIMEOUT_DEFAULT
+#define AIOD_TIMEOUT_DEFAULT (10 * hz)
+#endif
+
+#ifndef AIOD_LIFETIME_DEFAULT
+#define AIOD_LIFETIME_DEFAULT (30 * hz)
+#endif
+
+static int max_aio_procs = MAX_AIO_PROCS;
+static int num_aio_procs = 0;
+static int target_aio_procs = TARGET_AIO_PROCS;
+static int max_queue_count = MAX_AIO_QUEUE;
+static int num_queue_count = 0;
+static int num_buf_aio = 0;
+static int num_aio_resv_start = 0;
+static int aiod_timeout;
+static int aiod_lifetime;
+
+static int max_aio_per_proc = MAX_AIO_PER_PROC,
+	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
+
+static int max_buf_aio = MAX_BUF_AIO;
+
+SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
+ CTLFLAG_RW, &max_aio_per_proc, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
+ CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
+ CTLFLAG_RW, &max_aio_procs, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
+ CTLFLAG_RD, &num_aio_procs, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
+ CTLFLAG_RD, &num_queue_count, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
+ CTLFLAG_RW, &max_queue_count, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
+ CTLFLAG_RW, &target_aio_procs, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
+ CTLFLAG_RW, &max_buf_aio, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
+ CTLFLAG_RD, &num_buf_aio, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
+ CTLFLAG_RW, &aiod_lifetime, 0, "");
+
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
+ CTLFLAG_RW, &aiod_timeout, 0, "");
+
+
+/*
+ * Job queue item
+ */
+
+#define AIOCBLIST_CANCELLED 0x1
+#define AIOCBLIST_RUNDOWN 0x4
+#define AIOCBLIST_ASYNCFREE 0x8
+#define AIOCBLIST_DONE 0x10
+
+struct aiocblist {
+ TAILQ_ENTRY (aiocblist) list; /* List of jobs */
+ TAILQ_ENTRY (aiocblist) plist; /* List of jobs for proc */
+ int jobflags;
+ int jobstate;
+ int inputcharge, outputcharge;
+ struct buf *bp; /* buffer pointer */
+ struct proc *userproc; /* User process */
+ struct aioproclist *jobaioproc; /* AIO process descriptor */
+ struct aio_liojob *lio; /* optional lio job */
+ struct aiocb *uuaiocb; /* pointer in userspace of aiocb */
+ struct aiocb uaiocb; /* Kernel I/O control block */
+};
+
+
+/*
+ * AIO process info
+ */
+#define AIOP_FREE 0x1 /* proc on free queue */
+#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
+
+struct aioproclist {
+ int aioprocflags; /* AIO proc flags */
+ TAILQ_ENTRY(aioproclist) list; /* List of processes */
+ struct proc *aioproc; /* The AIO thread */
+ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
+};
+
+/*
+ * data-structure for lio signal management
+ */
+struct aio_liojob {
+ int lioj_flags;
+ int lioj_buffer_count;
+ int lioj_buffer_finished_count;
+ int lioj_queue_count;
+ int lioj_queue_finished_count;
+ struct sigevent lioj_signal; /* signal on all I/O done */
+ TAILQ_ENTRY (aio_liojob) lioj_list;
+ struct kaioinfo *lioj_ki;
+};
+#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
+#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+
+/*
+ * per process aio data structure
+ */
+struct kaioinfo {
+ int kaio_flags; /* per process kaio flags */
+ int kaio_maxactive_count; /* maximum number of AIOs */
+ int kaio_active_count; /* number of currently used AIOs */
+	int	kaio_qallowed_count;	/* maximum size of AIO queue */
+ int kaio_queue_count; /* size of AIO queue */
+ int kaio_ballowed_count; /* maximum number of buffers */
+ int kaio_queue_finished_count; /* number of daemon jobs finished */
+ int kaio_buffer_count; /* number of physio buffers */
+ int kaio_buffer_finished_count; /* count of I/O done */
+ struct proc *kaio_p; /* process that uses this kaio block */
+ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
+ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */
+ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */
+ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */
+ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */
+};
+
+#define KAIO_RUNDOWN 0x1 /* process is being run down */
+#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
+
+
+static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
+static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
+static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
+static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
+
+static void aio_init_aioinfo(struct proc *p);
+static void aio_onceonly(void *);
+static int aio_free_entry(struct aiocblist *aiocbe);
+static void aio_process(struct aiocblist *aiocbe);
+static int aio_newproc(void);
+static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
+static void aio_physwakeup(struct buf *bp);
+static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
+static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
+static void aio_daemon(void *uproc);
+
+SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
+
+static vm_zone_t kaio_zone = 0, aiop_zone = 0,
+	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;
+
+/*
+ * Single AIOD vmspace shared amongst all of the AIO daemons.
+ */
+struct vmspace *aiovmspace = NULL;
+
+/*
+ * Startup initialization
+ */
+void
+aio_onceonly(void *na)
+{
+ TAILQ_INIT(&aio_freeproc);
+ TAILQ_INIT(&aio_activeproc);
+ TAILQ_INIT(&aio_jobs);
+ TAILQ_INIT(&aio_bufjobs);
+ TAILQ_INIT(&aio_freejobs);
+ kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
+ aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
+ aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
+ aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
+ aiolio_zone = zinit("AIOLIO",
+ AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
+ aiod_timeout = AIOD_TIMEOUT_DEFAULT;
+ aiod_lifetime = AIOD_LIFETIME_DEFAULT;
+ jobrefid = 1;
+}
+
+/*
+ * Init the per-process aioinfo structure.
+ * The aioinfo limits are set per-process for user limit (resource) management.
+ */
+void
+aio_init_aioinfo(struct proc *p)
+{
+ struct kaioinfo *ki;
+ if (p->p_aioinfo == NULL) {
+ ki = zalloc(kaio_zone);
+ p->p_aioinfo = ki;
+ ki->kaio_flags = 0;
+ ki->kaio_maxactive_count = max_aio_per_proc;
+ ki->kaio_active_count = 0;
+ ki->kaio_qallowed_count = max_aio_queue_per_proc;
+ ki->kaio_queue_count = 0;
+ ki->kaio_ballowed_count = max_buf_aio;
+ ki->kaio_buffer_count = 0;
+ ki->kaio_buffer_finished_count = 0;
+ ki->kaio_p = p;
+ TAILQ_INIT(&ki->kaio_jobdone);
+ TAILQ_INIT(&ki->kaio_jobqueue);
+ TAILQ_INIT(&ki->kaio_bufdone);
+ TAILQ_INIT(&ki->kaio_bufqueue);
+ TAILQ_INIT(&ki->kaio_liojoblist);
+ }
+}
+
+/*
+ * Free a job entry. Wait for completion if it is currently
+ * active, but don't delay forever. If we delay, we return
+ * a flag that says that we have to restart the queue scan.
+ */
+int
+aio_free_entry(struct aiocblist *aiocbe)
+{
+ struct kaioinfo *ki;
+ struct aioproclist *aiop;
+ struct aio_liojob *lj;
+ struct proc *p;
+ int error;
+ int s;
+
+ if (aiocbe->jobstate == JOBST_NULL)
+ panic("aio_free_entry: freeing already free job");
+
+ p = aiocbe->userproc;
+ ki = p->p_aioinfo;
+ lj = aiocbe->lio;
+ if (ki == NULL)
+ panic("aio_free_entry: missing p->p_aioinfo");
+
+ if (aiocbe->jobstate == JOBST_JOBRUNNING) {
+ if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
+ return 0;
+ aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
+ tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
+ }
+ aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
+
+ if (aiocbe->bp == NULL) {
+ if (ki->kaio_queue_count <= 0)
+ panic("aio_free_entry: process queue size <= 0");
+ if (num_queue_count <= 0)
+ panic("aio_free_entry: system wide queue size <= 0");
+
+ if (lj) {
+ lj->lioj_queue_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ lj->lioj_queue_finished_count--;
+ }
+ ki->kaio_queue_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ ki->kaio_queue_finished_count--;
+ num_queue_count--;
+
+ } else {
+ if (lj) {
+ lj->lioj_buffer_count--;
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ lj->lioj_buffer_finished_count--;
+ }
+ if (aiocbe->jobflags & AIOCBLIST_DONE)
+ ki->kaio_buffer_finished_count--;
+ ki->kaio_buffer_count--;
+ num_buf_aio--;
+
+ }
+
+ if ((ki->kaio_flags & KAIO_WAKEUP) ||
+ ((ki->kaio_flags & KAIO_RUNDOWN) &&
+ (ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(p);
+ }
+
+ if (aiocbe->jobstate == JOBST_JOBQBUF) {
+ if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
+ return error;
+ if (aiocbe->jobstate != JOBST_JOBBFINISHED)
+ panic("aio_free_entry: invalid physio finish-up state");
+ s = splbio();
+ TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
+ splx(s);
+ } else if (aiocbe->jobstate == JOBST_JOBQPROC) {
+ aiop = aiocbe->jobaioproc;
+ TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
+ } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ } else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
+ TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
+ } else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
+ s = splbio();
+ TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
+ splx(s);
+ if (aiocbe->bp) {
+ vunmapbuf(aiocbe->bp);
+ relpbuf(aiocbe->bp, NULL);
+ aiocbe->bp = NULL;
+ }
+ }
+ if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ zfree(aiolio_zone, lj);
+ }
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ aiocbe->jobstate = JOBST_NULL;
+ return 0;
+}
+
+/*
+ * Rundown the jobs for a given process.
+ */
+void
+aio_proc_rundown(struct proc *p)
+{
+ int s;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj, *ljn;
+ struct aiocblist *aiocbe, *aiocbn;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return;
+
+ ki->kaio_flags |= KAIO_RUNDOWN;
+ while ((ki->kaio_active_count > 0) ||
+ (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
+ ki->kaio_flags |= KAIO_RUNDOWN;
+ if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
+ break;
+ }
+
+restart1:
+ for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
+ aiocbe;
+ aiocbe = aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe))
+ goto restart1;
+ }
+
+restart2:
+ for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
+ aiocbe;
+ aiocbe = aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe))
+ goto restart2;
+ }
+
+/*
+ * Note the use of lots of splbio here, trying to avoid
+ * holding splbio across long chains of I/O. Probably unnecessary.
+ */
+
+restart3:
+ s = splbio();
+ while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ tsleep (p, PRIBIO, "aioprn", 0);
+ splx(s);
+ goto restart3;
+ }
+ splx(s);
+
+restart4:
+ s = splbio();
+ for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
+ aiocbe;
+ aiocbe = aiocbn) {
+ aiocbn = TAILQ_NEXT(aiocbe, plist);
+ if (aio_free_entry(aiocbe)) {
+ splx(s);
+ goto restart4;
+ }
+ }
+ splx(s);
+
+ for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist);
+ lj;
+ lj = ljn) {
+ ljn = TAILQ_NEXT(lj, lioj_list);
+ if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ zfree(aiolio_zone, lj);
+ } else {
+#if defined(DIAGNOSTIC)
+ printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
+ lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
+ lj->lioj_queue_count, lj->lioj_queue_finished_count);
+#endif
+ }
+ }
+
+ zfree(kaio_zone, ki);
+ p->p_aioinfo = NULL;
+}
+
+/*
+ * Select a job to run (called by an AIO daemon)
+ */
+static struct aiocblist *
+aio_selectjob(struct aioproclist *aiop)
+{
+
+ struct aiocblist *aiocbe;
+
+ aiocbe = TAILQ_FIRST(&aiop->jobtorun);
+ if (aiocbe) {
+ TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
+ return aiocbe;
+ }
+
+ for (aiocbe = TAILQ_FIRST(&aio_jobs);
+ aiocbe;
+ aiocbe = TAILQ_NEXT(aiocbe, list)) {
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+
+ if (ki->kaio_active_count < ki->kaio_maxactive_count) {
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ return aiocbe;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * The AIO processing activity. This is the code that does the
+ * I/O request for the non-physio version of the operations. The
+ * normal vn operations are used, and this code should work in
+ * all instances for every type of file, including pipes, sockets,
+ * fifos, and regular files.
+ */
+void
+aio_process(struct aiocblist *aiocbe)
+{
+ struct filedesc *fdp;
+ struct proc *userp, *mycp;
+ struct aiocb *cb;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ unsigned int fd;
+ int cnt;
+ int error;
+ off_t offset;
+ int oublock_st, oublock_end;
+ int inblock_st, inblock_end;
+
+ userp = aiocbe->userproc;
+ cb = &aiocbe->uaiocb;
+
+ mycp = curproc;
+
+ fdp = mycp->p_fd;
+ fd = cb->aio_fildes;
+ fp = fdp->fd_ofiles[fd];
+
+ aiov.iov_base = (void *) cb->aio_buf;
+ aiov.iov_len = cb->aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset = cb->aio_offset;
+ auio.uio_resid = cb->aio_nbytes;
+ cnt = cb->aio_nbytes;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = mycp;
+
+ inblock_st = mycp->p_stats->p_ru.ru_inblock;
+ oublock_st = mycp->p_stats->p_ru.ru_oublock;
+ if (cb->aio_lio_opcode == LIO_READ) {
+ auio.uio_rw = UIO_READ;
+ error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
+ } else {
+ auio.uio_rw = UIO_WRITE;
+ error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
+ }
+ inblock_end = mycp->p_stats->p_ru.ru_inblock;
+ oublock_end = mycp->p_stats->p_ru.ru_oublock;
+
+ aiocbe->inputcharge = inblock_end - inblock_st;
+ aiocbe->outputcharge = oublock_end - oublock_st;
+
+ if (error) {
+ if (auio.uio_resid != cnt) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
+ psignal(userp, SIGPIPE);
+ }
+ }
+
+ cnt -= auio.uio_resid;
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = cnt;
+
+ return;
+
+}
+
+/*
+ * The AIO daemon, most of the actual work is done in aio_process,
+ * but the setup (and address space mgmt) is done in this routine.
+ */
+static void
+aio_daemon(void *uproc)
+{
+ int s;
+ struct aioproclist *aiop;
+ struct vmspace *myvm, *aiovm;
+ struct proc *mycp;
+
+ /*
+ * Local copies of curproc (mycp) and vmspace (myvm)
+ */
+ mycp = curproc;
+ myvm = mycp->p_vmspace;
+
+ /*
+ * We manage to create only one VM space for all AIOD processes.
+ * The VM space for the first AIOD created becomes the shared VM
+ * space for all of them. We add an additional reference count,
+ * even for the first AIOD, so the address space does not go away,
+ * and we continue to use that original VM space even if the first
+ * AIOD exits.
+ */
+ if ((aiovm = aiovmspace) == NULL) {
+ aiovmspace = myvm;
+ myvm->vm_refcnt++;
+ /*
+ * Remove userland cruft from address space.
+ */
+ if (myvm->vm_shm)
+ shmexit(mycp);
+ pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
+ vm_map_remove(&myvm->vm_map, 0, USRSTACK);
+ myvm->vm_tsize = 0;
+ myvm->vm_dsize = 0;
+ myvm->vm_ssize = 0;
+ } else {
+ aiovm->vm_refcnt++;
+ mycp->p_vmspace = aiovm;
+ pmap_activate(mycp);
+ vmspace_free(myvm);
+ myvm = aiovm;
+ }
+
+ if (mycp->p_textvp) {
+ vrele(mycp->p_textvp);
+ mycp->p_textvp = NULL;
+ }
+
+ /*
+ * Allocate and ready the aio control info. There is one
+ * aiop structure per daemon.
+ */
+ aiop = zalloc(aiop_zone);
+ aiop->aioproc = mycp;
+ aiop->aioprocflags |= AIOP_FREE;
+ TAILQ_INIT(&aiop->jobtorun);
+
+ /*
+ * Place thread (lightweight process) onto the AIO free thread list
+ */
+ if (TAILQ_EMPTY(&aio_freeproc))
+ wakeup(&aio_freeproc);
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+
+ /*
+ * Make up a name for the daemon
+ */
+ strcpy(mycp->p_comm, "aiod");
+
+ /*
+ * Get rid of our current file descriptors. AIODs don't need any
+ * file descriptors, except as temporarily inherited from the client.
+ * Credentials are also cloned, and made equivalent to "root."
+ */
+ fdfree(mycp);
+ mycp->p_fd = NULL;
+ mycp->p_ucred = crcopy(mycp->p_ucred);
+ mycp->p_ucred->cr_uid = 0;
+ mycp->p_ucred->cr_ngroups = 1;
+ mycp->p_ucred->cr_groups[0] = 1;
+
+ /*
+ * The daemon resides in its own pgrp.
+ */
+ enterpgrp(mycp, mycp->p_pid, 1);
+
+ /*
+ * Mark special process type
+ */
+ mycp->p_flag |= P_SYSTEM|P_KTHREADP;
+
+ /*
+ * Wake up the parent process. (The parent sleeps to keep from
+ * blasting away, creating too many daemons.)
+ */
+ wakeup(mycp);
+
+ while(1) {
+ struct proc *curcp;
+ struct aiocblist *aiocbe;
+
+ /*
+ * curcp is the current daemon process context.
+ * userp is the current user process context.
+ */
+ curcp = mycp;
+
+ /*
+ * Take daemon off of free queue
+ */
+ if (aiop->aioprocflags & AIOP_FREE) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
+ aiop->aioprocflags &= ~AIOP_FREE;
+ }
+ aiop->aioprocflags &= ~AIOP_SCHED;
+
+ /*
+ * Check for jobs
+ */
+ while ((aiocbe = aio_selectjob(aiop)) != NULL) {
+ struct proc *userp;
+ struct aiocb *cb;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+
+ cb = &aiocbe->uaiocb;
+ userp = aiocbe->userproc;
+
+ aiocbe->jobstate = JOBST_JOBRUNNING;
+
+ /*
+ * Connect to process address space for user program
+ */
+ if (userp != curcp) {
+ struct vmspace *tmpvm;
+ /*
+ * Save the current address space that we are connected to.
+ */
+ tmpvm = mycp->p_vmspace;
+ /*
+ * Point to the new user address space, and refer to it.
+ */
+ mycp->p_vmspace = userp->p_vmspace;
+ mycp->p_vmspace->vm_refcnt++;
+ /*
+ * Activate the new mapping.
+ */
+ pmap_activate(mycp);
+ /*
+ * If the old address space wasn't the daemon's own address
+ * space, then we need to remove the daemon's reference from
+ * the other process that it was acting on behalf of.
+ */
+ if (tmpvm != myvm) {
+ vmspace_free(tmpvm);
+ }
+ /*
+ * Disassociate from the previous client's file descriptors, and
+ * associate with the new client's descriptors. Note that
+ * the daemon doesn't need to worry about its original
+ * descriptors, because they were already freed.
+ */
+ if (mycp->p_fd)
+ fdfree(mycp);
+ mycp->p_fd = fdshare(userp);
+ curcp = userp;
+ }
+
+ ki = userp->p_aioinfo;
+ lj = aiocbe->lio;
+
+ /*
+ * Account for currently active jobs
+ */
+ ki->kaio_active_count++;
+
+ /*
+ * Do the I/O function
+ */
+ aiocbe->jobaioproc = aiop;
+ aio_process(aiocbe);
+
+ /*
+ * decrement the active job count
+ */
+ ki->kaio_active_count--;
+
+ /*
+ * increment the completion count for wakeup/signal comparisons
+ */
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ ki->kaio_queue_finished_count++;
+ if (lj) {
+ lj->lioj_queue_finished_count++;
+ }
+ if ((ki->kaio_flags & KAIO_WAKEUP) ||
+ ((ki->kaio_flags & KAIO_RUNDOWN) &&
+ (ki->kaio_active_count == 0))) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(userp);
+ }
+
+ s = splbio();
+ if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
+ LIOJ_SIGNAL) {
+ if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
+ (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
+ psignal(userp, lj->lioj_signal.sigev_signo);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+ splx(s);
+
+ aiocbe->jobstate = JOBST_JOBFINISHED;
+
+ /*
+ * If the I/O request should be automatically rundown, do the
+ * needed cleanup. Otherwise, place the queue entry for
+ * the just finished I/O request into the done queue for the
+ * associated client.
+ */
+ if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
+ aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ } else {
+ TAILQ_REMOVE(&ki->kaio_jobqueue,
+ aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
+ aiocbe, plist);
+ }
+
+ if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
+ wakeup(aiocbe);
+ aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
+ }
+
+ if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ psignal(userp, cb->aio_sigevent.sigev_signo);
+ }
+ }
+
+ /*
+ * Disconnect from user address space
+ */
+ if (curcp != mycp) {
+ struct vmspace *tmpvm;
+ /*
+ * Get the user address space to disconnect from.
+ */
+ tmpvm = mycp->p_vmspace;
+ /*
+ * Get original address space for daemon.
+ */
+ mycp->p_vmspace = myvm;
+ /*
+ * Activate the daemon's address space.
+ */
+ pmap_activate(mycp);
+#if defined(DIAGNOSTIC)
+ if (tmpvm == myvm)
+ printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
+#endif
+ /*
+ * remove our vmspace reference.
+ */
+ vmspace_free(tmpvm);
+ /*
+ * disassociate from the user process's file descriptors.
+ */
+ if (mycp->p_fd)
+ fdfree(mycp);
+ mycp->p_fd = NULL;
+ curcp = mycp;
+ }
+
+ /*
+ * If we are the first to be put onto the free queue, wake up
+ * anyone waiting for a daemon.
+ */
+ TAILQ_REMOVE(&aio_activeproc, aiop, list);
+ if (TAILQ_EMPTY(&aio_freeproc))
+ wakeup(&aio_freeproc);
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+ aiop->aioprocflags |= AIOP_FREE;
+
+ /*
+ * If daemon is inactive for a long time, allow it to exit, thereby
+ * freeing resources.
+ */
+ if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
+ tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
+ if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
+ (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
+ if ((aiop->aioprocflags & AIOP_FREE) &&
+ (num_aio_procs > target_aio_procs)) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ zfree(aiop_zone, aiop);
+ num_aio_procs--;
+#if defined(DIAGNOSTIC)
+ if (mycp->p_vmspace->vm_refcnt <= 1)
+ printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
+ mycp->p_vmspace->vm_refcnt);
+#endif
+ exit1(mycp, 0);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine.
+ * The AIO daemon modifies its environment itself.
+ */
+static int
+aio_newproc(void)
+{
+ int error;
+ struct rfork_args rfa;
+ struct proc *p, *np;
+
+ rfa.flags = RFPROC | RFCFDG;
+
+ p = curproc;
+ if ((error = rfork(p, &rfa)) != 0)
+ return error;
+
+ np = pfind(p->p_retval[0]);
+ cpu_set_fork_handler(np, aio_daemon, p);
+
+ /*
+ * Wait until the daemon is started, but continue on just in case
+ * (to handle error conditions).
+ */
+ error = tsleep(np, PZERO, "aiosta", aiod_timeout);
+ num_aio_procs++;
+
+ return error;
+
+}
+
+/*
+ * Try the high-performance physio method for eligible VCHR devices. This
+ * routine doesn't require the use of any additional threads, and has
+ * very little overhead.
+ */
+int
+aio_qphysio(p, aiocbe)
+ struct proc *p;
+ struct aiocblist *aiocbe;
+{
+ int error;
+ struct aiocb *cb;
+ struct file *fp;
+ struct buf *bp;
+ int bflags;
+ struct vnode *vp;
+ struct kaioinfo *ki;
+ struct filedesc *fdp;
+ struct aio_liojob *lj;
+ int fd;
+ int majordev;
+ int s;
+ int cnt;
+ dev_t dev;
+ int rw;
+ d_strategy_t *fstrategy;
+ struct cdevsw *cdev;
+ struct cdevsw *bdev;
+
+ cb = &aiocbe->uaiocb;
+ fdp = p->p_fd;
+ fd = cb->aio_fildes;
+ fp = fdp->fd_ofiles[fd];
+
+ if (fp->f_type != DTYPE_VNODE) {
+ return -1;
+ }
+
+ vp = (struct vnode *)fp->f_data;
+ if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
+ return -1;
+ }
+
+ if ((cb->aio_nbytes > MAXPHYS) || (num_buf_aio >= max_buf_aio)) {
+ return -1;
+ }
+
+ if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
+ return -1;
+ }
+
+ majordev = major(vp->v_rdev);
+ if (majordev == NODEV) {
+ return -1;
+ }
+
+ cdev = cdevsw[major(vp->v_rdev)];
+ if (cdev == NULL) {
+ return -1;
+ }
+
+ if (cdev->d_bmaj == -1) {
+ return -1;
+ }
+ bdev = cdev;
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+ return -1;
+ }
+
+ cnt = cb->aio_nbytes;
+ if (cnt > MAXPHYS) {
+ return -1;
+ }
+
+ dev = makedev(bdev->d_bmaj, minor(vp->v_rdev));
+
+ /*
+ * Physical I/O is charged directly to the process, so we don't have
+ * to fake it.
+ */
+ aiocbe->inputcharge = 0;
+ aiocbe->outputcharge = 0;
+
+ ki->kaio_buffer_count++;
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_buffer_count++;
+ }
+
+ /* create and build a buffer header for a transfer */
+ bp = (struct buf *)getpbuf(NULL);
+
+ /*
+ * get a copy of the kva from the physical buffer
+ */
+ bp->b_proc = p;
+ bp->b_dev = dev;
+ error = bp->b_error = 0;
+
+ if (cb->aio_lio_opcode == LIO_WRITE) {
+ rw = 0;
+ bflags = B_WRITE;
+ } else {
+ rw = 1;
+ bflags = B_READ;
+ }
+
+ bp->b_bcount = cb->aio_nbytes;
+ bp->b_bufsize = cb->aio_nbytes;
+ bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
+ bp->b_iodone = aio_physwakeup;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = (void *) cb->aio_buf;
+ bp->b_blkno = btodb(cb->aio_offset);
+
+ if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
+ error = EFAULT;
+ goto doerror;
+ }
+
+ /* bring buffer into kernel space */
+ vmapbuf(bp);
+
+ s = splbio();
+ aiocbe->bp = bp;
+ bp->b_spc = (void *)aiocbe;
+ TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBQBUF;
+ cb->_aiocb_private.status = cb->aio_nbytes;
+ num_buf_aio++;
+ fstrategy = bdev->d_strategy;
+ bp->b_error = 0;
+
+ splx(s);
+ /* perform transfer */
+ (*fstrategy)(bp);
+
+ s = splbio();
+ /*
+ * If we had an error invoking the request, or an error in processing
+ * the request before returning, we treat it as an error
+ * in transfer. Note that such an I/O error is not indicated immediately,
+ * but is returned using the aio_error mechanism. In this case, aio_suspend
+ * will return immediately.
+ */
+ if (bp->b_error || (bp->b_flags & B_ERROR)) {
+ struct aiocb *job = aiocbe->uuaiocb;
+
+ aiocbe->uaiocb._aiocb_private.status = 0;
+ suword(&job->_aiocb_private.status, 0);
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ suword(&job->_aiocb_private.error, bp->b_error);
+
+ ki->kaio_buffer_finished_count++;
+
+ if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
+ aiocbe->jobstate = JOBST_JOBBFINISHED;
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
+ }
+ }
+ splx(s);
+ return 0;
+
+doerror:
+ ki->kaio_buffer_count--;
+ if (lj) {
+ lj->lioj_buffer_count--;
+ }
+ aiocbe->bp = NULL;
+ relpbuf(bp, NULL);
+ return error;
+}
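+
+/*
+ * Summarizing the checks above: the fast path is only taken for a
+ * VCHR vnode (not a tty) whose character device has a block-device
+ * alias, when the transfer is a multiple of DEV_BSIZE, no larger than
+ * MAXPHYS, and the per-process and system-wide physio buffer quotas
+ * still have room.
+ */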
+
+/*
+ * This waits/tests physio completion.
+ */
+int
+aio_fphysio(p, iocb, flgwait)
+ struct proc *p;
+ struct aiocblist *iocb;
+ int flgwait;
+{
+ int s;
+ struct buf *bp;
+ int error;
+
+ bp = iocb->bp;
+
+ s = splbio();
+ if (flgwait == 0) {
+ if ((bp->b_flags & B_DONE) == 0) {
+ splx(s);
+ return EINPROGRESS;
+ }
+ }
+
+ while ((bp->b_flags & B_DONE) == 0) {
+ if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
+ if ((bp->b_flags & B_DONE) == 0) {
+ splx(s);
+ return EINPROGRESS;
+ } else {
+ break;
+ }
+ }
+ }
+
+ /* release mapping into kernel space */
+ vunmapbuf(bp);
+ iocb->bp = 0;
+
+ error = 0;
+ /*
+ * check for an error
+ */
+ if (bp->b_flags & B_ERROR) {
+ error = bp->b_error;
+ }
+
+ relpbuf(bp, NULL);
+ return (error);
+}
+
+/*
+ * Queue a new AIO request. This code chooses between the threaded
+ * and the direct physio (VCHR) techniques.
+ */
+static int
+_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ unsigned int fd;
+
+ int error;
+ int opcode;
+ struct aiocblist *aiocbe;
+ struct aioproclist *aiop;
+ struct kaioinfo *ki;
+
+ if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
+ TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
+ } else {
+ aiocbe = zalloc(aiocb_zone);
+ }
+
+ aiocbe->inputcharge = 0;
+ aiocbe->outputcharge = 0;
+
+ suword(&job->_aiocb_private.status, -1);
+ suword(&job->_aiocb_private.error, 0);
+ suword(&job->_aiocb_private.kernelinfo, -1);
+
+ error = copyin((caddr_t)job,
+ (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
+ if (error) {
+ suword(&job->_aiocb_private.error, error);
+
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ return error;
+ }
+
+ /*
+ * Save userspace address of the job info
+ */
+ aiocbe->uuaiocb = job;
+
+ /*
+ * Get the opcode
+ */
+ if (type != LIO_NOP) {
+ aiocbe->uaiocb.aio_lio_opcode = type;
+ }
+ opcode = aiocbe->uaiocb.aio_lio_opcode;
+
+ /*
+ * Get the fd info for process
+ */
+ fdp = p->p_fd;
+
+ /*
+ * Range check file descriptor
+ */
+ fd = aiocbe->uaiocb.aio_fildes;
+ if (fd >= fdp->fd_nfiles) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, EBADF);
+ }
+ return EBADF;
+ }
+
+ fp = fdp->fd_ofiles[fd];
+ if ((fp == NULL) ||
+ ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, EBADF);
+ }
+ return EBADF;
+ }
+
+ if (aiocbe->uaiocb.aio_offset == -1LL) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, EINVAL);
+ }
+ return EINVAL;
+ }
+
+ error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
+ if (error) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, EINVAL);
+ }
+ return error;
+ }
+
+ aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
+ if (jobrefid == LONG_MAX)
+ jobrefid = 1;
+ else
+ jobrefid++;
+
+ if (opcode == LIO_NOP) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.error, 0);
+ suword(&job->_aiocb_private.status, 0);
+ suword(&job->_aiocb_private.kernelinfo, 0);
+ }
+ return 0;
+ }
+
+ if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
+ TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
+ if (type == 0) {
+ suword(&job->_aiocb_private.status, 0);
+ suword(&job->_aiocb_private.error, EINVAL);
+ }
+ return EINVAL;
+ }
+
+ suword(&job->_aiocb_private.error, EINPROGRESS);
+ aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
+ aiocbe->userproc = p;
+ aiocbe->jobflags = 0;
+ aiocbe->lio = lj;
+ ki = p->p_aioinfo;
+
+ if ((error = aio_qphysio(p, aiocbe)) == 0) {
+ return 0;
+ } else if (error > 0) {
+ suword(&job->_aiocb_private.status, 0);
+ aiocbe->uaiocb._aiocb_private.error = error;
+ suword(&job->_aiocb_private.error, error);
+ return error;
+ }
+
+ /*
+ * No buffer for daemon I/O
+ */
+ aiocbe->bp = NULL;
+
+ ki->kaio_queue_count++;
+ if (lj) {
+ lj->lioj_queue_count++;
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQGLOBAL;
+
+ num_queue_count++;
+ error = 0;
+
+ /*
+ * If we don't have a free AIO process, and we are below our
+ * quota, then start one. Otherwise, depend on the subsequent
+ * I/O completions to pick up this job. If we don't successfully
+ * create the new process (thread) due to resource issues, we
+ * return an error for now (EAGAIN), which is likely not the
+ * correct thing to do.
+ */
+retryproc:
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
+ aiop->aioprocflags &= ~AIOP_FREE;
+ wakeup(aiop->aioproc);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ num_aio_resv_start++;
+ if ((error = aio_newproc()) == 0) {
+ num_aio_resv_start--;
+ p->p_retval[0] = 0;
+ goto retryproc;
+ }
+ num_aio_resv_start--;
+ }
+ return error;
+}
+
+/*
+ * This routine queues an AIO request, checking for quotas.
+ */
+static int
+aio_aqueue(struct proc *p, struct aiocb *job, int type)
+{
+ struct kaioinfo *ki;
+
+ if (p->p_aioinfo == NULL) {
+ aio_init_aioinfo(p);
+ }
+
+ if (num_queue_count >= max_queue_count)
+ return EAGAIN;
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
+ return EAGAIN;
+
+ return _aio_aqueue(p, job, NULL, type);
+}
+
+/*
+ * Support the aio_return system call; as a side effect, kernel
+ * resources are released.
+ */
+int
+aio_return(struct proc *p, struct aio_return_args *uap)
+{
+ int s;
+ int jobref;
+ struct aiocblist *cb, *ncb;
+ struct aiocb *ujob;
+ struct kaioinfo *ki;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL) {
+ return EINVAL;
+ }
+
+ ujob = uap->aiocbp;
+
+ jobref = fuword(&ujob->_aiocb_private.kernelinfo);
+ if (jobref == -1 || jobref == 0)
+ return EINVAL;
+
+ for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ if (ujob == cb->uuaiocb) {
+ p->p_retval[0] = cb->uaiocb._aiocb_private.status;
+ } else {
+ p->p_retval[0] = EFAULT;
+ }
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ return 0;
+ }
+ }
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
+ cb;
+ cb = ncb) {
+ ncb = TAILQ_NEXT(cb, plist);
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ splx(s);
+ if (ujob == cb->uuaiocb) {
+ p->p_retval[0] = cb->uaiocb._aiocb_private.status;
+ } else {
+ p->p_retval[0] = EFAULT;
+ }
+ aio_free_entry(cb);
+ return 0;
+ }
+ }
+ splx(s);
+
+ return (EINVAL);
+}
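+
+/*
+ * Illustrative userland sketch of the aio_read/aio_error/aio_return
+ * cycle implemented above. This is a usage sketch only, not part of
+ * this file; "fd" is assumed to be a valid readable descriptor.
+ *
+ * struct aiocb cb;
+ * char buf[512];
+ * ssize_t nread;
+ *
+ * bzero(&cb, sizeof cb);
+ * cb.aio_fildes = fd;
+ * cb.aio_buf = buf;
+ * cb.aio_nbytes = sizeof buf;
+ * cb.aio_offset = 0;
+ * if (aio_read(&cb) == 0) {
+ * while (aio_error(&cb) == EINPROGRESS)
+ * ; (polling; aio_suspend avoids the spin)
+ * nread = aio_return(&cb);
+ * }
+ */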
+
+/*
+ * Allow a process to wake up when any of its outstanding I/O
+ * requests completes.
+ */
+int
+aio_suspend(struct proc *p, struct aio_suspend_args *uap)
+{
+ struct timeval atv;
+ struct timespec ts;
+ struct aiocb *const *cbptr, *cbp;
+ struct kaioinfo *ki;
+ struct aiocblist *cb;
+ int i;
+ int njoblist;
+ int error, s, timo;
+ int *ijoblist;
+ struct aiocb **ujoblist;
+
+ if (uap->nent > AIO_LISTIO_MAX)
+ return EINVAL;
+
+ timo = 0;
+ if (uap->timeout) {
+ /*
+ * Get timespec struct
+ */
+ if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
+ return error;
+ }
+
+ if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, &ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EAGAIN;
+
+ njoblist = 0;
+ ijoblist = zalloc(aiol_zone);
+ ujoblist = zalloc(aiol_zone);
+ cbptr = uap->aiocbp;
+
+ for(i = 0; i < uap->nent; i++) {
+ cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
+ if (cbp == 0)
+ continue;
+ ujoblist[njoblist] = cbp;
+ ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
+ njoblist++;
+ }
+ if (njoblist == 0) {
+ zfree(aiol_zone, ijoblist);
+ zfree(aiol_zone, ujoblist);
+ return 0;
+ }
+
+ error = 0;
+ while (1) {
+ for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
+ cb; cb = TAILQ_NEXT(cb, plist)) {
+ for(i = 0; i < njoblist; i++) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
+ ijoblist[i]) {
+ if (ujoblist[i] != cb->uuaiocb)
+ error = EINVAL;
+ zfree(aiol_zone, ijoblist);
+ zfree(aiol_zone, ujoblist);
+ return error;
+ }
+ }
+ }
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
+ cb; cb = TAILQ_NEXT(cb, plist)) {
+ for(i = 0; i < njoblist; i++) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
+ ijoblist[i]) {
+ splx(s);
+ if (ujoblist[i] != cb->uuaiocb)
+ error = EINVAL;
+ zfree(aiol_zone, ijoblist);
+ zfree(aiol_zone, ujoblist);
+ return error;
+ }
+ }
+ }
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
+ splx(s);
+
+ if (error == EINTR) {
+ zfree(aiol_zone, ijoblist);
+ zfree(aiol_zone, ujoblist);
+ return EINTR;
+ } else if (error == EWOULDBLOCK) {
+ zfree(aiol_zone, ijoblist);
+ zfree(aiol_zone, ujoblist);
+ return EAGAIN;
+ }
+ }
+
+/* NOTREACHED */
+ return EINVAL;
+}
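+
+/*
+ * Illustrative userland sketch of aio_suspend. This is a usage sketch
+ * only, not part of this file; "cbs" is assumed to be an array of n
+ * pointers to previously queued control blocks.
+ *
+ * struct timespec ts;
+ *
+ * ts.tv_sec = 1;
+ * ts.tv_nsec = 0;
+ * if (aio_suspend(cbs, n, &ts) == 0) {
+ * at least one request in cbs[] completed; find it
+ * with aio_error()/aio_return()
+ * } else if (errno == EAGAIN) {
+ * the timeout expired before any request completed
+ * }
+ */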
+
+/*
+ * aio_cancel at the kernel level is a NOOP right now. It
+ * might be possible to support it partially in user mode, or
+ * in kernel mode later on.
+ */
+int
+aio_cancel(struct proc *p, struct aio_cancel_args *uap)
+{
+ return ENOSYS;
+}
+
+/*
+ * aio_error is implemented in the kernel level for compatibility
+ * purposes only. For a user mode async implementation, it would be
+ * best to do it in a userland subroutine.
+ */
+int
+aio_error(struct proc *p, struct aio_error_args *uap)
+{
+ int s;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ int jobref;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return EINVAL;
+
+ jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
+ if ((jobref == -1) || (jobref == 0))
+ return EINVAL;
+
+ for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ p->p_retval[0] = cb->uaiocb._aiocb_private.error;
+ return 0;
+ }
+ }
+
+ for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ p->p_retval[0] = EINPROGRESS;
+ return 0;
+ }
+ }
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ p->p_retval[0] = cb->uaiocb._aiocb_private.error;
+ splx(s);
+ return 0;
+ }
+ }
+
+ for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
+ p->p_retval[0] = EINPROGRESS;
+ splx(s);
+ return 0;
+ }
+ }
+ splx(s);
+
+
+ /*
+ * Hack for lio
+ */
+/*
+ status = fuword(&uap->aiocbp->_aiocb_private.status);
+ if (status == -1) {
+ return fuword(&uap->aiocbp->_aiocb_private.error);
+ }
+*/
+ return EINVAL;
+}
+
+int
+aio_read(struct proc *p, struct aio_read_args *uap)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ unsigned int fd;
+ int cnt;
+ struct aiocb iocb;
+ int error, pmodes;
+
+ pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
+ if ((pmodes & AIO_PMODE_SYNC) == 0) {
+ return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
+ }
+
+ /*
+ * Get control block
+ */
+ if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
+ return error;
+
+ /*
+ * Get the fd info for process
+ */
+ fdp = p->p_fd;
+
+ /*
+ * Range check file descriptor
+ */
+ fd = iocb.aio_fildes;
+ if (fd >= fdp->fd_nfiles)
+ return EBADF;
+ fp = fdp->fd_ofiles[fd];
+ if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
+ return EBADF;
+ if (iocb.aio_offset == -1LL)
+ return EINVAL;
+
+ auio.uio_resid = iocb.aio_nbytes;
+ if (auio.uio_resid < 0)
+ return (EINVAL);
+
+ /*
+ * Process sync operations inline -- queue anything else as
+ * an async request.
+ */
+ if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
+ return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
+ }
+
+ aiov.iov_base = (void *) iocb.aio_buf;
+ aiov.iov_len = iocb.aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = iocb.aio_offset;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+
+ cnt = iocb.aio_nbytes;
+ error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
+ if (error &&
+ (auio.uio_resid != cnt) &&
+ (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ cnt -= auio.uio_resid;
+ p->p_retval[0] = cnt;
+ return error;
+}
+
+int
+aio_write(struct proc *p, struct aio_write_args *uap)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ unsigned int fd;
+ int cnt;
+ struct aiocb iocb;
+ int error;
+ int pmodes;
+
+ /*
+ * Process sync operations inline -- queue anything else as
+ * an async request.
+ */
+ pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
+ if ((pmodes & AIO_PMODE_SYNC) == 0) {
+ return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
+ }
+
+ if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
+ return error;
+
+ /*
+ * Get the fd info for process
+ */
+ fdp = p->p_fd;
+
+ /*
+ * Range check file descriptor
+ */
+ fd = iocb.aio_fildes;
+ if (fd >= fdp->fd_nfiles)
+ return EBADF;
+ fp = fdp->fd_ofiles[fd];
+ if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
+ return EBADF;
+ if (iocb.aio_offset == -1LL)
+ return EINVAL;
+
+ aiov.iov_base = (void *) iocb.aio_buf;
+ aiov.iov_len = iocb.aio_nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = iocb.aio_offset;
+
+ auio.uio_resid = iocb.aio_nbytes;
+ if (auio.uio_resid < 0)
+ return (EINVAL);
+
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+
+ cnt = iocb.aio_nbytes;
+ error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
+ if (error) {
+ if (auio.uio_resid != cnt) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if (error == EPIPE)
+ psignal(p, SIGPIPE);
+ }
+ }
+ cnt -= auio.uio_resid;
+ p->p_retval[0] = cnt;
+ return error;
+}
+
+int
+lio_listio(struct proc *p, struct lio_listio_args *uap)
+{
+ int nent, nentqueued;
+ struct aiocb *iocb, * const *cbptr;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+ int error, runningcode;
+ int nerror;
+ int i;
+ int s;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
+ return EINVAL;
+ }
+
+ nent = uap->nent;
+ if (nent > AIO_LISTIO_MAX) {
+ return EINVAL;
+ }
+
+ if (p->p_aioinfo == NULL) {
+ aio_init_aioinfo(p);
+ }
+
+ if ((nent + num_queue_count) > max_queue_count) {
+ return EAGAIN;
+ }
+
+ ki = p->p_aioinfo;
+ if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
+ return EAGAIN;
+ }
+
+ lj = zalloc(aiolio_zone);
+ if (!lj) {
+ return EAGAIN;
+ }
+
+ lj->lioj_flags = 0;
+ lj->lioj_buffer_count = 0;
+ lj->lioj_buffer_finished_count = 0;
+ lj->lioj_queue_count = 0;
+ lj->lioj_queue_finished_count = 0;
+ lj->lioj_ki = ki;
+ TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
+
+ /*
+ * Setup signal
+ */
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
+ if (error)
+ return error;
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
+ } else {
+ lj->lioj_flags &= ~LIOJ_SIGNAL;
+ }
+
+/*
+ * get pointers to the list of I/O requests
+ */
+
+ nerror = 0;
+ nentqueued = 0;
+ cbptr = uap->acb_list;
+ for(i = 0; i < uap->nent; i++) {
+ iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
+ if (((intptr_t) iocb != -1) && ((intptr_t) iocb != 0)) {
+ error = _aio_aqueue(p, iocb, lj, 0);
+ if (error == 0) {
+ nentqueued++;
+ } else {
+ nerror++;
+ }
+ }
+ }
+
+ /*
+ * If we didn't queue any requests, just return.
+ */
+ if (nentqueued == 0) {
+ return 0;
+ }
+
+ /*
+ * Calculate the appropriate error return
+ */
+ runningcode = 0;
+ if (nerror)
+ runningcode = EIO;
+
+ if (uap->mode == LIO_WAIT) {
+ while (1) {
+ int found;
+ found = 0;
+ for(i = 0; i < uap->nent; i++) {
+ int jobref, command;
+
+ /*
+ * Fetch address of the control buf pointer in user space
+ */
+ iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
+ if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0))
+ continue;
+
+ /*
+ * Fetch the associated command from user space
+ */
+ command = fuword(&iocb->aio_lio_opcode);
+ if (command == LIO_NOP) {
+ found++;
+ continue;
+ }
+
+ jobref = fuword(&iocb->_aiocb_private.kernelinfo);
+
+ for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ curproc->p_stats->p_ru.ru_oublock +=
+ cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ curproc->p_stats->p_ru.ru_inblock +=
+ cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ found++;
+ break;
+ }
+ }
+
+ s = splbio();
+ for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
+ cb;
+ cb = TAILQ_NEXT(cb, plist)) {
+ if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
+ jobref) {
+ found++;
+ break;
+ }
+ }
+ splx(s);
+
+ }
+
+ /*
+ * If all I/Os have been disposed of, then we can return
+ */
+ if (found == nentqueued) {
+ return runningcode;
+ }
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
+
+ if (error == EINTR) {
+ return EINTR;
+ } else if (error == EWOULDBLOCK) {
+ return EAGAIN;
+ }
+
+ }
+ }
+
+ return runningcode;
+}
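+
+/*
+ * Illustrative userland sketch of lio_listio. This is a usage sketch
+ * only, not part of this file; "a" and "b" are assumed to be control
+ * blocks initialized as above, with aio_lio_opcode set to LIO_READ.
+ *
+ * struct aiocb *list[2];
+ *
+ * list[0] = &a;
+ * list[1] = &b;
+ * if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
+ * both transfers completed; reap them
+ * (void) aio_return(&a);
+ * (void) aio_return(&b);
+ * }
+ */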
+
+/*
+ * This is a weird hack so that we can post a signal. It is safe
+ * to do so from a timeout routine, but *not* from an interrupt routine.
+ */
+static void
+process_signal(void *ljarg)
+{
+ struct aio_liojob *lj = ljarg;
+ if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
+ if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
+ psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+}
+
+/*
+ * Interrupt handler for physio, performs the necessary process wakeups,
+ * and signals.
+ */
+static void
+aio_physwakeup(bp)
+ struct buf *bp;
+{
+ struct aiocblist *aiocbe;
+ struct proc *p;
+ struct kaioinfo *ki;
+ struct aio_liojob *lj;
+ int s;
+ s = splbio();
+
+ wakeup((caddr_t) bp);
+ bp->b_flags &= ~B_CALL;
+ bp->b_flags |= B_DONE;
+
+ aiocbe = (struct aiocblist *)bp->b_spc;
+ if (aiocbe) {
+ p = bp->b_proc;
+
+ aiocbe->jobstate = JOBST_JOBBFINISHED;
+ aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.error = 0;
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+
+ if (bp->b_flags & B_ERROR) {
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ }
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_buffer_finished_count++;
+ /*
+ * wakeup/signal if all of the interrupt jobs are done
+ */
+ if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
+ /*
+ * post a signal if it is called for
+ */
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
+ LIOJ_SIGNAL) {
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ timeout(process_signal, lj, 0);
+ }
+ }
+ }
+
+ ki = p->p_aioinfo;
+ if (ki) {
+ ki->kaio_buffer_finished_count++;
+ TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
+ /*
+ * and do the wakeup
+ */
+ if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(p);
+ }
+ }
+ }
+ splx(s);
+}
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
new file mode 100644
index 0000000..3664ccd
--- /dev/null
+++ b/sys/kern/vfs_bio.c
@@ -0,0 +1,2443 @@
+/*
+ * Copyright (c) 1994,1997 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ *
+ * $Id: vfs_bio.c,v 1.194 1999/01/21 08:29:05 dillon Exp $
+ */
+
+/*
+ * this file contains a new buffer I/O scheme implementing a coherent
+ * VM object and buffer cache design. Pains have been taken to make
+ * sure that the performance degradation associated with schemes such
+ * as this is not realized.
+ *
+ * Author: John S. Dyson
+ * Significant help during the development and debugging phases
+ * had been provided by David Greenman, also of the FreeBSD core team.
+ *
+ * see man buf(9) for more info.
+ */
+
+#define VMIO
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/lock.h>
+#include <miscfs/specfs/specdev.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <sys/buf.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/resourcevar.h>
+
+static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
+
+struct bio_ops bioops; /* I/O operation notification */
+
+#if 0 /* replaced by sched_sync */
+static void vfs_update __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+ "update",
+ vfs_update,
+ &updateproc
+};
+SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+#endif
+
+struct buf *buf; /* buffer header pool */
+struct swqueue bswlist;
+
+static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
+ vm_offset_t off, vm_offset_t size,
+ vm_page_t m);
+static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
+ int pageno, vm_page_t m);
+static void vfs_clean_pages(struct buf * bp);
+static void vfs_setdirty(struct buf *bp);
+static void vfs_vmio_release(struct buf *bp);
+static void flushdirtybuffers(int slpflag, int slptimeo);
+
+int needsbuffer;
+
+/*
+ * Internal update daemon, process 3
+ * The variable vfs_update_wakeup allows for internal syncs.
+ */
+int vfs_update_wakeup;
+
+
+/*
+ * buffers' base kva
+ */
+
+/*
+ * bogus page -- for I/O to/from partially complete buffers
+ * this is a temporary solution to the problem, but it is not
+ * really that bad. it would be better to split the buffer
+ * for input in the case of buffers partially already in memory,
+ * but the code is intricate enough already.
+ */
+vm_page_t bogus_page;
+static vm_offset_t bogus_offset;
+
+static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
+ bufmallocspace, maxbufmallocspace;
+int numdirtybuffers;
+static int lodirtybuffers, hidirtybuffers;
+static int numfreebuffers, lofreebuffers, hifreebuffers;
+static int kvafreespace;
+
+SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
+ &numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+ &lodirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
+ &hidirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
+ &numfreebuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
+ &lofreebuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
+ &hifreebuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
+ &maxbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
+ &bufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
+ &maxvmiobufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
+ &vmiospace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
+ &maxbufmallocspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
+ &bufmallocspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
+ &kvafreespace, 0, "");
+
+static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
+struct bqueues bufqueues[BUFFER_QUEUES] = {0};
+
+extern int vm_swap_size;
+
+#define BUF_MAXUSE 24
+
+#define VFS_BIO_NEED_ANY 1
+#define VFS_BIO_NEED_LOWLIMIT 2
+#define VFS_BIO_NEED_FREE 4
+
+/*
+ * Initialize buffer headers and related structures.
+ */
+void
+bufinit()
+{
+ struct buf *bp;
+ int i;
+
+ TAILQ_INIT(&bswlist);
+ LIST_INIT(&invalhash);
+
+ /* first, make a null hash table */
+ for (i = 0; i < BUFHSZ; i++)
+ LIST_INIT(&bufhashtbl[i]);
+
+ /* next, make a null set of free lists */
+ for (i = 0; i < BUFFER_QUEUES; i++)
+ TAILQ_INIT(&bufqueues[i]);
+
+ /* finally, initialize each buffer header and stick on empty q */
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL; /* we're just an empty header */
+ bp->b_dev = NODEV;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_EMPTY;
+ bp->b_xflags = 0;
+ LIST_INIT(&bp->b_dep);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ }
+/*
+ * maxbufspace is currently calculated assuming all filesystem blocks
+ * are 8K. If you happen to use a 16K filesystem, the size of the buffer
+ * cache is still the same as it would be for 8K filesystems. This
+ * keeps the size of the buffer cache "in check" for big block filesystems.
+ */
+ maxbufspace = (nbuf + 8) * DFLTBSIZE;
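+/*
+ * For example, assuming DFLTBSIZE is the 8K mentioned above, a machine
+ * autosized to nbuf == 1024 gets maxbufspace = (1024 + 8) * 8192,
+ * roughly 8.4MB of buffer space.
+ */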
+/*
+ * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
+ */
+ maxvmiobufspace = 2 * maxbufspace / 3;
+/*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly on average
+ * (small) directories.
+ */
+ maxbufmallocspace = maxbufspace / 20;
+
+/*
+ * Reduce the probability of deadlock conditions by limiting the
+ * number of dirty buffers.
+ */
+ hidirtybuffers = nbuf / 8 + 20;
+ lodirtybuffers = nbuf / 16 + 10;
+ numdirtybuffers = 0;
+ lofreebuffers = nbuf / 18 + 5;
+ hifreebuffers = 2 * lofreebuffers;
+ numfreebuffers = nbuf;
+ kvafreespace = 0;
+
+ bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
+ bogus_page = vm_page_alloc(kernel_object,
+ ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_NORMAL);
+
+}
+
+/*
+ * Free the kva allocation for a buffer
+ * Must be called only at splbio or higher,
+ * as this is the only locking for buffer_map.
+ */
+static void
+bfreekva(struct buf * bp)
+{
+ if (bp->b_kvasize == 0)
+ return;
+
+ vm_map_delete(buffer_map,
+ (vm_offset_t) bp->b_kvabase,
+ (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
+
+ bp->b_kvasize = 0;
+
+}
+
+/*
+ * remove the buffer from the appropriate free list
+ */
+void
+bremfree(struct buf * bp)
+{
+ int s = splbio();
+
+ if (bp->b_qindex != QUEUE_NONE) {
+ if (bp->b_qindex == QUEUE_EMPTY) {
+ kvafreespace -= bp->b_kvasize;
+ }
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+ bp->b_qindex = QUEUE_NONE;
+ } else {
+#if !defined(MAX_PERF)
+ panic("bremfree: removing a buffer when not on a queue");
+#endif
+ }
+ if ((bp->b_flags & B_INVAL) ||
+ (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
+ --numfreebuffers;
+ splx(s);
+}
+
+
+/*
+ * Get a buffer with the specified data. Look in the cache first.
+ */
+int
+bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
+ struct buf ** bpp)
+{
+ struct buf *bp;
+
+ bp = getblk(vp, blkno, size, 0, 0);
+ *bpp = bp;
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (bp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ bp->b_rcred = cred;
+ }
+ vfs_busy_pages(bp, 0);
+ VOP_STRATEGY(vp, bp);
+ return (biowait(bp));
+ }
+ return (0);
+}
+
+/*
+ * Operates like bread, but also starts asynchronous I/O on
+ * read-ahead blocks.
+ */
+int
+breadn(struct vnode * vp, daddr_t blkno, int size,
+ daddr_t * rablkno, int *rabsize,
+ int cnt, struct ucred * cred, struct buf ** bpp)
+{
+ struct buf *bp, *rabp;
+ int i;
+ int rv = 0, readwait = 0;
+
+ *bpp = bp = getblk(vp, blkno, size, 0, 0);
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (bp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ bp->b_rcred = cred;
+ }
+ vfs_busy_pages(bp, 0);
+ VOP_STRATEGY(vp, bp);
+ ++readwait;
+ }
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ rabp->b_flags |= B_READ | B_ASYNC;
+ rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (rabp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ rabp->b_rcred = cred;
+ }
+ vfs_busy_pages(rabp, 0);
+ VOP_STRATEGY(vp, rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+
+ if (readwait) {
+ rv = biowait(bp);
+ }
+ return (rv);
+}
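+
+/*
+ * Typical in-kernel use of bread() by a filesystem, as a sketch only
+ * (not part of this file); "vp", "lbn" and "size" are assumed to be
+ * supplied by the caller.
+ *
+ * struct buf *bp;
+ * int error;
+ *
+ * error = bread(vp, lbn, size, NOCRED, &bp);
+ * if (error) {
+ * brelse(bp);
+ * return (error);
+ * }
+ * ... examine or copy bp->b_data ...
+ * bqrelse(bp);
+ */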
+
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async.)
+ */
+int
+bwrite(struct buf * bp)
+{
+ int oldflags, s;
+ struct vnode *vp;
+ struct mount *mp;
+
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+
+ oldflags = bp->b_flags;
+
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_BUSY) == 0)
+ panic("bwrite: buffer is not busy???");
+#endif
+
+ bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ bp->b_flags |= B_WRITEINPROG;
+
+ s = splbio();
+ if ((oldflags & B_DELWRI) == B_DELWRI) {
+ --numdirtybuffers;
+ reassignbuf(bp, bp->b_vp);
+ }
+
+ bp->b_vp->v_numoutput++;
+ vfs_busy_pages(bp, 1);
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_oublock++;
+ splx(s);
+ VOP_STRATEGY(bp->b_vp, bp);
+
+ /*
+ * Collect statistics on synchronous and asynchronous writes.
+ * Writes to block devices are charged to their associated
+ * filesystem (if any).
+ */
+ if ((vp = bp->b_vp) != NULL) {
+ if (vp->v_type == VBLK)
+ mp = vp->v_specmountpoint;
+ else
+ mp = vp->v_mount;
+ if (mp != NULL)
+ if ((oldflags & B_ASYNC) == 0)
+ mp->mnt_stat.f_syncwrites++;
+ else
+ mp->mnt_stat.f_asyncwrites++;
+ }
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = biowait(bp);
+ brelse(bp);
+ return (rtval);
+ }
+ return (0);
+}
+
+void
+vfs_bio_need_satisfy(void)
+{
+ ++numfreebuffers;
+ if (!needsbuffer)
+ return;
+ if (numdirtybuffers < lodirtybuffers) {
+ needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
+ } else {
+ needsbuffer &= ~VFS_BIO_NEED_ANY;
+ }
+ if (numfreebuffers >= hifreebuffers) {
+ needsbuffer &= ~VFS_BIO_NEED_FREE;
+ }
+ wakeup(&needsbuffer);
+}
+
+/*
+ * Delayed write. (Buffer is marked dirty).
+ */
+void
+bdwrite(struct buf * bp)
+{
+ struct vnode *vp;
+
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_BUSY) == 0) {
+ panic("bdwrite: buffer is not busy");
+ }
+#endif
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+ bp->b_flags &= ~(B_READ|B_RELBUF);
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= B_DONE | B_DELWRI;
+ reassignbuf(bp, bp->b_vp);
+ ++numdirtybuffers;
+ }
+
+ /*
+ * This bmap keeps the system from needing to do the bmap later,
+ * perhaps when the system is attempting to do a sync. Since the
+ * indirect block -- or whatever other data structure the filesystem
+ * needs -- is likely still in memory now, this is a good time to do
+ * it. Note also that if the pageout daemon is requesting a sync,
+ * there might not be enough memory to do the bmap then. So, this
+ * is important to do now.
+ */
+ if (bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
+
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty pages.
+ */
+ vfs_setdirty(bp);
+
+ /*
+ * We need to do this here to satisfy the vnode_pager and the
+ * pageout daemon, so that they think that the pages have been
+ * "cleaned". Note that since the pages are in a delayed write
+ * buffer -- the VFS layer "will" see that the pages get written
+ * out on the next sync, or perhaps the cluster will be completed.
+ */
+ vfs_clean_pages(bp);
+ bqrelse(bp);
+
+ /*
+ * XXX The soft dependency code is not prepared to
+ * have I/O done when a bdwrite is requested. For
+ * now we just let the write be delayed if it is
+ * requested by the soft dependency code.
+ */
+ if ((vp = bp->b_vp) &&
+ ((vp->v_type == VBLK && vp->v_specmountpoint &&
+ (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
+ return;
+
+ if (numdirtybuffers >= hidirtybuffers)
+ flushdirtybuffers(0, 0);
+
+ return;
+}
+
+
+/*
+ * Same as first half of bdwrite, mark buffer dirty, but do not release it.
+ * Check how this compares with vfs_setdirty(); XXX [JRE]
+ */
+void
+bdirty(bp)
+ struct buf *bp;
+{
+
+ bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
+ reassignbuf(bp, bp->b_vp);
+ ++numdirtybuffers;
+ }
+}
+
+/*
+ * Asynchronous write.
+ * Start output on a buffer, but do not wait for it to complete.
+ * The buffer is released when the output completes.
+ */
+void
+bawrite(struct buf * bp)
+{
+ bp->b_flags |= B_ASYNC;
+ (void) VOP_BWRITE(bp);
+}
+
+/*
+ * Ordered write.
+ * Start output on a buffer, and flag it so that the device will write
+ * it in the order it was queued. The buffer is released when the output
+ * completes.
+ */
+int
+bowrite(struct buf * bp)
+{
+ bp->b_flags |= B_ORDERED|B_ASYNC;
+ return (VOP_BWRITE(bp));
+}
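+
+/*
+ * A sketch of the write-path choices above (informational only): after
+ * a caller has modified bp->b_data, it picks one of
+ *
+ * bwrite(bp); synchronous: starts the write and sleeps in
+ * biowait() until it completes
+ * bawrite(bp); asynchronous: starts the write; the buffer is
+ * released when the I/O completes
+ * bowrite(bp); like bawrite, but the device must honor the
+ * queued ordering
+ * bdwrite(bp); delayed: marks the buffer dirty and releases
+ * it; the actual write happens later
+ */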
+
+/*
+ * Release a buffer.
+ */
+void
+brelse(struct buf * bp)
+{
+ int s;
+
+ if (bp->b_flags & B_CLUSTER) {
+ relpbuf(bp, NULL);
+ return;
+ }
+
+ s = splbio();
+
+ /* anyone need this block? */
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
+ if (bp->b_flags & B_LOCKED)
+ bp->b_flags &= ~B_ERROR;
+
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
+ (bp->b_bufsize <= 0)) {
+ bp->b_flags |= B_INVAL;
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
+ (*bioops.io_deallocate)(bp);
+ if (bp->b_flags & B_DELWRI)
+ --numdirtybuffers;
+ bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+ }
+
+ /*
+ * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
+ * is called with B_DELWRI set, the underlying pages may wind up
+ * getting freed causing a previous write (bdwrite()) to get 'lost'
+ * because pages associated with a B_DELWRI bp are marked clean.
+ *
+ * We still allow the B_INVAL case to call vfs_vmio_release(), even
+ * if B_DELWRI is set.
+ */
+
+ if (bp->b_flags & B_DELWRI)
+ bp->b_flags &= ~B_RELBUF;
+
+ /*
+ * VMIO buffer rundown. It is not strictly necessary to keep a VMIO buffer
+ * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
+ * but the VM object is kept around. The B_NOCACHE flag is used to
+ * invalidate the pages in the VM object.
+ *
+ * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
+ * to b_offset and currently have byte granularity, whereas the
+ * valid flags in the vm_pages have only DEV_BSIZE resolution.
+ * The byte resolution fields are used to avoid unnecessary re-reads
+ * of the buffer, but the code really needs to be genericized so
+ * other filesystem modules can take advantage of these fields.
+ *
+ * XXX this seems to cause performance problems.
+ */
+ if ((bp->b_flags & B_VMIO)
+ && !(bp->b_vp->v_tag == VT_NFS &&
+ bp->b_vp->v_type != VBLK &&
+ (bp->b_flags & B_DELWRI) != 0)
+#ifdef notdef
+ && (bp->b_vp->v_tag != VT_NFS
+ || bp->b_vp->v_type == VBLK
+ || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
+ || bp->b_validend == 0
+ || (bp->b_validoff == 0
+ && bp->b_validend == bp->b_bufsize))
+#endif
+ ) {
+
+ int i, j, resid;
+ vm_page_t m;
+ off_t foff;
+ vm_pindex_t poff;
+ vm_object_t obj;
+ struct vnode *vp;
+
+ vp = bp->b_vp;
+
+ /*
+ * Get the base offset and length of the buffer. Note that
+ * for block sizes that are less than PAGE_SIZE, the b_data
+ * base of the buffer does not represent exactly b_offset and
+ * neither b_offset nor b_size are necessarily page aligned.
+ * Instead, the starting position of b_offset is:
+ *
+ * b_data + (b_offset & PAGE_MASK)
+ *
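+		 * For example (hypothetical numbers): a 2048 byte block
+		 * whose b_offset is 0x2800 has its data at b_data + 0x800,
+		 * and the resid/foff loop below then walks it one page
+		 * fragment at a time.
+		 *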
+		 * Block sizes less than DEV_BSIZE (usually 512) are not
+ * supported due to the page granularity bits (m->valid,
+ * m->dirty, etc...).
+ *
+ * See man buf(9) for more information
+ */
+
+ resid = bp->b_bufsize;
+ foff = bp->b_offset;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ vm_page_flag_clear(m, PG_ZERO);
+ if (m == bogus_page) {
+
+ obj = (vm_object_t) vp->v_object;
+ poff = OFF_TO_IDX(bp->b_offset);
+
+ for (j = i; j < bp->b_npages; j++) {
+ m = bp->b_pages[j];
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, poff + j);
+#if !defined(MAX_PERF)
+ if (!m) {
+ panic("brelse: page missing\n");
+ }
+#endif
+ bp->b_pages[j] = m;
+ }
+ }
+
+ if ((bp->b_flags & B_INVAL) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ }
+ if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
+ int poffset = foff & PAGE_MASK;
+ int presid = resid > (PAGE_SIZE - poffset) ?
+ (PAGE_SIZE - poffset) : resid;
+
+ KASSERT(presid >= 0, ("brelse: extra page"));
+ vm_page_set_invalid(m, poffset, presid);
+ }
+ resid -= PAGE_SIZE - (foff & PAGE_MASK);
+ foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ }
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+
+ } else if (bp->b_flags & B_VMIO) {
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+
+ }
+
+#if !defined(MAX_PERF)
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("brelse: free buffer onto another queue???");
+#endif
+
+ /* enqueue */
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ bp->b_flags |= B_INVAL;
+ bp->b_qindex = QUEUE_EMPTY;
+ TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+ kvafreespace += bp->b_kvasize;
+
+ /* buffers with junk contents */
+ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
+ bp->b_flags |= B_INVAL;
+ bp->b_qindex = QUEUE_AGE;
+ TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+
+ /* buffers that are locked */
+ } else if (bp->b_flags & B_LOCKED) {
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+
+ /* buffers with stale but valid contents */
+ } else if (bp->b_flags & B_AGE) {
+ bp->b_qindex = QUEUE_AGE;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
+
+	/* buffers with valid and quite potentially reusable contents */
+ } else {
+ bp->b_qindex = QUEUE_LRU;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ }
+
+ if ((bp->b_flags & B_INVAL) ||
+ (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
+ if (bp->b_flags & B_DELWRI) {
+ --numdirtybuffers;
+ bp->b_flags &= ~B_DELWRI;
+ }
+ vfs_bio_need_satisfy();
+ }
+
+ /* unlock */
+ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
+ B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ splx(s);
+}
+
+/*
+ * Release a buffer back to the appropriate queue without the page and
+ * VM object teardown that brelse() performs; used when the contents
+ * are expected to remain valid.
+ */
+void
+bqrelse(struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+
+ /* anyone need this block? */
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
+#if !defined(MAX_PERF)
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("bqrelse: free buffer onto another queue???");
+#endif
+
+ if (bp->b_flags & B_LOCKED) {
+ bp->b_flags &= ~B_ERROR;
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+ /* buffers with stale but valid contents */
+ } else {
+ bp->b_qindex = QUEUE_LRU;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ }
+
+ if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
+ vfs_bio_need_satisfy();
+ }
+
+ /* unlock */
+ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
+ B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ splx(s);
+}
+
+static void
+vfs_vmio_release(bp)
+ struct buf *bp;
+{
+ int i, s;
+ vm_page_t m;
+
+ s = splvm();
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ bp->b_pages[i] = NULL;
+ /*
+ * In order to keep page LRU ordering consistent, put
+ * everything on the inactive queue.
+ */
+ vm_page_unwire(m, 0);
+ /*
+ * We don't mess with busy pages, it is
+ * the responsibility of the process that
+ * busied the pages to deal with them.
+ */
+ if ((m->flags & PG_BUSY) || (m->busy != 0))
+ continue;
+
+ if (m->wire_count == 0) {
+ vm_page_flag_clear(m, PG_ZERO);
+ /*
+ * Might as well free the page if we can and it has
+ * no valid data.
+ */
+ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ }
+ }
+ }
+ splx(s);
+ bufspace -= bp->b_bufsize;
+ vmiospace -= bp->b_bufsize;
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ bp->b_npages = 0;
+ bp->b_bufsize = 0;
+ bp->b_flags &= ~B_VMIO;
+ if (bp->b_vp)
+ brelvp(bp);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+gbincore(struct vnode * vp, daddr_t blkno)
+{
+ struct buf *bp;
+ struct bufhashhdr *bh;
+
+ bh = BUFHASH(vp, blkno);
+ bp = bh->lh_first;
+
+ /* Search hash chain */
+ while (bp != NULL) {
+ /* hit */
+ if (bp->b_vp == vp && bp->b_lblkno == blkno &&
+ (bp->b_flags & B_INVAL) == 0) {
+ break;
+ }
+ bp = bp->b_hash.le_next;
+ }
+ return (bp);
+}
+
+/*
+ * this routine implements clustered async writes for
+ * clearing out B_DELWRI buffers... This is much better
+ * than the old way of writing only one buffer at a time.
+ */
+int
+vfs_bio_awrite(struct buf * bp)
+{
+ int i;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int s;
+ int ncl;
+ struct buf *bpa;
+ int nwritten;
+ int size;
+ int maxcl;
+
+ s = splbio();
+ /*
+ * right now we support clustered writing only to regular files
+ */
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
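+		/*
+		 * e.g. (hypothetical sizes): with MAXPHYS 128k and an 8k
+		 * f_iosize, maxcl is 16, so at most 16 contiguous dirty
+		 * blocks are examined for a single cluster below.
+		 */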
+
+ for (i = 1; i < maxcl; i++) {
+ if ((bpa = gbincore(vp, lblkno + i)) &&
+ ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ ncl = i;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ nwritten = cluster_wbuild(vp, size, lblkno, ncl);
+ splx(s);
+ return nwritten;
+ }
+ }
+
+ bremfree(bp);
+ bp->b_flags |= B_BUSY | B_ASYNC;
+
+ splx(s);
+ /*
+ * default (old) behavior, writing out only one block
+ */
+ nwritten = bp->b_bufsize;
+ (void) VOP_BWRITE(bp);
+ return nwritten;
+}
+
+
+/*
+ * Find a buffer header which is available for use.
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, daddr_t blkno,
+ int slpflag, int slptimeo, int size, int maxsize)
+{
+ struct buf *bp, *bp1;
+ int nbyteswritten = 0;
+ vm_offset_t addr;
+ static int writerecursion = 0;
+
+start:
+ if (bufspace >= maxbufspace)
+ goto trytofreespace;
+
+ /* can we constitute a new buffer? */
+ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
+#if !defined(MAX_PERF)
+ if (bp->b_qindex != QUEUE_EMPTY)
+ panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
+ bp->b_qindex);
+#endif
+ bp->b_flags |= B_BUSY;
+ bremfree(bp);
+ goto fillbuf;
+ }
+trytofreespace:
+ /*
+	 * We keep the file I/O from hogging metadata I/O.
+ * This is desirable because file data is cached in the
+ * VM/Buffer cache even if a buffer is freed.
+ */
+ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
+#if !defined(MAX_PERF)
+ if (bp->b_qindex != QUEUE_AGE)
+ panic("getnewbuf: inconsistent AGE queue, qindex=%d",
+ bp->b_qindex);
+#endif
+ } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
+#if !defined(MAX_PERF)
+ if (bp->b_qindex != QUEUE_LRU)
+ panic("getnewbuf: inconsistent LRU queue, qindex=%d",
+ bp->b_qindex);
+#endif
+ }
+ if (!bp) {
+ /* wait for a free buffer of any kind */
+ needsbuffer |= VFS_BIO_NEED_ANY;
+ do
+ tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
+ slptimeo);
+ while (needsbuffer & VFS_BIO_NEED_ANY);
+ return (0);
+ }
+ KASSERT(!(bp->b_flags & B_BUSY),
+ ("getnewbuf: busy buffer on free list\n"));
+ /*
+ * We are fairly aggressive about freeing VMIO buffers, but since
+	 * the data remains cached in the VM even without buffer headers,
+ * much loss. We gain by maintaining non-VMIOed metadata in buffers.
+ */
+ if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (vmiospace < maxvmiobufspace)) {
+ --bp->b_usecount;
+ TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ goto start;
+ }
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ }
+ }
+
+
+ /* if we are a delayed write, convert to an async write */
+ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
+
+ /*
+ * If our delayed write is likely to be used soon, then
+ * recycle back onto the LRU queue.
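+		 * "Likely to be used soon" here means within
+		 * MAXPHYS / maxsize blocks of the requested blkno;
+		 * e.g. 16 blocks for hypothetical 8k buffers and a
+		 * 128k MAXPHYS.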
+ */
+ if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
+ (bp->b_lblkno >= blkno) && (maxsize > 0)) {
+
+ if (bp->b_usecount > 0) {
+ if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
+
+ TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
+
+ if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ bp->b_usecount--;
+ goto start;
+ }
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ }
+ }
+ }
+
+ /*
+ * Certain layered filesystems can recursively re-enter the vfs_bio
+ * code, due to delayed writes. This helps keep the system from
+ * deadlocking.
+ */
+ if (writerecursion > 0) {
+ if (writerecursion > 5) {
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
+ while (bp) {
+ if ((bp->b_flags & B_DELWRI) == 0)
+ break;
+ bp = TAILQ_NEXT(bp, b_freelist);
+ }
+ if (bp == NULL) {
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
+ while (bp) {
+ if ((bp->b_flags & B_DELWRI) == 0)
+ break;
+ bp = TAILQ_NEXT(bp, b_freelist);
+ }
+ }
+ if (bp == NULL)
+ panic("getnewbuf: cannot get buffer, infinite recursion failure");
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
+ nbyteswritten += bp->b_bufsize;
+ ++writerecursion;
+ VOP_BWRITE(bp);
+ --writerecursion;
+ if (!slpflag && !slptimeo) {
+ return (0);
+ }
+ goto start;
+ }
+ } else {
+ ++writerecursion;
+ nbyteswritten += vfs_bio_awrite(bp);
+ --writerecursion;
+ if (!slpflag && !slptimeo) {
+ return (0);
+ }
+ goto start;
+ }
+ }
+
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~B_WANTED;
+ wakeup(bp);
+ }
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+
+ if (bp->b_vp)
+ brelvp(bp);
+
+fillbuf:
+
+ /* we are not free, nor do we contain interesting data */
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL &&
+ bioops.io_deallocate)
+ (*bioops.io_deallocate)(bp);
+
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ if (bp->b_bufsize) {
+ allocbuf(bp, 0);
+ }
+ bp->b_flags = B_BUSY;
+ bp->b_dev = NODEV;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_validoff = bp->b_validend = 0;
+ bp->b_usecount = 5;
+	/* Here, not kern_physio.c, is where this should be done */
+ LIST_INIT(&bp->b_dep);
+
+ maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
+
+ /*
+ * we assume that buffer_map is not at address 0
+ */
+ addr = 0;
+ if (maxsize != bp->b_kvasize) {
+ bfreekva(bp);
+
+findkvaspace:
+ /*
+ * See if we have buffer kva space
+ */
+ if (vm_map_findspace(buffer_map,
+ vm_map_min(buffer_map), maxsize, &addr)) {
+ if (kvafreespace > 0) {
+ int totfree = 0, freed;
+ do {
+ freed = 0;
+ for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
+ if (bp1->b_kvasize != 0) {
+ totfree += bp1->b_kvasize;
+ freed = bp1->b_kvasize;
+ bremfree(bp1);
+ bfreekva(bp1);
+ brelse(bp1);
+ break;
+ }
+ }
+ } while (freed);
+ /*
+ * if we found free space, then retry with the same buffer.
+ */
+ if (totfree)
+ goto findkvaspace;
+ }
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto trytofreespace;
+ }
+ }
+
+ /*
+	 * See if we have exceeded our buffer space allocation
+ */
+ if (bufspace >= (maxbufspace + nbyteswritten)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto trytofreespace;
+ }
+
+ /*
+ * create a map entry for the buffer -- in essence
+ * reserving the kva space.
+ */
+ if (addr) {
+ vm_map_insert(buffer_map, NULL, 0,
+ addr, addr + maxsize,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+
+ bp->b_kvabase = (caddr_t) addr;
+ bp->b_kvasize = maxsize;
+ }
+ bp->b_data = bp->b_kvabase;
+
+ return (bp);
+}
+
+static void
+waitfreebuffers(int slpflag, int slptimeo) {
+ while (numfreebuffers < hifreebuffers) {
+ flushdirtybuffers(slpflag, slptimeo);
+		if (numfreebuffers >= hifreebuffers)
+ break;
+ needsbuffer |= VFS_BIO_NEED_FREE;
+ if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
+ break;
+ }
+}
+
+static void
+flushdirtybuffers(int slpflag, int slptimeo) {
+ int s;
+ static pid_t flushing = 0;
+
+ s = splbio();
+
+ if (flushing) {
+ if (flushing == curproc->p_pid) {
+ splx(s);
+ return;
+ }
+ while (flushing) {
+ if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
+ splx(s);
+ return;
+ }
+ }
+ }
+ flushing = curproc->p_pid;
+
+ while (numdirtybuffers > lodirtybuffers) {
+ struct buf *bp;
+ needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
+ if (bp == NULL)
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
+
+ while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
+ bp = TAILQ_NEXT(bp, b_freelist);
+ }
+
+ if (bp) {
+ vfs_bio_awrite(bp);
+ continue;
+ }
+ break;
+ }
+
+ flushing = 0;
+ wakeup(&flushing);
+ splx(s);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+incore(struct vnode * vp, daddr_t blkno)
+{
+ struct buf *bp;
+
+ int s = splbio();
+ bp = gbincore(vp, blkno);
+ splx(s);
+ return (bp);
+}
+
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
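+ *
+ * (Hypothetical example: for an 8k filesystem block backed by 4k
+ * pages, inmem() returns 1 only when both pages are valid across
+ * the block's byte range.)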
+ */
+
+int
+inmem(struct vnode * vp, daddr_t blkno)
+{
+ vm_object_t obj;
+ vm_offset_t toff, tinc, size;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ if (incore(vp, blkno))
+ return 1;
+ if (vp->v_mount == NULL)
+ return 0;
+ if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
+ return 0;
+
+ obj = vp->v_object;
+ size = PAGE_SIZE;
+ if (size > vp->v_mount->mnt_stat.f_iosize)
+ size = vp->v_mount->mnt_stat.f_iosize;
+ off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ return 0;
+ tinc = size;
+ if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
+ tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
+ if (vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Set the dirty range for the buffer. This matters for NFS: if the
+ * file is mapped and pages have been written to, we want the entire
+ * range of the buffer marked dirty if any of its pages are dirty,
+ * for consistency with the b_validoff, b_validend set in the NFS
+ * write code and used by the NFS read code.
+ */
+static void
+vfs_setdirty(struct buf *bp) {
+ int i;
+ vm_object_t object;
+ vm_offset_t boffset;
+#if 0
+ vm_offset_t offset;
+#endif
+
+ /*
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet. The OBJ_WRITEABLE flag
+ * is not cleared simply by protecting pages off.
+ */
+ if ((bp->b_flags & B_VMIO) &&
+ ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
+ vm_page_test_dirty(bp->b_pages[i]);
+ }
+
+ /*
+ * scan forwards for the first page modified
+ */
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+
+ boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+ if (boffset < bp->b_dirtyoff) {
+ bp->b_dirtyoff = max(boffset, 0);
+ }
+
+ /*
+ * scan backwards for the last page modified
+ */
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ boffset = (i + 1);
+#if 0
+ offset = boffset + bp->b_pages[0]->pindex;
+ if (offset >= object->size)
+ boffset = object->size - bp->b_pages[0]->pindex;
+#endif
+ boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+ if (bp->b_dirtyend < boffset)
+ bp->b_dirtyend = min(boffset, bp->b_bufsize);
+ }
+}
+
+/*
+ * Get a block given a specified block and offset into a file/device.
+ */
+struct buf *
+getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
+{
+ struct buf *bp;
+ int i, s;
+ struct bufhashhdr *bh;
+
+#if !defined(MAX_PERF)
+ if (size > MAXBSIZE)
+ panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+#endif
+
+ s = splbio();
+loop:
+ if (numfreebuffers < lofreebuffers) {
+ waitfreebuffers(slpflag, slptimeo);
+ }
+
+ if ((bp = gbincore(vp, blkno))) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ if (bp->b_usecount < BUF_MAXUSE)
+ ++bp->b_usecount;
+
+ if (!tsleep(bp,
+ (PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
+ goto loop;
+ }
+
+ splx(s);
+ return (struct buf *) NULL;
+ }
+ bp->b_flags |= B_BUSY | B_CACHE;
+ bremfree(bp);
+
+ /*
+		 * Check for size inconsistencies in the non-VMIO case.
+ */
+
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)
+ ) {
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ VOP_BWRITE(bp);
+ } else {
+ if ((bp->b_flags & B_VMIO) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ VOP_BWRITE(bp);
+ }
+ }
+ goto loop;
+ }
+ }
+
+ /*
+		 * If the size is inconsistent in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting cleared.
+ */
+
+ if (bp->b_bcount != size)
+ allocbuf(bp, size);
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("getblk: no buffer offset"));
+
+ /*
+		 * Check that the constituted buffer really deserves the
+		 * B_CACHE bit. B_VMIO type buffers might not contain
+		 * fully valid pages. Normal (old-style) buffers should
+		 * be fully valid. This might also lead to B_CACHE
+		 * getting cleared.
+ */
+		if ((bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE)) {
+ int checksize = bp->b_bufsize;
+ int poffset = bp->b_offset & PAGE_MASK;
+ int resid;
+ for (i = 0; i < bp->b_npages; i++) {
+ resid = (checksize > (PAGE_SIZE - poffset)) ?
+ (PAGE_SIZE - poffset) : checksize;
+ if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
+ bp->b_flags &= ~(B_CACHE | B_DONE);
+ break;
+ }
+ checksize -= resid;
+ poffset = 0;
+ }
+ }
+
+ /*
+ * If B_DELWRI is set and B_CACHE got cleared ( or was
+ * already clear ), we have to commit the write and
+ * retry. The NFS code absolutely depends on this,
+		 * and so might the FFS code. In any case, it formalizes
+ * the B_CACHE rules. See sys/buf.h.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ VOP_BWRITE(bp);
+ goto loop;
+ }
+
+ if (bp->b_usecount < BUF_MAXUSE)
+ ++bp->b_usecount;
+ splx(s);
+ return (bp);
+ } else {
+ int bsize, maxsize, vmio;
+ off_t offset;
+
+ if (vp->v_type == VBLK)
+ bsize = DEV_BSIZE;
+ else if (vp->v_mountedhere)
+ bsize = vp->v_mountedhere->mnt_stat.f_iosize;
+ else if (vp->v_mount)
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ else
+ bsize = size;
+
+ offset = (off_t)blkno * bsize;
+ vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
+ maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+ maxsize = imax(maxsize, bsize);
+
+ if ((bp = getnewbuf(vp, blkno,
+ slpflag, slptimeo, size, maxsize)) == 0) {
+ if (slpflag || slptimeo) {
+ splx(s);
+ return NULL;
+ }
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * Normally the vnode is locked so this isn't a problem.
+ * VBLK type I/O requests, however, don't lock the vnode.
+ */
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bp->b_offset = offset;
+
+ bgetvp(vp, bp);
+ LIST_REMOVE(bp, b_hash);
+ bh = BUFHASH(vp, blkno);
+ LIST_INSERT_HEAD(bh, bp, b_hash);
+
+ if (vmio) {
+ bp->b_flags |= (B_VMIO | B_CACHE);
+#if defined(VFS_BIO_DEBUG)
+ if (vp->v_type != VREG && vp->v_type != VBLK)
+ printf("getblk: vmioing file type %d???\n", vp->v_type);
+#endif
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ }
+
+ allocbuf(bp, size);
+
+ splx(s);
+ return (bp);
+ }
+}
+
+/*
+ * Get an empty, disassociated buffer of given size.
+ */
+struct buf *
+geteblk(int size)
+{
+ struct buf *bp;
+ int s;
+
+ s = splbio();
+ while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
+ splx(s);
+ allocbuf(bp, size);
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ return (bp);
+}
+
+
+/*
+ * This code constitutes the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
+ */
+
+int
+allocbuf(struct buf *bp, int size)
+{
+ int newbsize, mbsize;
+ int i;
+
+#if !defined(MAX_PERF)
+ if (!(bp->b_flags & B_BUSY))
+ panic("allocbuf: buffer not busy");
+
+ if (bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+#endif
+
+ if ((bp->b_flags & B_VMIO) == 0) {
+ caddr_t origbuf;
+ int origbufsize;
+ /*
+ * Just get anonymous memory from the kernel
+ */
+ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+#if !defined(NO_B_MALLOC)
+ if (bp->b_flags & B_MALLOC)
+ newbsize = mbsize;
+ else
+#endif
+ newbsize = round_page(size);
+
+ if (newbsize < bp->b_bufsize) {
+#if !defined(NO_B_MALLOC)
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (bp->b_flags & B_MALLOC) {
+ if (newbsize) {
+ bp->b_bcount = size;
+ } else {
+ free(bp->b_data, M_BIOBUF);
+ bufspace -= bp->b_bufsize;
+ bufmallocspace -= bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ bp->b_bufsize = 0;
+ bp->b_bcount = 0;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return 1;
+ }
+#endif
+ vm_hold_free_pages(
+ bp,
+ (vm_offset_t) bp->b_data + newbsize,
+ (vm_offset_t) bp->b_data + bp->b_bufsize);
+ } else if (newbsize > bp->b_bufsize) {
+#if !defined(NO_B_MALLOC)
+ /*
+		 * We only use malloced memory on the first allocation,
+		 * and revert to page-allocated memory when the buffer
+		 * grows.
+ */
+ if ( (bufmallocspace < maxbufmallocspace) &&
+ (bp->b_bufsize == 0) &&
+ (mbsize <= PAGE_SIZE/2)) {
+
+ bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
+ bp->b_bufsize = mbsize;
+ bp->b_bcount = size;
+ bp->b_flags |= B_MALLOC;
+ bufspace += mbsize;
+ bufmallocspace += mbsize;
+ return 1;
+ }
+#endif
+ origbuf = NULL;
+ origbufsize = 0;
+#if !defined(NO_B_MALLOC)
+ /*
+ * If the buffer is growing on its other-than-first allocation,
+ * then we revert to the page-allocation scheme.
+ */
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ bufspace -= bp->b_bufsize;
+ bufmallocspace -= bp->b_bufsize;
+ bp->b_bufsize = 0;
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+#endif
+ vm_hold_load_pages(
+ bp,
+ (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+#if !defined(NO_B_MALLOC)
+ if (origbuf) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+#endif
+ }
+ } else {
+ vm_page_t m;
+ int desiredpages;
+
+ newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ desiredpages = (size == 0) ? 0 :
+ num_pages((bp->b_offset & PAGE_MASK) + newbsize);
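+		/*
+		 * e.g. (hypothetical): an 8192 byte buffer whose b_offset
+		 * begins 0x800 bytes into a page needs 3 pages, not 2,
+		 * because of the leading page fragment.
+		 */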
+
+#if !defined(NO_B_MALLOC)
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+#endif
+
+ if (newbsize < bp->b_bufsize) {
+ if (desiredpages < bp->b_npages) {
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ /*
+ * the page is not freed here -- it
+ * is the responsibility of vnode_pager_setsize
+ */
+ m = bp->b_pages[i];
+ KASSERT(m != bogus_page,
+ ("allocbuf: bogus page found"));
+ while (vm_page_sleep_busy(m, TRUE, "biodep"))
+ ;
+
+ bp->b_pages[i] = NULL;
+ vm_page_unwire(m, 0);
+ }
+ pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
+ bp->b_npages = desiredpages;
+ }
+ } else if (newbsize > bp->b_bufsize) {
+ vm_object_t obj;
+ vm_offset_t tinc, toff;
+ vm_ooffset_t off;
+ vm_pindex_t objoff;
+ int pageindex, curbpnpages;
+ struct vnode *vp;
+ int bsize;
+ int orig_validoff = bp->b_validoff;
+ int orig_validend = bp->b_validend;
+
+ vp = bp->b_vp;
+
+ if (vp->v_type == VBLK)
+ bsize = DEV_BSIZE;
+ else
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+
+ if (bp->b_npages < desiredpages) {
+ obj = vp->v_object;
+ tinc = PAGE_SIZE;
+
+ off = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("allocbuf: no buffer offset"));
+ curbpnpages = bp->b_npages;
+ doretry:
+ bp->b_validoff = orig_validoff;
+ bp->b_validend = orig_validend;
+ bp->b_flags |= B_CACHE;
+ for (toff = 0; toff < newbsize; toff += tinc) {
+ objoff = OFF_TO_IDX(off + toff);
+ pageindex = objoff - OFF_TO_IDX(off);
+ tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
+ if (pageindex < curbpnpages) {
+
+ m = bp->b_pages[pageindex];
+#ifdef VFS_BIO_DIAG
+ if (m->pindex != objoff)
+ panic("allocbuf: page changed offset?!!!?");
+#endif
+ if (tinc > (newbsize - toff))
+ tinc = newbsize - toff;
+ if (bp->b_flags & B_CACHE)
+ vfs_buf_set_valid(bp, off, toff, tinc, m);
+ continue;
+ }
+ m = vm_page_lookup(obj, objoff);
+ if (!m) {
+ m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
+ if (!m) {
+ VM_WAIT;
+ vm_pageout_deficit += (desiredpages - curbpnpages);
+ goto doretry;
+ }
+
+ vm_page_wire(m);
+ vm_page_wakeup(m);
+ bp->b_flags &= ~B_CACHE;
+
+ } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) {
+ /*
+ * If we had to sleep, retry.
+ *
+ * Also note that we only test
+ * PG_BUSY here, not m->busy.
+ *
+ * We cannot sleep on m->busy
+ * here because a vm_fault ->
+ * getpages -> cluster-read ->
+ * ...-> allocbuf sequence
+ * will convert PG_BUSY to
+ * m->busy so we have to let
+ * m->busy through if we do
+ * not want to deadlock.
+ */
+ goto doretry;
+ } else {
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))) {
+ pagedaemon_wakeup();
+ }
+ if (tinc > (newbsize - toff))
+ tinc = newbsize - toff;
+ if (bp->b_flags & B_CACHE)
+ vfs_buf_set_valid(bp, off, toff, tinc, m);
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m);
+ }
+ bp->b_pages[pageindex] = m;
+ curbpnpages = pageindex + 1;
+ }
+ if (vp->v_tag == VT_NFS &&
+ vp->v_type != VBLK) {
+ if (bp->b_dirtyend > 0) {
+ bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
+ bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ }
+ if (bp->b_validend == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
+ bp->b_npages = curbpnpages;
+ pmap_qenter((vm_offset_t) bp->b_data,
+ bp->b_pages, bp->b_npages);
+			bp->b_data = (caddr_t) ((vm_offset_t) bp->b_data |
+			    (off & PAGE_MASK));
+ }
+ }
+ }
+ if (bp->b_flags & B_VMIO)
+ vmiospace += (newbsize - bp->b_bufsize);
+ bufspace += (newbsize - bp->b_bufsize);
+ bp->b_bufsize = newbsize;
+ bp->b_bcount = size;
+ return 1;
+}
+
+/*
+ * Wait for buffer I/O completion, returning error status.
+ */
+int
+biowait(register struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+ while ((bp->b_flags & B_DONE) == 0)
+#if defined(NO_SCHEDULE_MODS)
+ tsleep(bp, PRIBIO, "biowait", 0);
+#else
+ if (bp->b_flags & B_READ)
+ tsleep(bp, PRIBIO, "biord", 0);
+ else
+ tsleep(bp, PRIBIO, "biowr", 0);
+#endif
+ splx(s);
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_flags & B_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Finish I/O on a buffer, calling an optional function.
+ * This is usually called from interrupt level, so process blocking
+ * is not *a good idea*.
+ */
+void
+biodone(register struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+
+#if !defined(MAX_PERF)
+ if (!(bp->b_flags & B_BUSY))
+ panic("biodone: buffer not busy");
+#endif
+
+ if (bp->b_flags & B_DONE) {
+ splx(s);
+#if !defined(MAX_PERF)
+ printf("biodone: buffer already done\n");
+#endif
+ return;
+ }
+ bp->b_flags |= B_DONE;
+
+ if (bp->b_flags & B_FREEBUF) {
+ brelse(bp);
+ splx(s);
+ return;
+ }
+
+ if ((bp->b_flags & B_READ) == 0) {
+ vwakeup(bp);
+ }
+
+ /* call optional completion function if requested */
+ if (bp->b_flags & B_CALL) {
+ bp->b_flags &= ~B_CALL;
+ (*bp->b_iodone) (bp);
+ splx(s);
+ return;
+ }
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
+ (*bioops.io_complete)(bp);
+
+ if (bp->b_flags & B_VMIO) {
+ int i, resid;
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ int iosize;
+ struct vnode *vp = bp->b_vp;
+
+ obj = vp->v_object;
+
+#if defined(VFS_BIO_DEBUG)
+ if (vp->v_usecount == 0) {
+ panic("biodone: zero vnode ref count");
+ }
+
+ if (vp->v_object == NULL) {
+ panic("biodone: missing VM object");
+ }
+
+ if ((vp->v_flag & VOBJBUF) == 0) {
+ panic("biodone: vnode is not setup for merged cache");
+ }
+#endif
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("biodone: no buffer offset"));
+
+#if !defined(MAX_PERF)
+ if (!obj) {
+ panic("biodone: no object");
+ }
+#endif
+#if defined(VFS_BIO_DEBUG)
+ if (obj->paging_in_progress < bp->b_npages) {
+ printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
+ obj->paging_in_progress, bp->b_npages);
+ }
+#endif
+ iosize = bp->b_bufsize;
+ for (i = 0; i < bp->b_npages; i++) {
+ int bogusflag = 0;
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogusflag = 1;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (!m) {
+#if defined(VFS_BIO_DEBUG)
+ printf("biodone: page disappeared\n");
+#endif
+ vm_object_pip_subtract(obj, 1);
+ continue;
+ }
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+#if defined(VFS_BIO_DEBUG)
+ if (OFF_TO_IDX(foff) != m->pindex) {
+ printf(
+"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
+ (unsigned long)foff, m->pindex);
+ }
+#endif
+ resid = IDX_TO_OFF(m->pindex + 1) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly, so we only need to do this
+ * here in the read case.
+ */
+ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
+ vfs_page_set_valid(bp, foff, i, m);
+ }
+ vm_page_flag_clear(m, PG_ZERO);
+
+ /*
+ * when debugging new filesystems or buffer I/O methods, this
+ * is the most common error that pops up. if you see this, you
+ * have not set the page busy flag correctly!!!
+ */
+ if (m->busy == 0) {
+#if !defined(MAX_PERF)
+ printf("biodone: page busy < 0, "
+ "pindex: %d, foff: 0x(%x,%x), "
+ "resid: %d, index: %d\n",
+ (int) m->pindex, (int)(foff >> 32),
+ (int) foff & 0xffffffff, resid, i);
+			if (vp->v_type != VBLK)
+ printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
+ bp->b_vp->v_mount->mnt_stat.f_iosize,
+ (int) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ else
+ printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
+ (int) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
+ m->valid, m->dirty, m->wire_count);
+#endif
+ panic("biodone: page busy < 0\n");
+ }
+ vm_page_io_finish(m);
+ vm_object_pip_subtract(obj, 1);
+ foff += resid;
+ iosize -= resid;
+ }
+ if (obj)
+ vm_object_pip_wakeupn(obj, 0);
+ }
+ /*
+ * For asynchronous completions, release the buffer now. The brelse
+ * checks for B_WANTED and will do the wakeup there if necessary - so
+ * no need to do a wakeup here in the async case.
+ */
+
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
+ brelse(bp);
+ else
+ bqrelse(bp);
+ } else {
+ bp->b_flags &= ~B_WANTED;
+ wakeup(bp);
+ }
+ splx(s);
+}
+
+#if 0 /* not with kirks code */
+static int vfs_update_interval = 30;
+
+static void
+vfs_update()
+{
+ while (1) {
+ tsleep(&vfs_update_wakeup, PUSER, "update",
+ hz * vfs_update_interval);
+ vfs_update_wakeup = 0;
+ sync(curproc, NULL);
+ }
+}
+
+static int
+sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
+{
+ int error = sysctl_handle_int(oidp,
+ oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error)
+ wakeup(&vfs_update_wakeup);
+ return error;
+}
+
+SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
+ &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
+
+#endif
+
+
+/*
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
+ */
+void
+vfs_unbusy_pages(struct buf * bp)
+{
+ int i;
+
+ if (bp->b_flags & B_VMIO) {
+ struct vnode *vp = bp->b_vp;
+ vm_object_t obj = vp->v_object;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
+#if !defined(MAX_PERF)
+ if (!m) {
+ panic("vfs_unbusy_pages: page missing\n");
+ }
+#endif
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ vm_object_pip_subtract(obj, 1);
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_io_finish(m);
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ }
+}
+
+/*
+ * Set NFS' b_validoff and b_validend fields from the valid bits
+ * of a page. If the consumer is not NFS, and the page is not
+ * valid for the entire range, clear the B_CACHE flag to force
+ * the consumer to re-read the page.
+ *
+ * B_CACHE interaction is especially tricky.
+ */
+static void
+vfs_buf_set_valid(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+ if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
+ vm_offset_t svalid, evalid;
+ int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
+
+ /*
+ * This only bothers with the first valid range in the
+ * page.
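+		 *
+		 * Hypothetical example: with DEV_BSIZE 512 and foff+off
+		 * page aligned, m->valid == 0x1e makes the scans below
+		 * yield svalid == off + 512 and evalid == off + 2560
+		 * (the run of four valid 512 byte blocks).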
+ */
+ svalid = off;
+ while (validbits && !(validbits & 1)) {
+ svalid += DEV_BSIZE;
+ validbits >>= 1;
+ }
+ evalid = svalid;
+ while (validbits & 1) {
+ evalid += DEV_BSIZE;
+ validbits >>= 1;
+ }
+ evalid = min(evalid, off + size);
+ /*
+ * We can only set b_validoff/end if this range is contiguous
+ * with the range built up already. If we cannot set
+ * b_validoff/end, we must clear B_CACHE to force an update
+ * to clean the bp up.
+ */
+ if (svalid == bp->b_validend) {
+ bp->b_validoff = min(bp->b_validoff, svalid);
+ bp->b_validend = max(bp->b_validend, evalid);
+ } else {
+ bp->b_flags &= ~B_CACHE;
+ }
+ } else if (!vm_page_is_valid(m,
+ (vm_offset_t) ((foff + off) & PAGE_MASK),
+ size)) {
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+/*
+ * Set the valid bits in a page, taking care of the b_validoff,
+ * b_validend fields which NFS uses to optimise small reads. Off is
+ * the offset within the file and pageno is the page index within the buf.
+ *
+ * XXX we have to set the valid & clean bits for all page fragments
+ * touched by b_validoff/validend, even if the page fragment goes somewhat
+ * beyond b_validoff/validend due to alignment.
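+ *
+ * Hypothetical example: with b_offset 0, b_validoff 700 and
+ * b_validend 1500, sv rounds up to 1024 and ev up to 1536, so only
+ * the byte range [1024, 1536) of the page is marked valid and clean.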
+ */
+static void
+vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
+{
+ struct vnode *vp = bp->b_vp;
+ vm_ooffset_t soff, eoff;
+
+ soff = off;
+ eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
+ if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
+ vm_ooffset_t sv, ev;
+ vm_page_set_invalid(m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff));
+ sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) &
+ ~(DEV_BSIZE - 1);
+ soff = qmax(sv, soff);
+ eoff = qmin(ev, eoff);
+ }
+ if (eoff > soff)
+ vm_page_set_validclean(m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff));
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being PG_BUSY. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ */
+void
+vfs_busy_pages(struct buf * bp, int clear_modify)
+{
+ int i, bogus;
+
+ if (bp->b_flags & B_VMIO) {
+ struct vnode *vp = bp->b_vp;
+ vm_object_t obj = vp->v_object;
+ vm_ooffset_t foff;
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_busy_pages: no buffer offset"));
+ vfs_setdirty(bp);
+
+retry:
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ if (vm_page_sleep_busy(m, FALSE, "vbpage"))
+ goto retry;
+ }
+
+ bogus = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+
+ vm_page_flag_clear(m, PG_ZERO);
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vm_object_pip_add(obj, 1);
+ vm_page_io_start(m);
+ }
+
+ vm_page_protect(m, VM_PROT_NONE);
+ if (clear_modify)
+ vfs_page_set_valid(bp, foff, i, m);
+ else if (m->valid == VM_PAGE_BITS_ALL &&
+ (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ bogus++;
+ }
+ foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ }
+ if (bogus)
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ */
+void
+vfs_clean_pages(struct buf * bp)
+{
+ int i;
+
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_clean_pages: no buffer offset"));
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ vfs_page_set_valid(bp, foff, i, m);
+ foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ }
+ }
+}
+
+void
+vfs_bio_clrbuf(struct buf *bp) {
+ int i, mask = 0;
+ caddr_t sa, ea;
+ if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+ (bp->b_offset & PAGE_MASK) == 0) {
+ mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
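+			/*
+			 * e.g. a 2048 byte buffer covers four DEV_BSIZE
+			 * (512 byte) chunks, so mask == 0x0f here.
+			 */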
+ if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
+ ((bp->b_pages[0]->valid & mask) != mask)) {
+ bzero(bp->b_data, bp->b_bufsize);
+ }
+ bp->b_pages[0]->valid |= mask;
+ bp->b_resid = 0;
+ return;
+ }
+ ea = sa = bp->b_data;
+ for(i=0;i<bp->b_npages;i++,sa=ea) {
+ int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
+ ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
+ ea = (caddr_t)ulmin((u_long)ea,
+ (u_long)bp->b_data + bp->b_bufsize);
+ mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
+ if ((bp->b_pages[i]->valid & mask) == mask)
+ continue;
+ if ((bp->b_pages[i]->valid & mask) == 0) {
+ if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
+ bzero(sa, ea - sa);
+ }
+ } else {
+ for (; sa < ea; sa += DEV_BSIZE, j++) {
+ if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
+ (bp->b_pages[i]->valid & (1<<j)) == 0)
+ bzero(sa, DEV_BSIZE);
+ }
+ }
+ bp->b_pages[i]->valid |= mask;
+ vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
+ }
+ bp->b_resid = 0;
+ } else {
+ clrbuf(bp);
+ }
+}
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
+ * a buffer's address space. The pages are anonymous and are not
+ * associated with a file object.
+ */
+void
+vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+
+tryagain:
+
+ p = vm_page_alloc(kernel_object,
+ ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_NORMAL);
+ if (!p) {
+ vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
+ VM_WAIT;
+ goto tryagain;
+ }
+ vm_page_wire(p);
+ p->valid = VM_PAGE_BITS_ALL;
+ vm_page_flag_clear(p, PG_ZERO);
+ pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
+ bp->b_pages[index] = p;
+ vm_page_wakeup(p);
+ }
+ bp->b_npages = index;
+}
+
+void
+vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index, newnpages;
+
+ from = round_page(from);
+ to = round_page(to);
+ newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+ p = bp->b_pages[index];
+ if (p && (index < bp->b_npages)) {
+#if !defined(MAX_PERF)
+ if (p->busy) {
+ printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
+ bp->b_blkno, bp->b_lblkno);
+ }
+#endif
+ bp->b_pages[index] = NULL;
+ pmap_kremove(pg);
+ vm_page_busy(p);
+ vm_page_unwire(p, 0);
+ vm_page_free(p);
+ }
+ }
+ bp->b_npages = newnpages;
+}
+
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(buffer, db_show_buffer)
+{
+ /* get args */
+ struct buf *bp = (struct buf *)addr;
+
+ if (!have_addr) {
+ db_printf("usage: show buffer <addr>\n");
+ return;
+ }
+
+ db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
+ (u_int)bp->b_flags, PRINT_BUF_FLAGS);
+ db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
+ "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
+ "b_blkno = %d, b_pblkno = %d\n",
+ bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
+ bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
+ if (bp->b_npages) {
+ int i;
+ db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m;
+ m = bp->b_pages[i];
+ db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
+ (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
+ if ((i + 1) < bp->b_npages)
+ db_printf(",");
+ }
+ db_printf("\n");
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
new file mode 100644
index 0000000..a8ac5e7
--- /dev/null
+++ b/sys/kern/vfs_cache.c
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Poul-Henning Kamp of the FreeBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ * $Id: vfs_cache.c,v 1.37 1997/12/19 23:18:37 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+
+
+/*
+ * Name caching works as follows:
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference. It is managed LRU, so frequently
+ * used names will hang around. Cache is indexed by hash value
+ * obtained from (vp, name) where vp refers to the directory
+ * containing name.
+ *
+ * If it is a "negative" entry, (i.e. for a name that is known NOT to
+ * exist) the vnode pointer will be NULL.
+ *
+ * Upon reaching the last segment of a path, if the reference
+ * is for DELETE, or NOCACHE is set (rewrite), and the
+ * name is located in the cache, it will be dropped.
+ */
+
+/*
+ * Structures associated with name caching.
+ */
+#define NCHHASH(dvp, cnp) \
+ (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash])
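+/*
+ * For example (hypothetical lookup): resolving "conf" under /etc adds
+ * /etc's v_id to the cn_hash that namei precomputed for "conf" and
+ * masks with nchash, so every entry for that (directory, name) pair
+ * lands on the same chain.
+ */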
+static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
+static TAILQ_HEAD(, namecache) ncneg;	/* LRU list of negative entries */
+static u_long nchash; /* size of hash table */
+SYSCTL_INT(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
+static u_long ncnegfactor = 16; /* ratio of negative entries */
+SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
+static u_long numneg;	/* number of negative entries allocated */
+SYSCTL_INT(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
+static u_long numcache; /* number of cache entries allocated */
+SYSCTL_INT(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
+struct nchstats nchstats; /* cache effectiveness statistics */
+
+static int doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
+SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
+SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
+
+/*
+ * The new name cache statistics
+ */
+SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
+#define STATNODE(mode, name, var) \
+ SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
+STATNODE(CTLFLAG_RD, numneg, &numneg);
+STATNODE(CTLFLAG_RD, numcache, &numcache);
+static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
+static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
+static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
+static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
+static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
+static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
+static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
+static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
+static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
+
+
+static void cache_zap __P((struct namecache *ncp));
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE 1
+/*
+ * Delete an entry from its hash list and move it to the front
+ * of the LRU list for immediate reuse.
+ */
+static void
+cache_zap(ncp)
+ struct namecache *ncp;
+{
+ LIST_REMOVE(ncp, nc_hash);
+ LIST_REMOVE(ncp, nc_src);
+ if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src))
+ vdrop(ncp->nc_dvp);
+ if (ncp->nc_vp) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+ } else {
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ numneg--;
+ }
+ numcache--;
+ free(ncp, M_CACHE);
+}
+
+/*
+ * Lookup an entry in the cache
+ *
+ * We don't do this if the segment name is long, simply so the cache
+ * can avoid holding long names (which would either waste space, or
+ * add greatly to the complexity).
+ *
+ * Lookup is called with dvp pointing to the directory to search,
+ * cnp pointing to the name of the entry being sought. If the lookup
+ * succeeds, the vnode is returned in *vpp, and a status of -1 is
+ * returned. If the lookup determines that the name does not exist
+ * (negative caching), a status of ENOENT is returned. If the lookup
+ * fails, a status of zero is returned.
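+ *
+ * (vfs_cache_lookup() at the bottom of this file shows the intended
+ * calling pattern for filesystems that use VOP_CACHEDLOOKUP.)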
+ */
+
+int
+cache_lookup(dvp, vpp, cnp)
+ struct vnode *dvp;
+ struct vnode **vpp;
+ struct componentname *cnp;
+{
+ register struct namecache *ncp;
+
+ if (!doingcache) {
+ cnp->cn_flags &= ~MAKEENTRY;
+ return (0);
+ }
+
+ numcalls++;
+
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ *vpp = dvp;
+ dothits++;
+ return (-1);
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ dotdothits++;
+ if (dvp->v_dd->v_id != dvp->v_ddid ||
+ (cnp->cn_flags & MAKEENTRY) == 0) {
+ dvp->v_ddid = 0;
+ return (0);
+ }
+ *vpp = dvp->v_dd;
+ return (-1);
+ }
+ }
+
+ LIST_FOREACH(ncp, (NCHHASH(dvp, cnp)), nc_hash) {
+ numchecks++;
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == 0) {
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ nummisszap++;
+ } else {
+ nummiss++;
+ }
+ nchstats.ncs_miss++;
+ return (0);
+ }
+
+ /* We don't want to have an entry, so dump it */
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ numposzaps++;
+ nchstats.ncs_badhits++;
+ cache_zap(ncp);
+ return (0);
+ }
+
+ /* We found a "positive" match, return the vnode */
+ if (ncp->nc_vp) {
+ numposhits++;
+ nchstats.ncs_goodhits++;
+ *vpp = ncp->nc_vp;
+ return (-1);
+ }
+
+ /* We found a negative match, and want to create it, so purge */
+ if (cnp->cn_nameiop == CREATE) {
+ numnegzaps++;
+ nchstats.ncs_badhits++;
+ cache_zap(ncp);
+ return (0);
+ }
+
+ numneghits++;
+ /*
+ * We found a "negative" match, ENOENT notifies client of this match.
+	 * The nc_flag field records whether this is a whiteout.
+ */
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ nchstats.ncs_neghits++;
+ if (ncp->nc_flag & NCF_WHITE)
+ cnp->cn_flags |= ISWHITEOUT;
+ return (ENOENT);
+}
+
+/*
+ * Add an entry to the cache.
+ */
+void
+cache_enter(dvp, vp, cnp)
+ struct vnode *dvp;
+ struct vnode *vp;
+ struct componentname *cnp;
+{
+ register struct namecache *ncp;
+ register struct nchashhead *ncpp;
+
+ if (!doingcache)
+ return;
+
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ return;
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ if (vp) {
+ dvp->v_dd = vp;
+ dvp->v_ddid = vp->v_id;
+ } else {
+ dvp->v_dd = dvp;
+ dvp->v_ddid = 0;
+ }
+ return;
+ }
+ }
+
+ ncp = (struct namecache *)
+ malloc(sizeof *ncp + cnp->cn_namelen, M_CACHE, M_WAITOK);
+ bzero((char *)ncp, sizeof *ncp);
+ numcache++;
+ if (!vp) {
+ numneg++;
+ ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
+ } else if (vp->v_type == VDIR) {
+ vp->v_dd = dvp;
+ vp->v_ddid = dvp->v_id;
+ }
+
+ /*
+	 * Fill in cache info. If vp is NULL this is a "negative" cache
+	 * entry. For negative entries, we have to record whether it is a
+	 * whiteout; the whiteout flag is stored in the nc_flag field.
+ */
+ ncp->nc_vp = vp;
+ ncp->nc_dvp = dvp;
+ ncp->nc_nlen = cnp->cn_namelen;
+ bcopy(cnp->cn_nameptr, ncp->nc_name, ncp->nc_nlen);
+ ncpp = NCHHASH(dvp, cnp);
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+ if (LIST_EMPTY(&dvp->v_cache_src))
+ vhold(dvp);
+ LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
+ if (vp) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+ } else {
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ }
+ if (numneg*ncnegfactor > numcache) {
+ ncp = TAILQ_FIRST(&ncneg);
+ cache_zap(ncp);
+ }
+}
+
+/*
+ * Name cache initialization, from vfs_init() when we are booting
+ */
+void
+nchinit()
+{
+
+ TAILQ_INIT(&ncneg);
+ nchashtbl = hashinit(desiredvnodes*2, M_CACHE, &nchash);
+}
+
+/*
+ * Invalidate all entries to particular vnode.
+ *
+ * We actually just increment the v_id; that will do it. The stale entries
+ * will be purged by lookup as they get found. If the v_id wraps around, we
+ * need to ditch the entire cache, to avoid confusion. No valid vnode will
+ * ever have (v_id == 0).
+ */
+void
+cache_purge(vp)
+ struct vnode *vp;
+{
+ static u_long nextid;
+
+ while (!LIST_EMPTY(&vp->v_cache_src))
+ cache_zap(LIST_FIRST(&vp->v_cache_src));
+ while (!TAILQ_EMPTY(&vp->v_cache_dst))
+ cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+
+ nextid++;
+ while (nextid == vp->v_id || !nextid)
+		nextid++;
+ vp->v_id = nextid;
+ vp->v_dd = vp;
+ vp->v_ddid = 0;
+}
+
+/*
+ * Flush all entries referencing a particular filesystem.
+ *
+ * Since we need to check it anyway, we will flush all the invalid
+ * entries at the same time.
+ */
+void
+cache_purgevfs(mp)
+ struct mount *mp;
+{
+ struct nchashhead *ncpp;
+ struct namecache *ncp, *nnp;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
+ nnp = LIST_NEXT(ncp, nc_hash);
+ if (ncp->nc_dvp->v_mount == mp) {
+ cache_zap(ncp);
+ }
+ }
+ }
+}
+
+/*
+ * Perform canonical checks and cache lookup and pass on to filesystem
+ * through the vop_cachedlookup only if needed.
+ */
+
+int
+vfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct vnode *vdp;
+ struct vnode *pdp;
+ int lockparent;
+ int error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct ucred *cred = cnp->cn_cred;
+ int flags = cnp->cn_flags;
+ struct proc *p = cnp->cn_proc;
+ u_long vpid; /* capability number of vnode */
+
+ *vpp = NULL;
+ vdp = ap->a_dvp;
+ lockparent = flags & LOCKPARENT;
+
+ if (vdp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc);
+
+ if (error)
+ return (error);
+
+ error = cache_lookup(vdp, vpp, cnp);
+
+ if (!error)
+ return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp));
+
+ if (error == ENOENT)
+ return (error);
+
+ pdp = vdp;
+ vdp = *vpp;
+ vpid = vdp->v_id;
+ if (pdp == vdp) { /* lookup on "." */
+ VREF(vdp);
+ error = 0;
+ } else if (flags & ISDOTDOT) {
+ VOP_UNLOCK(pdp, 0, p);
+ error = vget(vdp, LK_EXCLUSIVE, p);
+ if (!error && lockparent && (flags & ISLASTCN))
+ error = vn_lock(pdp, LK_EXCLUSIVE, p);
+ } else {
+ error = vget(vdp, LK_EXCLUSIVE, p);
+ if (!lockparent || error || !(flags & ISLASTCN))
+ VOP_UNLOCK(pdp, 0, p);
+ }
+ /*
+ * Check that the capability number did not change
+ * while we were waiting for the lock.
+ */
+ if (!error) {
+ if (vpid == vdp->v_id)
+ return (0);
+ vput(vdp);
+ if (lockparent && pdp != vdp && (flags & ISLASTCN))
+ VOP_UNLOCK(pdp, 0, p);
+ }
+ error = vn_lock(pdp, LK_EXCLUSIVE, p);
+ if (error)
+ return (error);
+ return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp));
+}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
new file mode 100644
index 0000000..781508e
--- /dev/null
+++ b/sys/kern/vfs_cluster.c
@@ -0,0 +1,840 @@
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $
+ */
+
+#include "opt_debug_cluster.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+
+#if defined(CLUSTERDEBUG)
+#include <sys/sysctl.h>
+static int rcluster = 0;
+SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
+#endif
+
+static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");
+
+static struct cluster_save *
+ cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
+static struct buf *
+ cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, struct buf *fbp));
+
+extern vm_page_t bogus_page;
+
+extern int cluster_pbuf_freecnt;
+
+/*
+ * Maximum number of blocks for read-ahead.
+ */
+#define MAXRA 32
+
+/*
+ * This replaces bread.
+ */
+int
+cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
+ struct vnode *vp;
+ u_quad_t filesize;
+ daddr_t lblkno;
+ long size;
+ struct ucred *cred;
+ long totread;
+ int seqcount;
+ struct buf **bpp;
+{
+ struct buf *bp, *rbp, *reqbp;
+ daddr_t blkno, origblkno;
+ int error, num_ra;
+ int i;
+ int maxra, racluster;
+ long origtotread;
+
+ error = 0;
+ if (vp->v_maxio == 0)
+ vp->v_maxio = DFLTPHYS;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+	racluster = vp->v_maxio / size;
+ maxra = 2 * racluster + (totread / size);
+ if (maxra > MAXRA)
+ maxra = MAXRA;
+ if (maxra > nbuf/8)
+ maxra = nbuf/8;
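+	/*
+	 * Worked example (illustrative numbers): with v_maxio = 64k, an
+	 * 8k block size and totread = 32k, racluster = 8 and
+	 * maxra = 2 * 8 + 4 = 20, which stays under both MAXRA (32) and
+	 * nbuf/8 on a typically sized buffer cache.
+	 */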
+
+ /*
+ * get the requested block
+ */
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
+ origblkno = lblkno;
+ origtotread = totread;
+
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back-off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return 0;
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return 0;
+ } else {
+ int s;
+ struct buf *tbp;
+ bp->b_flags &= ~B_RAM;
+ /*
+ * We do the spl here so that there is no window
+ * between the incore and the b_usecount increment
+ * below. We opt to keep the spl out of the loop
+ * for efficiency.
+ */
+ s = splbio();
+			for (i = 1; i < maxra; i++) {
+ if (!(tbp = incore(vp, lblkno+i))) {
+ break;
+ }
+
+ /*
+ * Set another read-ahead mark so we know to check
+ * again.
+ */
+ if (((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ tbp->b_flags |= B_RAM;
+
+ if ((tbp->b_usecount < 1) &&
+ ((tbp->b_flags & B_BUSY) == 0) &&
+ (tbp->b_qindex == QUEUE_LRU)) {
+ TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
+ }
+ }
+ splx(s);
+ if (i >= maxra) {
+ return 0;
+ }
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ } else {
+ off_t firstread = bp->b_offset;
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("cluster_read: no buffer offset"));
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ if (totread > size) {
+ int nblks = 0;
+ int ncontigafter;
+ while (totread > 0) {
+ nblks++;
+ totread -= size;
+ }
+ if (nblks == 1)
+ goto single_block_read;
+ if (nblks > racluster)
+ nblks = racluster;
+
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontigafter, NULL);
+ if (error)
+ goto single_block_read;
+ if (blkno == -1)
+ goto single_block_read;
+ if (ncontigafter == 0)
+ goto single_block_read;
+ if (ncontigafter + 1 < nblks)
+ nblks = ncontigafter + 1;
+
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, bp);
+ lblkno += (bp->b_bufsize / size);
+ } else {
+single_block_read:
+ /*
+ * if it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ bp->b_flags |= B_READ | B_RAM;
+ lblkno += 1;
+ }
+ }
+
+ /*
+ * if we have been doing sequential I/O, then do some read-ahead
+ */
+ rbp = NULL;
+ if (seqcount && (lblkno < (origblkno + seqcount))) {
+ /*
+ * we now build the read-ahead buffer if it is desirable.
+ */
+ if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
+ !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
+ blkno != -1) {
+ int nblksread;
+ int ntoread = num_ra + 1;
+ nblksread = (origtotread + size - 1) / size;
+ if (seqcount < nblksread)
+ seqcount = nblksread;
+ if (seqcount < ntoread)
+ ntoread = seqcount;
+ if (num_ra) {
+ rbp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, ntoread, NULL);
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0);
+ rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
+ rbp->b_blkno = blkno;
+ }
+ }
+ }
+
+ /*
+ * handle the synchronous read
+ */
+ if (bp) {
+#if defined(CLUSTERDEBUG)
+ if (rcluster)
+ printf("S(%ld,%ld,%d) ",
+ (long)bp->b_lblkno, bp->b_bcount, seqcount);
+#endif
+ if ((bp->b_flags & B_CLUSTER) == 0)
+ vfs_busy_pages(bp, 0);
+ error = VOP_STRATEGY(vp, bp);
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+
+ /*
+ * and if we have read-aheads, do them too
+ */
+ if (rbp) {
+ if (error) {
+ rbp->b_flags &= ~(B_ASYNC | B_READ);
+ brelse(rbp);
+ } else if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~(B_ASYNC | B_READ);
+ bqrelse(rbp);
+ } else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster) {
+ if (bp)
+ printf("A+(%ld,%ld,%ld,%d) ",
+ (long)rbp->b_lblkno, rbp->b_bcount,
+ (long)(rbp->b_lblkno - origblkno),
+ seqcount);
+ else
+ printf("A(%ld,%ld,%ld,%d) ",
+ (long)rbp->b_lblkno, rbp->b_bcount,
+ (long)(rbp->b_lblkno - origblkno),
+ seqcount);
+ }
+#endif
+
+ if ((rbp->b_flags & B_CLUSTER) == 0)
+ vfs_busy_pages(rbp, 0);
+ (void) VOP_STRATEGY(vp, rbp);
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+ }
+ if (reqbp)
+ return (biowait(reqbp));
+ else
+ return (error);
+}
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
+static struct buf *
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
+ struct vnode *vp;
+ u_quad_t filesize;
+ daddr_t lbn;
+ daddr_t blkno;
+ long size;
+ int run;
+ struct buf *fbp;
+{
+ struct buf *bp, *tbp;
+ daddr_t bn;
+ int i, inc, j;
+
+ KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
+ ("cluster_rbuild: size %ld != filesize %ld\n",
+ size, vp->v_mount->mnt_stat.f_iosize));
+
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
+ --run;
+ }
+
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_flags |= B_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0);
+ if (tbp->b_flags & B_CACHE)
+ return tbp;
+ tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
+ }
+
+ tbp->b_blkno = blkno;
+	if ((tbp->b_flags & B_MALLOC) ||
+	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
+ return tbp;
+
+ bp = trypbuf(&cluster_pbuf_freecnt);
+ if (bp == 0)
+ return tbp;
+
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ bp->b_offset = tbp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+
+ if (vp->v_maxio == 0)
+ vp->v_maxio = DFLTPHYS;
+ inc = btodb(size);
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i != 0) {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > vp->v_maxio)
+ break;
+
+			if ((tbp = incore(vp, lbn + i)) != NULL) {
+ if (tbp->b_flags & B_BUSY)
+ break;
+
+ for (j = 0; j < tbp->b_npages; j++)
+ if (tbp->b_pages[j]->valid)
+ break;
+
+ if (j != tbp->b_npages)
+ break;
+
+ if (tbp->b_bcount != size)
+ break;
+ }
+
+ tbp = getblk(vp, lbn + i, size, 0, 0);
+
+ if ((tbp->b_flags & B_CACHE) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+			for (j = 0; j < tbp->b_npages; j++)
+ if (tbp->b_pages[j]->valid)
+ break;
+
+ if (j != tbp->b_npages) {
+ bqrelse(tbp);
+ break;
+ }
+
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+ tbp->b_flags |= B_READ | B_ASYNC;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ brelse(tbp);
+ break;
+ }
+ }
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ vm_page_io_start(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
+ }
+ bp->b_bcount += tbp->b_bcount;
+ bp->b_bufsize += tbp->b_bufsize;
+ }
+
+	for (j = 0; j < bp->b_npages; j++) {
+ if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
+ VM_PAGE_BITS_ALL)
+ bp->b_pages[j] = bogus_page;
+ }
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ return (bp);
+}
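+
+/*
+ * Shape of the result (a sketch): the pbuf aggregates the component
+ * buffers' pages in bp->b_pages, and pmap_qenter() maps them virtually
+ * contiguous behind bp->b_data, so a single strategy call covers the
+ * whole run:
+ *
+ *	bp->b_pages:  [tbp0 pages][tbp1 pages]...[tbpN pages]
+ *	bp->b_bcount: sum of the component buffers' b_bcount
+ *
+ * Pages that are already fully valid are swapped for bogus_page so the
+ * read cannot clobber good data; cluster_callback() undoes the mapping.
+ */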
+
+/*
+ * Cleanup after a clustered read or write.
+ * This is complicated by the fact that any of the buffers might have
+ * extra memory (if there were no empty buffer headers at allocbuf time)
+ * that we will need to shift around.
+ */
+void
+cluster_callback(bp)
+ struct buf *bp;
+{
+ struct buf *nbp, *tbp;
+ int error = 0;
+
+ /*
+	 * Must propagate errors to all the components.
+ */
+ if (bp->b_flags & B_ERROR)
+ error = bp->b_error;
+
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ /*
+ * Move memory from the large cluster buffer into the component
+ * buffers and mark IO as done on these.
+ */
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
+ if (error) {
+ tbp->b_flags |= B_ERROR;
+ tbp->b_error = error;
+ } else
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ biodone(tbp);
+ }
+ relpbuf(bp, &cluster_pbuf_freecnt);
+}
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Four cases:
+ * 1. Write is not sequential (write asynchronously)
+ * Write is sequential:
+ * 2. beginning of cluster - begin cluster
+ * 3. middle of a cluster - add to cluster
+ * 4. end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(bp, filesize)
+ struct buf *bp;
+ u_quad_t filesize;
+{
+ struct vnode *vp;
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
+
+ vp = bp->b_vp;
+ if (vp->v_maxio == 0)
+ vp->v_maxio = DFLTPHYS;
+ if (vp->v_type == VREG) {
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ } else {
+ async = 0;
+ lblocksize = bp->b_bufsize;
+ }
+ lbn = bp->b_lblkno;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
+
+ /* Initialize vnode to beginning of file. */
+ if (lbn == 0)
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = vp->v_maxio / lblocksize - 1;
+ if (vp->v_clen != 0) {
+ /*
+ * Next block is not sequential.
+ *
+ * If we are not writing at end of file, the process
+			 * has moved to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
+ */
+ cursize = vp->v_lastw - vp->v_cstart + 1;
+ if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+ if (!async)
+ cluster_wbuild(vp, lblocksize,
+ vp->v_cstart, cursize);
+ } else {
+ struct buf **bpp, **endbp;
+ struct cluster_save *buflist;
+
+ buflist = cluster_collectbufs(vp, bp);
+ endbp = &buflist->bs_children
+ [buflist->bs_nchildren - 1];
+ if (VOP_REALLOCBLKS(vp, buflist)) {
+ /*
+ * Failed, push the previous cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp < endbp; bpp++)
+ brelse(*bpp);
+ free(buflist, M_SEGMENT);
+ cluster_wbuild(vp, lblocksize,
+ vp->v_cstart, cursize);
+ } else {
+ /*
+ * Succeeded, keep building cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp <= endbp; bpp++)
+ bdwrite(*bpp);
+ free(buflist, M_SEGMENT);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+ return;
+ }
+ }
+ }
+ /*
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
+ */
+ if ((vp->v_type == VREG) &&
+ ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+ bp->b_blkno == -1)) {
+ bawrite(bp);
+ vp->v_clen = 0;
+ vp->v_lasta = bp->b_blkno;
+ vp->v_cstart = lbn + 1;
+ vp->v_lastw = lbn;
+ return;
+ }
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
+ vp->v_cstart = lbn + 1;
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
+ vp->v_cstart = lbn;
+ bdwrite(bp);
+ }
+ } else if (lbn == vp->v_cstart + vp->v_clen) {
+ /*
+ * At end of cluster, write it out.
+ */
+ bdwrite(bp);
+ cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ vp->v_clen = 0;
+ vp->v_cstart = lbn + 1;
+ } else
+ /*
+ * In the middle of a cluster, so just delay the I/O for now.
+ */
+ bdwrite(bp);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+}
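+
+/*
+ * Worked example (illustrative numbers, assuming the block map reports
+ * enough contiguity): with v_maxio = 64k and 8k blocks, maxclen = 7.
+ * Sequential writes of lbns 0..7 then proceed as:
+ *	lbn 0:     begins a cluster (v_cstart = 0, v_clen = 7), bdwrite
+ *	lbns 1..6: middle of the cluster, delayed with bdwrite
+ *	lbn 7:     lbn == v_cstart + v_clen, so the whole 64k cluster is
+ *	           pushed via cluster_wbuild and a new one starts at lbn 8.
+ */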
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * Gather up the delayed-write buffers spanning len blocks from
+ * start_lbn and issue them as one or more clustered writes; buffers
+ * that cannot join a cluster are pushed out individually.
+ */
+int
+cluster_wbuild(vp, size, start_lbn, len)
+ struct vnode *vp;
+ long size;
+ daddr_t start_lbn;
+ int len;
+{
+ struct buf *bp, *tbp;
+ int i, j, s;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+
+ if (vp->v_maxio == 0)
+ vp->v_maxio = DFLTPHYS;
+ while (len > 0) {
+ s = splbio();
+ if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
+ ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
+ ++start_lbn;
+ --len;
+ splx(s);
+ continue;
+ }
+ bremfree(tbp);
+ tbp->b_flags |= B_BUSY;
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
+
+ /*
+ * Extra memory in the buffer, punt on this buffer. XXX we could
+ * handle this in most cases, but we would have to push the extra
+ * memory down to after our max possible cluster size and then
+ * potentially pull it back up if the cluster was terminated
+ * prematurely--too much hassle.
+ */
+ if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ (len == 1) ||
+ ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+
+ /*
+		 * We got a pbuf to make the cluster in,
+		 * so initialize it.
+ */
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED) {
+ bp->b_wcred = tbp->b_wcred;
+ crhold(bp->b_wcred);
+ }
+
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ bp->b_offset = tbp->b_offset;
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
+ (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+ /*
+ * From this location in the file, scan forward to see
+ * if there are buffers with adjacent data that need to
+ * be written as well.
+ */
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) { /* If not the first buffer */
+ s = splbio();
+ /*
+ * If the adjacent data is not even in core it
+ * can't need to be written.
+ */
+ if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+ splx(s);
+ break;
+ }
+
+ /*
+ * If it IS in core, but has different
+ * characteristics, don't cluster with it.
+ */
+ if ((tbp->b_flags &
+ (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY |
+ B_DELWRI | B_NEEDCOMMIT))
+ != (B_DELWRI | B_CLUSTEROK |
+ (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) {
+ splx(s);
+ break;
+ }
+
+ if (tbp->b_wcred != bp->b_wcred) {
+ splx(s);
+ break;
+ }
+
+ /*
+ * Check that the combined cluster
+ * would make sense with regard to pages
+ * and would not be too large
+ */
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + (dbsize * i)) !=
+ tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) >
+ (vp->v_maxio / PAGE_SIZE))) {
+ splx(s);
+ break;
+ }
+ /*
+ * Ok, it's passed all the tests,
+ * so remove it from the free list
+ * and mark it busy. We will use it.
+ */
+ bremfree(tbp);
+ tbp->b_flags |= B_BUSY;
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
+ } /* end of code for non-first buffers only */
+ /* check for latent dependencies to be handled */
+ if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
+ bioops.io_start)
+ (*bioops.io_start)(tbp);
+ /*
+ * If the IO is via the VM then we do some
+ * special VM hackery. (yuck)
+ */
+ if (tbp->b_flags & B_VMIO) {
+ vm_page_t m;
+
+ if (i != 0) { /* if not first buffer */
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ if (m->flags & PG_BUSY)
+ goto finishcluster;
+ }
+ }
+
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ vm_page_io_start(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+
+ s = splbio();
+ --numdirtybuffers;
+ tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ tbp->b_flags |= B_ASYNC;
+ reassignbuf(tbp, tbp->b_vp); /* put on clean list */
+ ++tbp->b_vp->v_numoutput;
+ splx(s);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ finishcluster:
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *) bp->b_pages, bp->b_npages);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic(
+ "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
+ }
+ return totalwritten;
+}
+
+/*
+ * Collect together all the buffers in a cluster.
+ * Plus add one additional buffer.
+ */
+static struct cluster_save *
+cluster_collectbufs(vp, last_bp)
+ struct vnode *vp;
+ struct buf *last_bp;
+{
+ struct cluster_save *buflist;
+ struct buf *bp;
+ daddr_t lbn;
+ int i, len;
+
+ len = vp->v_lastw - vp->v_cstart + 1;
+ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+ M_SEGMENT, M_WAITOK);
+ buflist->bs_nchildren = 0;
+ buflist->bs_children = (struct buf **) (buflist + 1);
+ for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
+ (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+ buflist->bs_children[i] = bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ }
+ buflist->bs_children[i] = bp = last_bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ buflist->bs_nchildren = i + 1;
+ return (buflist);
+}
diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c
new file mode 100644
index 0000000..a7a830f
--- /dev/null
+++ b/sys/kern/vfs_conf.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94
+ * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $
+ */
+
+/*
+ * PURPOSE: This file abstracts the root mounting interface from
+ * the per file system semantics for handling mounts,
+ * the overall intent of which is to move the BSD
+ * internals dependence out of the FS code, both to
+ * make the FS code more portable and to free up some
+ * of the BSD internals so that they may more easily
+ * be changed.
+ *
+ * NOTE1: Code is single entry/single exit to aid debugging
+ * and conversion for kernel multithreading.
+ *
+ * NOTE2: Code notes lock state in headers on entry and exit
+ * as an aid to conversion for kernel multithreading
+ * on SMP reentrancy
+ */
+#include "opt_bootp.h"
+
+#include <sys/param.h> /* dev_t (types.h)*/
+#include <sys/kernel.h>
+#include <sys/systm.h> /* rootvp*/
+#include <sys/proc.h> /* curproc*/
+#include <sys/vnode.h> /* NULLVP*/
+#include <sys/mount.h> /* struct mount*/
+#include <sys/malloc.h> /* M_MOUNT*/
+
+/*
+ * GLOBALS
+ */
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
+
+/*
+ * These define the root filesystem, device, and root filesystem type.
+ */
+dev_t rootdevs[] = { NODEV, NODEV };
+char *rootdevnames[2];
+struct vnode *rootvnode;
+char *mountrootfsname;
+#ifdef BOOTP
+extern void bootpc_init __P((void));
+#endif
+
+/*
+ * vfs_init() will set maxvfsconf
+ * to the highest defined type number.
+ */
+int maxvfsconf;
+struct vfsconf *vfsconf;
+
+/*
+ * Common root mount code shared by all filesystems
+ */
+#define ROOTNAME "root_device"
+
+/*
+ * vfs_mountrootfs
+ *
+ * Common entry point for root mounts
+ *
+ * PARAMETERS:
+ * NONE
+ *
+ * RETURNS: 0 Success
+ * !0 error number (errno.h)
+ *
+ * LOCK STATE:
+ * ENTRY
+ * <no locks held>
+ * EXIT
+ * <no locks held>
+ *
+ * NOTES:
+ * This code is currently supported only for use for
+ * the FFS file system type. This is a matter of
+ * fixing the other file systems, not this code!
+ */
+static void
+vfs_mountrootfs(void *unused)
+{
+ struct mount *mp;
+ int i, err;
+ struct proc *p = curproc; /* XXX */
+ dev_t orootdev;
+
+#ifdef BOOTP
+ bootpc_init();
+#endif
+ /*
+ * New root mount structure
+ */
+ if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) {
+ printf("error %d: ", err);
+ panic("cannot mount root\n");
+		return;
+ }
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /*
+ * Attempt the mount
+ */
+ err = ENXIO;
+ orootdev = rootdev;
+ if (rootdevs[0] == NODEV)
+ rootdevs[0] = rootdev;
+ for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) {
+ if (rootdevs[i] == NODEV)
+ break;
+ rootdev = rootdevs[i];
+ if (rootdev != orootdev) {
+ printf("changing root device to %s\n", rootdevnames[i]);
+ orootdev = rootdev;
+ }
+ strncpy(mp->mnt_stat.f_mntfromname,
+ rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1);
+ err = VFS_MOUNT(mp, NULL, NULL, NULL, p);
+ if (err != ENXIO)
+ break;
+ }
+ if (err) {
+ /*
+ * XXX should ask the user for the name in some cases.
+ * Why do we call vfs_unbusy() here and not after ENXIO
+ * is returned above?
+ */
+ vfs_unbusy(mp, p);
+ /*
+ * free mount struct before failing
+ * (hardly worthwhile with the PANIC eh?)
+ */
+		free(mp, M_MOUNT);
+ printf("error %d: ", err);
+ panic("cannot mount root (2)\n");
+ return;
+ }
+
+ simple_lock(&mountlist_slock);
+
+ /*
+ * Add fs to list of mounted file systems
+ */
+ CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+
+ simple_unlock(&mountlist_slock);
+ vfs_unbusy(mp, p);
+
+ /* root mount, update system time from FS specific data*/
+ inittodr(mp->mnt_time);
+ return;
+}
+
+SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL)
+
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
new file mode 100644
index 0000000..b73b126
--- /dev/null
+++ b/sys/kern/vfs_default.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/poll.h>
+
+static int vop_nostrategy __P((struct vop_strategy_args *));
+
+/*
+ * This vnode table stores what we want to do if the filesystem doesn't
+ * implement a particular VOP.
+ *
+ * If there is no specific entry here, we will return EOPNOTSUPP.
+ *
+ */
+
+vop_t **default_vnodeop_p;
+static struct vnodeopv_entry_desc default_vnodeop_entries[] = {
+ { &vop_default_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_abortop_desc, (vop_t *) vop_null },
+ { &vop_advlock_desc, (vop_t *) vop_einval },
+ { &vop_bwrite_desc, (vop_t *) vop_stdbwrite },
+ { &vop_close_desc, (vop_t *) vop_null },
+ { &vop_fsync_desc, (vop_t *) vop_null },
+ { &vop_ioctl_desc, (vop_t *) vop_enotty },
+ { &vop_islocked_desc, (vop_t *) vop_noislocked },
+ { &vop_lease_desc, (vop_t *) vop_null },
+ { &vop_lock_desc, (vop_t *) vop_nolock },
+ { &vop_mmap_desc, (vop_t *) vop_einval },
+ { &vop_open_desc, (vop_t *) vop_null },
+ { &vop_pathconf_desc, (vop_t *) vop_einval },
+ { &vop_poll_desc, (vop_t *) vop_nopoll },
+ { &vop_readlink_desc, (vop_t *) vop_einval },
+ { &vop_reallocblks_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_revoke_desc, (vop_t *) vop_revoke },
+ { &vop_strategy_desc, (vop_t *) vop_nostrategy },
+ { &vop_unlock_desc, (vop_t *) vop_nounlock },
+ { NULL, NULL }
+};
+
+static struct vnodeopv_desc default_vnodeop_opv_desc =
+ { &default_vnodeop_p, default_vnodeop_entries };
+
+VNODEOP_SET(default_vnodeop_opv_desc);
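+
+/*
+ * A filesystem typically covers the operations it does not implement by
+ * routing its default entry through vop_defaultop, which dispatches into
+ * the table above.  Sketch for a hypothetical "foofs":
+ *
+ *	static struct vnodeopv_entry_desc foofs_vnodeop_entries[] = {
+ *		{ &vop_default_desc,	(vop_t *) vop_defaultop },
+ *		{ &vop_lookup_desc,	(vop_t *) foofs_lookup },
+ *		{ NULL, NULL }
+ *	};
+ */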
+
+int
+vop_eopnotsupp(struct vop_generic_args *ap)
+{
+ /*
+ printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
+ */
+
+ return (EOPNOTSUPP);
+}
+
+int
+vop_ebadf(struct vop_generic_args *ap)
+{
+
+ return (EBADF);
+}
+
+int
+vop_enotty(struct vop_generic_args *ap)
+{
+
+ return (ENOTTY);
+}
+
+int
+vop_einval(struct vop_generic_args *ap)
+{
+
+ return (EINVAL);
+}
+
+int
+vop_null(struct vop_generic_args *ap)
+{
+
+ return (0);
+}
+
+int
+vop_defaultop(struct vop_generic_args *ap)
+{
+
+ return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap));
+}
+
+int
+vop_panic(struct vop_generic_args *ap)
+{
+
+ panic("illegal vnode op called");
+}
+
+static int
+vop_nostrategy (struct vop_strategy_args *ap)
+{
+ printf("No strategy for buffer at %p\n", ap->a_bp);
+ vprint("", ap->a_vp);
+ vprint("", ap->a_bp->b_vp);
+ ap->a_bp->b_flags |= B_ERROR;
+ ap->a_bp->b_error = EOPNOTSUPP;
+ biodone(ap->a_bp);
+ return (EOPNOTSUPP);
+}
+
+int
+vop_stdpathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = LINK_MAX;
+ return (0);
+ case _PC_MAX_CANON:
+ *ap->a_retval = MAX_CANON;
+ return (0);
+ case _PC_MAX_INPUT:
+ *ap->a_retval = MAX_INPUT;
+ return (0);
+ case _PC_PIPE_BUF:
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ case _PC_CHOWN_RESTRICTED:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_VDISABLE:
+ *ap->a_retval = _POSIX_VDISABLE;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
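+
+/*
+ * Illustrative path: a pathconf(2) call such as
+ * pathconf("/some/file", _PC_LINK_MAX) ends up in VOP_PATHCONF on the
+ * file's vnode; a filesystem without its own handler can point that op
+ * at vop_stdpathconf and report the system-wide constants above.
+ */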
+
+/*
+ * Standard lock, unlock and islocked functions.
+ *
+ * These depend on the lock structure being the first element in the
+ * inode, i.e., vp->v_data points to the lock!
+ */
+int
+vop_stdlock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct lock *l;
+
+ if ((l = (struct lock *)ap->a_vp->v_data) == NULL) {
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
+ return 0;
+ }
+
+#ifndef DEBUG_LOCKS
+ return (lockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p));
+#else
+ return (debuglockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p,
+ "vop_stdlock", ap->a_vp->filename, ap->a_vp->line));
+#endif
+}
+
+int
+vop_stdunlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct lock *l;
+
+ if ((l = (struct lock *)ap->a_vp->v_data) == NULL) {
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
+ return 0;
+ }
+
+ return (lockmgr(l, ap->a_flags | LK_RELEASE, &ap->a_vp->v_interlock,
+ ap->a_p));
+}
+
+int
+vop_stdislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct lock *l;
+
+ if ((l = (struct lock *)ap->a_vp->v_data) == NULL)
+ return 0;
+
+ return (lockstatus(l));
+}
+
+/*
+ * Return true for select/poll.
+ */
+int
+vop_nopoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct proc *a_p;
+ } */ *ap;
+{
+ /*
+ * Return true for read/write. If the user asked for something
+ * special, return POLLNVAL, so that clients have a way of
+ * determining reliably whether or not the extended
+ * functionality is present without hard-coding knowledge
+ * of specific filesystem implementations.
+ */
+ if (ap->a_events & ~POLLSTANDARD)
+ return (POLLNVAL);
+
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Implement poll for local filesystems that support it.
+ */
+int
+vop_stdpoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct proc *a_p;
+ } */ *ap;
+{
+ if ((ap->a_events & ~POLLSTANDARD) == 0)
+ return (ap->a_events & (POLLRDNORM|POLLWRNORM));
+ return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
+}
+
+int
+vop_stdbwrite(ap)
+ struct vop_bwrite_args *ap;
+{
+ return (bwrite(ap->a_bp));
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
+ */
+int
+vop_sharedlock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+ /*
+ * This code cannot be used until all the non-locking filesystems
+ * (notably NFS) are converted to properly lock and release nodes.
+ * Also, certain vnode operations change the locking state within
+ * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+ * and symlink). Ideally these operations should not change the
+ * lock state, but should be changed to let the caller of the
+ * function unlock them. Otherwise all intermediate vnode layers
+ * (such as union, umapfs, etc) must catch these functions to do
+ * the necessary locking at their layer. Note that the inactive
+ * and lookup operations also change their lock state, but this
+ * cannot be avoided, so these two operations will always need
+ * to be handled in intermediate layers.
+ */
+ struct vnode *vp = ap->a_vp;
+ int vnflags, flags = ap->a_flags;
+
+ if (vp->v_vnlock == NULL) {
+ if ((flags & LK_TYPE_MASK) == LK_DRAIN)
+ return (0);
+ MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
+ M_VNODE, M_WAITOK);
+ lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE);
+ }
+ switch (flags & LK_TYPE_MASK) {
+ case LK_DRAIN:
+ vnflags = LK_DRAIN;
+ break;
+ case LK_EXCLUSIVE:
+#ifdef DEBUG_VFS_LOCKS
+ /*
+ * Normally, we use shared locks here, but that confuses
+ * the locking assertions.
+ */
+ vnflags = LK_EXCLUSIVE;
+ break;
+#endif
+ case LK_SHARED:
+ vnflags = LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUPGRADE:
+ case LK_DOWNGRADE:
+ return (0);
+ case LK_RELEASE:
+ default:
+ panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK);
+ }
+ if (flags & LK_INTERLOCK)
+ vnflags |= LK_INTERLOCK;
+#ifndef DEBUG_LOCKS
+ return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
+#else
+ return (debuglockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p,
+ "vop_sharedlock", vp->filename, vp->line));
+#endif
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
+ */
+int
+vop_nolock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+#ifdef notyet
+ /*
+ * This code cannot be used until all the non-locking filesystems
+ * (notably NFS) are converted to properly lock and release nodes.
+ * Also, certain vnode operations change the locking state within
+ * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+ * and symlink). Ideally these operations should not change the
+ * lock state, but should be changed to let the caller of the
+ * function unlock them. Otherwise all intermediate vnode layers
+ * (such as union, umapfs, etc) must catch these functions to do
+ * the necessary locking at their layer. Note that the inactive
+ * and lookup operations also change their lock state, but this
+ * cannot be avoided, so these two operations will always need
+ * to be handled in intermediate layers.
+ */
+ struct vnode *vp = ap->a_vp;
+ int vnflags, flags = ap->a_flags;
+
+ if (vp->v_vnlock == NULL) {
+ if ((flags & LK_TYPE_MASK) == LK_DRAIN)
+ return (0);
+ MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
+ M_VNODE, M_WAITOK);
+ lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE);
+ }
+ switch (flags & LK_TYPE_MASK) {
+ case LK_DRAIN:
+ vnflags = LK_DRAIN;
+ break;
+ case LK_EXCLUSIVE:
+ case LK_SHARED:
+ vnflags = LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUPGRADE:
+ case LK_DOWNGRADE:
+ return (0);
+ case LK_RELEASE:
+ default:
+ panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
+ }
+ if (flags & LK_INTERLOCK)
+ vnflags |= LK_INTERLOCK;
+	return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
+#else /* for now */
+ /*
+ * Since we are not using the lock manager, we must clear
+ * the interlock here.
+ */
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
+ return (0);
+#endif
+}
+
+/*
+ * Do the inverse of vop_nolock, handling the interlock in a compatible way.
+ */
+int
+vop_nounlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ if (vp->v_vnlock == NULL) {
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
+ return (0);
+ }
+ return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
+ &ap->a_vp->v_interlock, ap->a_p));
+}
+
+/*
+ * Return whether or not the node is locked.
+ */
+int
+vop_noislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ if (vp->v_vnlock == NULL)
+ return (0);
+ return (lockstatus(vp->v_vnlock));
+}
+
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
new file mode 100644
index 0000000..44b1698
--- /dev/null
+++ b/sys/kern/vfs_export.c
@@ -0,0 +1,2872 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/domain.h>
+#include <sys/dirent.h>
+#include <sys/vmmeter.h>
+
+#include <machine/limits.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+#include <vm/vm_zone.h>
+#include <sys/sysctl.h>
+
+#include <miscfs/specfs/specdev.h>
+
+static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+
+static void insmntque __P((struct vnode *vp, struct mount *mp));
+static void vclean __P((struct vnode *vp, int flags, struct proc *p));
+static void vfree __P((struct vnode *));
+static void vgonel __P((struct vnode *vp, struct proc *p));
+static unsigned long numvnodes;
+SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
+
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[9] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT,
+};
+
+static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
+struct tobefreelist vnode_tobefree_list;	/* vnodes awaiting release to the free list */
+
+static u_long wantfreevnodes = 25;
+SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+static u_long freevnodes = 0;
+SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
+
+int vfs_ioopt = 0;
+#ifdef ENABLE_VFS_IOOPT
+SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
+#endif
+
+struct mntlist mountlist; /* mounted filesystem list */
+struct simplelock mountlist_slock;
+struct simplelock mntvnode_slock;
+int nfs_mount_type = -1;
+#ifndef NULL_SIMPLELOCKS
+static struct simplelock mntid_slock;
+static struct simplelock vnode_free_list_slock;
+static struct simplelock spechash_slock;
+#endif
+struct nfs_public nfs_pub; /* publicly exported FS */
+static vm_zone_t vnode_zone;
+
+/*
+ * The workitem queue.
+ */
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+time_t syncdelay = 30;
+int rushjob; /* number of slots to run ASAP */
+
+static int syncer_delayno = 0;
+static long syncer_mask;
+LIST_HEAD(synclist, vnode);
+static struct synclist *syncer_workitem_pending;
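+
+/*
+ * The pending table acts as a wheel of one-second buckets: hashinit
+ * sizes it to a power of two and returns the mask, from which
+ * syncer_maxdelay is recomputed in vntblinit().  A dirty vnode is
+ * queued "delay" seconds ahead of the current hand, roughly (a sketch
+ * of the scheme, not a quote of the insertion code):
+ *
+ *	slot = (syncer_delayno + delay) & syncer_mask;
+ */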
+
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
+
+static void vfs_free_addrlist __P((struct netexport *nep));
+static int vfs_free_netcred __P((struct radix_node *rn, void *w));
+static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
+ struct export_args *argp));
+
+/*
+ * Initialize the vnode management data structures.
+ */
+void
+vntblinit()
+{
+
+ desiredvnodes = maxproc + cnt.v_page_count / 4;
+ simple_lock_init(&mntvnode_slock);
+ simple_lock_init(&mntid_slock);
+ simple_lock_init(&spechash_slock);
+ TAILQ_INIT(&vnode_free_list);
+ TAILQ_INIT(&vnode_tobefree_list);
+ simple_lock_init(&vnode_free_list_slock);
+ CIRCLEQ_INIT(&mountlist);
+ vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+}
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Interlock is not released on failure.
+ */
+int
+vfs_busy(mp, flags, interlkp, p)
+ struct mount *mp;
+ int flags;
+ struct simplelock *interlkp;
+ struct proc *p;
+{
+ int lkflags;
+
+ if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & LK_NOWAIT)
+ return (ENOENT);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ if (interlkp) {
+ simple_unlock(interlkp);
+ }
+ /*
+ * Since all busy locks are shared except the exclusive
+ * lock granted when unmounting, the only place that a
+ * wakeup needs to be done is at the release of the
+ * exclusive lock at the end of dounmount.
+ */
+ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
+ if (interlkp) {
+ simple_lock(interlkp);
+ }
+ return (ENOENT);
+ }
+ lkflags = LK_SHARED | LK_NOPAUSE;
+ if (interlkp)
+ lkflags |= LK_INTERLOCK;
+ if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
+ panic("vfs_busy: unexpected lock failure");
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(mp, p)
+ struct mount *mp;
+ struct proc *p;
+{
+
+ lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
+}
+
+/*
+ * Lookup a filesystem type, and if found allocate and initialize
+ * a mount structure for it.
+ *
+ * Devname is usually updated by mount(8) after booting.
+ */
+int
+vfs_rootmountalloc(fstypename, devname, mpp)
+ char *fstypename;
+ char *devname;
+ struct mount **mpp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vfsconf *vfsp;
+ struct mount *mp;
+
+ if (fstypename == NULL)
+ return (ENODEV);
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL)
+ return (ENODEV);
+ mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ LIST_INIT(&mp->mnt_vnodelist);
+ mp->mnt_vfc = vfsp;
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_flag = MNT_RDONLY;
+ mp->mnt_vnodecovered = NULLVP;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_stat.f_mntonname[0] = '/';
+ mp->mnt_stat.f_mntonname[1] = 0;
+ (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Find an appropriate filesystem to use for the root. If a filesystem
+ * has not been preselected, walk through the list of known filesystems
+ * trying those that have mountroot routines, and try them until one
+ * works or we have tried them all.
+ */
+#ifdef notdef /* XXX JH */
+int
+lite2_vfs_mountroot()
+{
+ struct vfsconf *vfsp;
+ extern int (*lite2_mountroot) __P((void));
+ int error;
+
+ if (lite2_mountroot != NULL)
+ return ((*lite2_mountroot)());
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ if (vfsp->vfc_mountroot == NULL)
+ continue;
+ if ((error = (*vfsp->vfc_mountroot)()) == 0)
+ return (0);
+ printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
+ }
+ return (ENODEV);
+}
+#endif
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid)
+ fsid_t *fsid;
+{
+ register struct mount *mp;
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
+ mp = mp->mnt_list.cqe_next) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ simple_unlock(&mountlist_slock);
+ return (mp);
+ }
+ }
+ simple_unlock(&mountlist_slock);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Get a new unique fsid
+ */
+void
+vfs_getnewfsid(mp)
+ struct mount *mp;
+{
+ static u_short xxxfs_mntid;
+
+ fsid_t tfsid;
+ int mtype;
+
+ simple_lock(&mntid_slock);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
+ mp->mnt_stat.f_fsid.val[1] = mtype;
+ if (xxxfs_mntid == 0)
+ ++xxxfs_mntid;
+ tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
+ tfsid.val[1] = mtype;
+ if (mountlist.cqh_first != (void *)&mountlist) {
+ while (vfs_getvfs(&tfsid)) {
+ tfsid.val[0]++;
+ xxxfs_mntid++;
+ }
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ simple_unlock(&mntid_slock);
+}
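+
+/*
+ * Sketch of the fsid scheme above: val[0] encodes makedev(nblkdev +
+ * vfc_typenum, xxxfs_mntid) and val[1] the type number; on a collision
+ * with an already mounted filesystem the loop bumps the candidate minor
+ * (and xxxfs_mntid) until vfs_getvfs() no longer finds a match.
+ */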
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(vap)
+ register struct vattr *vap;
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+extern vop_t **dead_vnodeop_p;
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(tag, mp, vops, vpp)
+ enum vtagtype tag;
+ struct mount *mp;
+ vop_t **vops;
+ struct vnode **vpp;
+{
+ int s;
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp, *tvp, *nvp;
+ vm_object_t object;
+ TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
+
+ /*
+ * We take the least recently used vnode from the freelist
+ * if we can get it and it has no cached pages, and no
+ * namecache entries are relative to it.
+ * Otherwise we allocate a new vnode
+ */
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ TAILQ_INIT(&vnode_tmp_list);
+
+ for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
+ nvp = TAILQ_NEXT(vp, v_freelist);
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ if (vp->v_flag & VAGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ vp->v_flag &= ~(VTBFREE|VAGE);
+ vp->v_flag |= VFREE;
+ if (vp->v_usecount)
+ panic("tobe free vnode isn't");
+ freevnodes++;
+ }
+
+ if (wantfreevnodes && freevnodes < wantfreevnodes) {
+ vp = NULL;
+ } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
+ /*
+ * XXX: this is only here to be backwards compatible
+ */
+ vp = NULL;
+ } else {
+ for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
+ nvp = TAILQ_NEXT(vp, v_freelist);
+ if (!simple_lock_try(&vp->v_interlock))
+ continue;
+ if (vp->v_usecount)
+ panic("free vnode isn't");
+
+ object = vp->v_object;
+ if (object && (object->resident_page_count || object->ref_count)) {
+ printf("object inconsistant state: RPC: %d, RC: %d\n",
+ object->resident_page_count, object->ref_count);
+ /* Don't recycle if it's caching some pages */
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
+ continue;
+ } else if (LIST_FIRST(&vp->v_cache_src)) {
+ /* Don't recycle if active in the namecache */
+ simple_unlock(&vp->v_interlock);
+ continue;
+ } else {
+ break;
+ }
+ }
+ }
+
+ for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
+ nvp = TAILQ_NEXT(tvp, v_freelist);
+ TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
+ TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
+ simple_unlock(&tvp->v_interlock);
+ }
+
+ if (vp) {
+ vp->v_flag |= VDOOMED;
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ freevnodes--;
+ simple_unlock(&vnode_free_list_slock);
+ cache_purge(vp);
+ vp->v_lease = NULL;
+ if (vp->v_type != VBAD) {
+ vgonel(vp, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+
+#ifdef INVARIANTS
+ {
+ int s;
+
+ if (vp->v_data)
+ panic("cleaned vnode isn't");
+ s = splbio();
+ if (vp->v_numoutput)
+ panic("Clean vnode has pending I/O's");
+ splx(s);
+ }
+#endif
+ vp->v_flag = 0;
+ vp->v_lastr = 0;
+ vp->v_lastw = 0;
+ vp->v_lasta = 0;
+ vp->v_cstart = 0;
+ vp->v_clen = 0;
+ vp->v_socket = 0;
+ vp->v_writecount = 0; /* XXX */
+ vp->v_maxio = 0;
+ } else {
+ simple_unlock(&vnode_free_list_slock);
+ vp = (struct vnode *) zalloc(vnode_zone);
+ bzero((char *) vp, sizeof *vp);
+ simple_lock_init(&vp->v_interlock);
+ vp->v_dd = vp;
+ cache_purge(vp);
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ numvnodes++;
+ }
+
+ TAILQ_INIT(&vp->v_cleanblkhd);
+ TAILQ_INIT(&vp->v_dirtyblkhd);
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ insmntque(vp, mp);
+ *vpp = vp;
+ vp->v_usecount = 1;
+ vp->v_data = 0;
+ splx(s);
+
+ vfs_object_create(vp, p, p->p_ucred);
+ return (0);
+}
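+
+/*
+ * Recycling policy in brief: with the default wantfreevnodes = 25, a
+ * fresh vnode is zalloc'ed until at least that many sit on the free
+ * list; past that point the least recently used free vnode that holds
+ * no cached pages and no namecache references is reclaimed and reused.
+ */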
+
+/*
+ * Move a vnode from one mount queue to another.
+ */
+static void
+insmntque(vp, mp)
+ register struct vnode *vp;
+ register struct mount *mp;
+{
+
+ simple_lock(&mntvnode_slock);
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ LIST_REMOVE(vp, v_mntvnodes);
+ /*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+ if ((vp->v_mount = mp) == NULL) {
+ simple_unlock(&mntvnode_slock);
+ return;
+ }
+ LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+ simple_unlock(&mntvnode_slock);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ */
+void
+vwakeup(bp)
+ register struct buf *bp;
+{
+ register struct vnode *vp;
+
+ bp->b_flags &= ~B_WRITEINPROG;
+ if ((vp = bp->b_vp)) {
+ vp->v_numoutput--;
+ if (vp->v_numoutput < 0)
+ panic("vwakeup: neg numoutput");
+ if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
+ vp->v_flag &= ~VBWAIT;
+ wakeup((caddr_t) &vp->v_numoutput);
+ }
+ }
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct proc *p;
+ int slpflag, slptimeo;
+{
+ register struct buf *bp;
+ struct buf *nbp, *blist;
+ int s, error;
+ vm_object_t object;
+
+ if (flags & V_SAVE) {
+ s = splbio();
+ while (vp->v_numoutput) {
+ vp->v_flag |= VBWAIT;
+ error = tsleep((caddr_t)&vp->v_numoutput,
+ slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ }
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ splx(s);
+ if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
+ return (error);
+ s = splbio();
+ if (vp->v_numoutput > 0 ||
+ !TAILQ_EMPTY(&vp->v_dirtyblkhd))
+ panic("vinvalbuf: dirty bufs");
+ }
+ splx(s);
+ }
+ s = splbio();
+ for (;;) {
+ blist = TAILQ_FIRST(&vp->v_cleanblkhd);
+ if (!blist)
+ blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
+ if (!blist)
+ break;
+
+ for (bp = blist; bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ error = tsleep((caddr_t) bp,
+ slpflag | (PRIBIO + 4), "vinvalbuf",
+ slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ break;
+ }
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it. Note that vfs_bio_awrite expects
+ * buffers to reside on a queue, while VOP_BWRITE and
+ * brelse do not.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+
+ if (bp->b_vp == vp) {
+ if (bp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_ASYNC);
+ VOP_BWRITE(bp);
+ }
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ (void) VOP_BWRITE(bp);
+ }
+ break;
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+ }
+
+ splx(s);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ simple_lock(&vp->v_interlock);
+ object = vp->v_object;
+ if (object != NULL) {
+ vm_object_page_remove(object, 0, 0,
+ (flags & V_SAVE) ? TRUE : FALSE);
+ }
+ simple_unlock(&vp->v_interlock);
+
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
+ panic("vinvalbuf: flush failed");
+ return (0);
+}
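+
+/*
+ * Illustrative sketch (not from the original code): a typical caller,
+ * such as a reclaim or revoke path, first writes dirty data back and
+ * then invalidates everything, roughly:
+ */
+#if 0
+	if ((error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0)) != 0)
+		return (error);
+	/* Pass flags = 0 instead to discard dirty data, e.g. forced unmount. */
+#endif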
+
+/*
+ * Truncate a file's buffer and pages to a specified length. This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(vp, cred, p, length, blksize)
+ register struct vnode *vp;
+ struct ucred *cred;
+ struct proc *p;
+ off_t length;
+ int blksize;
+{
+ register struct buf *bp;
+ struct buf *nbp;
+ int s, anyfreed;
+ int trunclbn;
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ trunclbn = (length + blksize - 1) / blksize;
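+	/*
+	 * For example, length = 1 with blksize = 512 yields trunclbn = 1,
+	 * so block 0 (which still holds the last valid byte) survives
+	 * while blocks 1 and up are discarded; an exact multiple,
+	 * length = 1024, yields trunclbn = 2.
+	 */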
+
+ s = splbio();
+restart:
+ anyfreed = 1;
+ for (;anyfreed;) {
+ anyfreed = 0;
+ for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO + 4, "vtrb1", 0);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI))) {
+ goto restart;
+ }
+ }
+ }
+
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO + 4, "vtrb2", 0);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0)) {
+ goto restart;
+ }
+ }
+ }
+ }
+
+ if (length > 0) {
+restartsync:
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO, "vtrb3", 0);
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ if (bp->b_vp == vp) {
+ bp->b_flags |= B_ASYNC;
+ } else {
+ bp->b_flags &= ~B_ASYNC;
+ }
+ VOP_BWRITE(bp);
+ }
+ goto restartsync;
+ }
+
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
+ }
+
+ splx(s);
+
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+ int s;
+
+ KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+ /*
+ * Insert onto list for new vnode.
+ */
+ s = splbio();
+ bp->b_xflags |= B_VNCLEAN;
+ bp->b_xflags &= ~B_VNDIRTY;
+ TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+ splx(s);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+ struct buflists *listheadp;
+ int s;
+
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp;
+ s = splbio();
+ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
+ if (bp->b_xflags & B_VNDIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
+ }
+ if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ vp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(vp, v_synclist);
+ }
+ splx(s);
+ bp->b_vp = (struct vnode *) 0;
+ vdrop(vp);
+}
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, metadata updates on
+ * filesystems mounted on block devices are delayed only about half
+ * the time that file data is delayed. Similarly, directory updates
+ * are more critical, so they are delayed only about a third of the
+ * time that file data is delayed. Thus, there are SYNCER_MAXDELAY
+ * queues that are processed round-robin at a rate of one each second
+ * (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
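+
+/*
+ * Worked example (illustrative): with syncer_mask = 31 and
+ * syncer_delayno = 29, a fifteen second delay lands in slot
+ * (29 + 15) & 31 = 12, i.e. the ring simply wraps around.
+ */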
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+ struct vnode *vp;
+ int delay;
+{
+ int s, slot;
+
+ s = splbio();
+
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+ vp->v_flag |= VONWORKLST;
+ splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+ struct synclist *slp;
+ struct vnode *vp;
+ long starttime;
+ int s;
+ struct proc *p = updateproc;
+
+ for (;;) {
+ starttime = time_second;
+
+ /*
+ * Push files whose dirty time has expired.
+ */
+ s = splbio();
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ splx(s);
+
+ while ((vp = LIST_FIRST(slp)) != NULL) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+ VOP_UNLOCK(vp, 0, p);
+ if (LIST_FIRST(slp) == vp) {
+ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+ vp->v_type != VBLK)
+ panic("sched_sync: fsync failed");
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ LIST_REMOVE(vp, v_synclist);
+ vn_syncer_add_to_worklist(vp, syncdelay);
+ }
+ }
+
+ /*
+ * Do soft update processing.
+ */
+ if (bioops.io_sync)
+ (*bioops.io_sync)(NULL);
+
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (time_second == starttime)
+ tsleep(&lbolt, PPAUSE, "syncer", 0);
+ }
+}
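+
+/*
+ * Illustrative sketch (hypothetical helper, not from the original
+ * code): a subsystem such as soft updates could speed the syncer up
+ * by bumping rushjob, which sched_sync() consumes one unit per pass:
+ */
+#if 0
+static void
+speedup_syncer_sketch(void)
+{
+	if (rushjob < syncdelay / 2)
+		rushjob += 1;	/* ask for one extra queue ASAP */
+}
+#endif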
+
+/*
+ * Associate a p-buffer with a vnode.
+ *
+ * Also sets the B_PAGING flag to indicate that the vnode is not fully
+ * associated with the buffer, i.e. the bp has not been linked into the
+ * vnode or ref-counted.
+ */
+void
+pbgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
+
+ bp->b_vp = vp;
+ bp->b_flags |= B_PAGING;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+}
+
+/*
+ * Disassociate a p-buffer from a vnode.
+ */
+void
+pbrelvp(bp)
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+
+#if !defined(MAX_PERF)
+ /* XXX REMOVE ME */
+ if (bp->b_vnbufs.tqe_next != NULL) {
+ panic(
+ "relpbuf(): b_vp was probably reassignbuf()d %p %x",
+ bp,
+ (int)bp->b_flags
+ );
+ }
+#endif
+ bp->b_vp = (struct vnode *) 0;
+ bp->b_flags &= ~B_PAGING;
+}
+
+void
+pbreassignbuf(bp, newvp)
+ struct buf *bp;
+ struct vnode *newvp;
+{
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_PAGING) == 0) {
+ panic(
+ "pbreassignbuf() on non phys bp %p",
+ bp
+ );
+ }
+#endif
+ bp->b_vp = newvp;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(bp, newvp)
+ register struct buf *bp;
+ register struct vnode *newvp;
+{
+ struct buflists *listheadp;
+ struct vnode *oldvp;
+ int delay;
+ int s;
+
+ if (newvp == NULL) {
+ printf("reassignbuf: NULL");
+ return;
+ }
+
+#if !defined(MAX_PERF)
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+#endif
+
+ s = splbio();
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
+ oldvp = bp->b_vp;
+ if (bp->b_xflags & B_VNDIRTY)
+ listheadp = &oldvp->v_dirtyblkhd;
+ else
+ listheadp = &oldvp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
+ vdrop(oldvp);
+ }
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ struct buf *tbp;
+
+ listheadp = &newvp->v_dirtyblkhd;
+ if ((newvp->v_flag & VONWORKLST) == 0) {
+ switch (newvp->v_type) {
+ case VDIR:
+ delay = syncdelay / 3;
+ break;
+ case VBLK:
+ if (newvp->v_specmountpoint != NULL) {
+ delay = syncdelay / 2;
+ break;
+ }
+ /* fall through */
+ default:
+ delay = syncdelay;
+ }
+ vn_syncer_add_to_worklist(newvp, delay);
+ }
+ bp->b_xflags |= B_VNDIRTY;
+ tbp = TAILQ_FIRST(listheadp);
+ if (tbp == NULL ||
+ (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) {
+ TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
+ } else {
+ if (bp->b_lblkno >= 0) {
+ struct buf *ttbp;
+ while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
+ (ttbp->b_lblkno < bp->b_lblkno)) {
+ tbp = ttbp;
+ }
+ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
+ } else {
+ TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
+ }
+ }
+ } else {
+ bp->b_xflags |= B_VNCLEAN;
+ TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+ if ((newvp->v_flag & VONWORKLST) &&
+ TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
+ newvp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(newvp, v_synclist);
+ }
+ }
+ bp->b_vp = newvp;
+ vhold(bp->b_vp);
+ splx(s);
+}
+
+/*
+ * Create a vnode for a block device.
+ * Used for mounting the root file system.
+ */
+int
+bdevvp(dev, vpp)
+ dev_t dev;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ struct vnode *nvp;
+ int error;
+
+ /* XXX 255 is for mfs. */
+ if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev ||
+ bdevsw[major(dev)] == NULL))) {
+ *vpp = NULLVP;
+ return (ENXIO);
+ }
+ error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+ if (error) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ vp = nvp;
+ vp->v_type = VBLK;
+ if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) {
+ vput(vp);
+ vp = nvp;
+ }
+ *vpp = vp;
+ return (0);
+}
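+
+/*
+ * Illustrative sketch (not from the original code): mounting the root
+ * filesystem typically obtains its device vnode this way:
+ */
+#if 0
+	if (bdevvp(rootdev, &rootvp))
+		panic("cannot get root device vnode");
+#endif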
+
+/*
+ * Check to see if the new vnode represents a special device
+ * for which we already have a vnode (either because of
+ * bdevvp() or because of a different vnode representing
+ * the same block device). If such an alias exists, deallocate
+ * the existing contents and return the aliased vnode. The
+ * caller is responsible for filling it with its new contents.
+ */
+struct vnode *
+checkalias(nvp, nvp_rdev, mp)
+ register struct vnode *nvp;
+ dev_t nvp_rdev;
+ struct mount *mp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp;
+ struct vnode **vpp;
+
+ if (nvp->v_type != VBLK && nvp->v_type != VCHR)
+ return (NULLVP);
+
+ vpp = &speclisth[SPECHASH(nvp_rdev)];
+loop:
+ simple_lock(&spechash_slock);
+ for (vp = *vpp; vp; vp = vp->v_specnext) {
+ if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ * Only alias active device nodes.
+ * Not sure why we don't re-use this like we do below.
+ */
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ simple_unlock(&spechash_slock);
+ vgonel(vp, p);
+ goto loop;
+ }
+ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
+ /*
+			 * It disappeared, and we may have slept.
+			 * Restart from the beginning.
+ */
+ simple_unlock(&spechash_slock);
+ goto loop;
+ }
+ break;
+ }
+ /*
+ * It would be a lot clearer what is going on here if
+ * this had been expressed as:
+	 *	if (vp != NULL && vp->v_tag == VT_NON)
+ * and the clauses had been swapped.
+ */
+ if (vp == NULL || vp->v_tag != VT_NON) {
+ /*
+ * Put the new vnode into the hash chain.
+ * and if there was an alias, connect them.
+ */
+ MALLOC(nvp->v_specinfo, struct specinfo *,
+ sizeof(struct specinfo), M_VNODE, M_WAITOK);
+ nvp->v_rdev = nvp_rdev;
+ nvp->v_hashchain = vpp;
+ nvp->v_specnext = *vpp;
+ nvp->v_specmountpoint = NULL;
+ simple_unlock(&spechash_slock);
+ *vpp = nvp;
+ if (vp != NULLVP) {
+ nvp->v_flag |= VALIASED;
+ vp->v_flag |= VALIASED;
+ vput(vp);
+ }
+ return (NULLVP);
+ }
+ /*
+	 * if (vp != NULL && vp->v_tag == VT_NON):
+	 * We have a vnode alias, but it is trashed.
+	 * Make it look like it's newly allocated (by getnewvnode()).
+ * The caller should use this instead.
+ */
+ simple_unlock(&spechash_slock);
+ VOP_UNLOCK(vp, 0, p);
+ simple_lock(&vp->v_interlock);
+ vclean(vp, 0, p);
+ vp->v_op = nvp->v_op;
+ vp->v_tag = nvp->v_tag;
+ nvp->v_type = VNON;
+ insmntque(vp, mp);
+ return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ */
+int
+vget(vp, flags, p)
+ register struct vnode *vp;
+ int flags;
+ struct proc *p;
+{
+ int error;
+
+ /*
+ * If the vnode is in the process of being cleaned out for
+ * another use, we wait for the cleaning to finish and then
+ * return failure. Cleaning is determined by checking that
+ * the VXLOCK flag is set.
+ */
+ if ((flags & LK_INTERLOCK) == 0) {
+ simple_lock(&vp->v_interlock);
+ }
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vget", 0);
+ return (ENOENT);
+ }
+
+ vp->v_usecount++;
+
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ if (flags & LK_TYPE_MASK) {
+ if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
+ /*
+ * must expand vrele here because we do not want
+ * to call VOP_INACTIVE if the reference count
+ * drops back to zero since it was never really
+ * active. We must remove it from the free list
+ * before sleeping so that multiple processes do
+ * not try to recycle it.
+ */
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ simple_unlock(&vp->v_interlock);
+ }
+ return (error);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
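+
+/*
+ * Illustrative sketch (not from the original code): a caller that asks
+ * vget() for a lock releases both the lock and the reference with
+ * vput() when done:
+ */
+#if 0
+	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
+		return (error);	/* e.g. ENOENT if it was being reclaimed */
+	/* ... operate on the locked, referenced vnode ... */
+	vput(vp);
+#endif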
+
+void
+vref(struct vnode *vp)
+{
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount++;
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(vp)
+ struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT(vp != NULL, ("vrele: null vp"));
+
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_usecount > 1) {
+
+ vp->v_usecount--;
+ simple_unlock(&vp->v_interlock);
+
+ return;
+ }
+
+ if (vp->v_usecount == 1) {
+
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ /*
+ * If we are doing a vput, the node is already locked, and we must
+ * call VOP_INACTIVE with the node locked. So, in the case of
+ * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
+ VOP_INACTIVE(vp, p);
+ }
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vrele: negative ref count", vp);
+ simple_unlock(&vp->v_interlock);
+#endif
+ panic("vrele: negative ref cnt");
+ }
+}
+
+void
+vput(vp)
+ struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT(vp != NULL, ("vput: null vp"));
+
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_usecount > 1) {
+
+ vp->v_usecount--;
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ return;
+
+ }
+
+ if (vp->v_usecount == 1) {
+
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ /*
+ * If we are doing a vput, the node is already locked, and we must
+ * call VOP_INACTIVE with the node locked. So, in the case of
+ * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */
+ simple_unlock(&vp->v_interlock);
+ VOP_INACTIVE(vp, p);
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vput: negative ref count", vp);
+#endif
+ panic("vput: negative ref cnt");
+ }
+}
+
+/*
+ * Somebody doesn't want the vnode recycled.
+ */
+void
+vhold(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ vp->v_holdcnt++;
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ splx(s);
+}
+
+/*
+ * One less who cares about this vnode.
+ */
+void
+vdrop(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ if (vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt");
+ vp->v_holdcnt--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ splx(s);
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If MNT_NOFORCE is specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If MNT_FORCE is specified, detach any active vnodes
+ * that are found.
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
+#endif
+
+int
+vflush(mp, skipvp, flags)
+ struct mount *mp;
+ struct vnode *skipvp;
+ int flags;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp, *nvp;
+ int busy = 0;
+
+ simple_lock(&mntvnode_slock);
+loop:
+ for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+ * Start over if it has (it won't be on the list anymore).
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = vp->v_mntvnodes.le_next;
+ /*
+ * Skip over a selected vnode.
+ */
+ if (vp == skipvp)
+ continue;
+
+ simple_lock(&vp->v_interlock);
+ /*
+		 * Skip over vnodes marked VSYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, only flush out regular file vnodes
+ * open for writing.
+ */
+ if ((flags & WRITECLOSE) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ */
+ if (vp->v_usecount == 0) {
+ simple_unlock(&mntvnode_slock);
+ vgonel(vp, p);
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+
+ /*
+ * If FORCECLOSE is set, forcibly close the vnode. For block
+ * or character devices, revert to an anonymous device. For
+ * all other files, just kill them.
+ */
+ if (flags & FORCECLOSE) {
+ simple_unlock(&mntvnode_slock);
+ if (vp->v_type != VBLK && vp->v_type != VCHR) {
+ vgonel(vp, p);
+ } else {
+ vclean(vp, 0, p);
+ vp->v_op = spec_vnodeop_p;
+ insmntque(vp, (struct mount *) 0);
+ }
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ simple_unlock(&vp->v_interlock);
+ busy++;
+ }
+ simple_unlock(&mntvnode_slock);
+ if (busy)
+ return (EBUSY);
+ return (0);
+}
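+
+/*
+ * Illustrative sketch (not from the original code): an unmount
+ * implementation typically tries a polite flush first:
+ */
+#if 0
+	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
+	if ((error = vflush(mp, NULLVP, flags)) != 0)
+		return (error);	/* EBUSY unless forced */
+#endif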
+
+/*
+ * Disassociate the underlying file system from a vnode.
+ */
+static void
+vclean(vp, flags, p)
+ struct vnode *vp;
+ int flags;
+ struct proc *p;
+{
+ int active;
+ vm_object_t obj;
+
+ /*
+ * Check to see if the vnode is in use. If so we have to reference it
+ * before we clean it out so that its count cannot fall to zero and
+ * generate a race against ourselves to recycle it.
+ */
+ if ((active = vp->v_usecount))
+ vp->v_usecount++;
+
+ /*
+ * Prevent the vnode from being recycled or brought into use while we
+ * clean it out.
+ */
+ if (vp->v_flag & VXLOCK)
+ panic("vclean: deadlock");
+ vp->v_flag |= VXLOCK;
+ /*
+ * Even if the count is zero, the VOP_INACTIVE routine may still
+ * have the object locked while it cleans it out. The VOP_LOCK
+ * ensures that the VOP_INACTIVE routine is done with its work.
+ * For active vnodes, it ensures that no other activity can
+ * occur while the underlying object is being cleaned out.
+ */
+ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ */
+ vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
+	if ((obj = vp->v_object) != NULL) {
+ if (obj->ref_count == 0) {
+ /*
+ * This is a normal way of shutting down the object/vnode
+ * association.
+ */
+ vm_object_terminate(obj);
+ } else {
+ /*
+ * Woe to the process that tries to page now :-).
+ */
+ vm_pager_deallocate(obj);
+ }
+ }
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed. Note that the
+ * VOP_INACTIVE will unlock the vnode.
+ */
+ if (active) {
+ if (flags & DOCLOSE)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
+ VOP_INACTIVE(vp, p);
+ } else {
+ /*
+ * Any other processes trying to obtain this lock must first
+ * wait for VXLOCK to clear, then call the new lock operation.
+ */
+ VOP_UNLOCK(vp, 0, p);
+ }
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, p))
+ panic("vclean: cannot reclaim");
+
+ if (active)
+ vrele(vp);
+
+ cache_purge(vp);
+ if (vp->v_vnlock) {
+#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
+#ifdef DIAGNOSTIC
+ if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
+ vprint("vclean: lock not drained", vp);
+#endif
+#endif
+ FREE(vp->v_vnlock, M_VNODE);
+ vp->v_vnlock = NULL;
+ }
+
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+
+ /*
+ * Done with purge, notify sleepers of the grim news.
+ */
+ vp->v_op = dead_vnodeop_p;
+ vn_pollgone(vp);
+ vp->v_tag = VT_NON;
+ vp->v_flag &= ~VXLOCK;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup((caddr_t) vp);
+ }
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+int
+vop_revoke(ap)
+ struct vop_revoke_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp, *vq;
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
+
+ vp = ap->a_vp;
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_flag & VALIASED) {
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
+ return (0);
+ }
+ /*
+ * Ensure that vp will not be vgone'd while we
+ * are eliminating its aliases.
+ */
+ vp->v_flag |= VXLOCK;
+ simple_unlock(&vp->v_interlock);
+ while (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type || vp == vq)
+ continue;
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ break;
+ }
+ if (vq == NULLVP) {
+ simple_unlock(&spechash_slock);
+ }
+ }
+ /*
+ * Remove the lock so that vgone below will
+ * really eliminate the vnode after which time
+ * vgone will awaken any sleepers.
+ */
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VXLOCK;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup(vp);
+ }
+ }
+ vgonel(vp, p);
+ return (0);
+}
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ * Release the passed interlock if the vnode will be recycled.
+ */
+int
+vrecycle(vp, inter_lkp, p)
+ struct vnode *vp;
+ struct simplelock *inter_lkp;
+ struct proc *p;
+{
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ if (inter_lkp) {
+ simple_unlock(inter_lkp);
+ }
+ vgonel(vp, p);
+ return (1);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(vp)
+ register struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ simple_lock(&vp->v_interlock);
+ vgonel(vp, p);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+static void
+vgonel(vp, p)
+ struct vnode *vp;
+ struct proc *p;
+{
+ int s;
+ struct vnode *vq;
+ struct vnode *vx;
+
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vgone", 0);
+ return;
+ }
+
+ /*
+ * Clean out the filesystem specific data.
+ */
+ vclean(vp, DOCLOSE, p);
+ simple_lock(&vp->v_interlock);
+
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ insmntque(vp, (struct mount *)0);
+ /*
+ * If special device, remove it from special device alias list
+ * if it is on one.
+ */
+ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
+ simple_lock(&spechash_slock);
+ if (*vp->v_hashchain == vp) {
+ *vp->v_hashchain = vp->v_specnext;
+ } else {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_specnext != vp)
+ continue;
+ vq->v_specnext = vp->v_specnext;
+ break;
+ }
+ if (vq == NULL)
+ panic("missing bdev");
+ }
+ if (vp->v_flag & VALIASED) {
+ vx = NULL;
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vx)
+ break;
+ vx = vq;
+ }
+ if (vx == NULL)
+ panic("missing alias");
+ if (vq == NULL)
+ vx->v_flag &= ~VALIASED;
+ vp->v_flag &= ~VALIASED;
+ }
+ simple_unlock(&spechash_slock);
+ FREE(vp->v_specinfo, M_VNODE);
+ vp->v_specinfo = NULL;
+ }
+
+ /*
+ * If it is on the freelist and not already at the head,
+ * move it to the head of the list. The test of the back
+ * pointer and the reference count of zero is because
+ * it will be removed from the free list by getnewvnode,
+ * but will not have its reference count incremented until
+ * after calling vgone. If the reference count were
+ * incremented first, vgone would (incorrectly) try to
+ * close the previous instance of the underlying object.
+ */
+ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VFREE) {
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ } else if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ freevnodes++;
+ } else
+ freevnodes++;
+ vp->v_flag |= VFREE;
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ simple_unlock(&vnode_free_list_slock);
+ splx(s);
+ }
+
+ vp->v_type = VBAD;
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Lookup a vnode by device number.
+ */
+int
+vfinddev(dev, type, vpp)
+ dev_t dev;
+ enum vtype type;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ int rc = 0;
+
+ simple_lock(&spechash_slock);
+ for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
+ if (dev != vp->v_rdev || type != vp->v_type)
+ continue;
+ *vpp = vp;
+ rc = 1;
+ break;
+ }
+ simple_unlock(&spechash_slock);
+ return (rc);
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(vp)
+ register struct vnode *vp;
+{
+ struct vnode *vq, *vnext;
+ int count;
+
+loop:
+ if ((vp->v_flag & VALIASED) == 0)
+ return (vp->v_usecount);
+ simple_lock(&spechash_slock);
+ for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
+ vnext = vq->v_specnext;
+ if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ */
+ if (vq->v_usecount == 0 && vq != vp) {
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ goto loop;
+ }
+ count += vq->v_usecount;
+ }
+ simple_unlock(&spechash_slock);
+ return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
+
+void
+vprint(label, vp)
+ char *label;
+ register struct vnode *vp;
+{
+ char buf[96];
+
+ if (label != NULL)
+ printf("%s: %p: ", label, (void *)vp);
+ else
+ printf("%p: ", (void *)vp);
+ printf("type %s, usecount %d, writecount %d, refcount %d,",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+ vp->v_holdcnt);
+ buf[0] = '\0';
+ if (vp->v_flag & VROOT)
+ strcat(buf, "|VROOT");
+ if (vp->v_flag & VTEXT)
+ strcat(buf, "|VTEXT");
+ if (vp->v_flag & VSYSTEM)
+ strcat(buf, "|VSYSTEM");
+ if (vp->v_flag & VXLOCK)
+ strcat(buf, "|VXLOCK");
+ if (vp->v_flag & VXWANT)
+ strcat(buf, "|VXWANT");
+ if (vp->v_flag & VBWAIT)
+ strcat(buf, "|VBWAIT");
+ if (vp->v_flag & VALIASED)
+ strcat(buf, "|VALIASED");
+ if (vp->v_flag & VDOOMED)
+ strcat(buf, "|VDOOMED");
+ if (vp->v_flag & VFREE)
+ strcat(buf, "|VFREE");
+ if (vp->v_flag & VOBJBUF)
+ strcat(buf, "|VOBJBUF");
+ if (buf[0] != '\0')
+ printf(" flags (%s)", &buf[1]);
+ if (vp->v_data == NULL) {
+ printf("\n");
+ } else {
+ printf("\n\t");
+ VOP_PRINT(vp);
+ }
+}
+
+#ifdef DDB
+#include <ddb/ddb.h>
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *vp;
+
+ printf("Locked vnodes\n");
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = vp->v_mntvnodes.le_next) {
+ if (VOP_ISLOCKED(vp))
+ vprint((char *)0, vp);
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+#ifdef notyet
+ /* all sysctl names at this level are at least name and field */
+ if (namelen < 2)
+ return (ENOTDIR); /* overloaded */
+ if (name[0] != VFS_GENERIC) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[0])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
+ oldp, oldlenp, newp, newlen, p));
+ }
+#endif
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
+ "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return error;
+ }
+ return 0;
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+
+#if 0
+#define KINFO_VNODESLOP 10
+/*
+ * Dump vnode list (via sysctl).
+ * Copyout address of vnode followed by vnode.
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode SYSCTL_HANDLER_ARGS
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *nvp, *vp;
+ int error;
+
+#define VPTRSZ sizeof (struct vnode *)
+#define VNODESZ sizeof (struct vnode)
+
+ req->lock = 0;
+ if (!req->oldptr) /* Make an estimate */
+ return (SYSCTL_OUT(req, 0,
+ (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+again:
+ simple_lock(&mntvnode_slock);
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = nvp) {
+ /*
+ * Check that the vp is still associated with
+ * this filesystem. RACE: could have been
+ * recycled onto the same filesystem.
+ */
+ if (vp->v_mount != mp) {
+ simple_unlock(&mntvnode_slock);
+ goto again;
+ }
+ nvp = vp->v_mntvnodes.le_next;
+ simple_unlock(&mntvnode_slock);
+ if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
+ (error = SYSCTL_OUT(req, vp, VNODESZ)))
+ return (error);
+ simple_lock(&mntvnode_slock);
+ }
+ simple_unlock(&mntvnode_slock);
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+
+ return (0);
+}
+#endif
+
+/*
+ * XXX
+ * Exporting the vnode list on large systems causes them to crash.
+ * Exporting the vnode list on medium systems causes sysctl to coredump.
+ */
+#if 0
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,vnode", "");
+#endif
+
+/*
+ * Check to see if a filesystem is mounted on a block device.
+ */
+int
+vfs_mountedon(vp)
+ struct vnode *vp;
+{
+ struct vnode *vq;
+ int error = 0;
+
+ if (vp->v_specmountpoint != NULL)
+ return (EBUSY);
+ if (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vq->v_specmountpoint != NULL) {
+ error = EBUSY;
+ break;
+ }
+ }
+ simple_unlock(&spechash_slock);
+ }
+ return (error);
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall()
+{
+ struct mount *mp, *nmp;
+ struct proc *p;
+ int error;
+
+ if (curproc != NULL)
+ p = curproc;
+ else
+ p = initproc; /* XXX XXX should this be proc0? */
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
+ nmp = mp->mnt_list.cqe_prev;
+ error = dounmount(mp, MNT_FORCE, p);
+ if (error) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+ }
+}
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by ufs_mount() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED)
+ return (EPERM);
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ return (0);
+ }
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
+ bzero((caddr_t) np, i);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ /*
+		 * It seems silly to initialize every AF when most are
+		 * not used; do so on demand here.
+ */
+ for (dom = domains; dom; dom = dom->dom_next)
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ dom->dom_rtattach((void **) &nep->ne_rtable[i],
+ dom->dom_rtoffset);
+ break;
+ }
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
+ np->netc_rnodes);
+ if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
+ error = EPERM;
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+vfs_free_netcred(rn, w)
+ struct radix_node *rn;
+ void *w;
+{
+ register struct radix_node_head *rnh = (struct radix_node_head *) w;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ free((caddr_t) rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(nep)
+ struct netexport *nep;
+{
+ register int i;
+ register struct radix_node_head *rnh;
+
+ for (i = 0; i <= AF_MAX; i++)
+ if ((rnh = nep->ne_rtable[i])) {
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
+ (caddr_t) rnh);
+ free((caddr_t) rnh, M_RTABLE);
+ nep->ne_rtable[i] = 0;
+ }
+}
+
+int
+vfs_export(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ return (error);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ return (error);
+ mp->mnt_flag |= MNT_EXPORTED;
+ }
+ return (0);
+}
+
+/*
+ * Set the publicly exported filesystem (WebNFS). Currently, only
+ * one public filesystem is possible in the spec (RFC 2054 and 2055).
+ */
+int
+vfs_setpublicfs(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, &rvp)))
+ return (error);
+
+ if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+ MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+struct netcred *
+vfs_export_lookup(mp, nep, nam)
+ register struct mount *mp;
+ struct netexport *nep;
+ struct sockaddr *nam;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)((caddr_t)saddr,
+ rnh);
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
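+
+/*
+ * Illustrative sketch (not from the original code): an NFS-style
+ * server validates a client address against the export list roughly
+ * like this:
+ */
+#if 0
+	np = vfs_export_lookup(mp, nep, nam);
+	if (np == NULL)
+		return (EACCES);	/* address not exported to */
+	anoncred = &np->netc_anon;	/* credentials to map to */
+#endif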
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *nvp;
+ struct vm_object *obj;
+ int anyio, tries;
+
+ tries = 5;
+loop:
+ anyio = 0;
+ for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
+
+ nvp = vp->v_mntvnodes.le_next;
+
+ if (vp->v_mount != mp) {
+ goto loop;
+ }
+
+ if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
+ continue;
+
+ if (flags != MNT_WAIT) {
+ obj = vp->v_object;
+ if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
+ continue;
+ if (VOP_ISLOCKED(vp))
+ continue;
+ }
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_object &&
+ (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
+ if (!vget(vp,
+ LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
+ if (vp->v_object) {
+ vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
+ anyio = 1;
+ }
+ vput(vp);
+ }
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+ }
+ if (anyio && (--tries > 0))
+ goto loop;
+}
+
+/*
+ * Create the VM object needed for VMIO and mmap support. This
+ * is done for all VREG files in the system. Some filesystems may also
+ * gain the additional metadata buffering capability of the VMIO code
+ * by making the device node VMIO mode as well.
+ *
+ * vp must be locked when vfs_object_create is called.
+ */
+int
+vfs_object_create(vp, p, cred)
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+{
+ struct vattr vat;
+ vm_object_t object;
+ int error = 0;
+
+ if ((vp->v_type != VREG) && (vp->v_type != VBLK))
+ return 0;
+
+retry:
+ if ((object = vp->v_object) == NULL) {
+ if (vp->v_type == VREG) {
+ if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
+ goto retn;
+ object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
+ } else if (major(vp->v_rdev) < nblkdev &&
+ bdevsw[major(vp->v_rdev)] != NULL) {
+ /*
+ * This simply allocates the biggest object possible
+ * for a VBLK vnode. This should be fixed, but doesn't
+ * cause any problems (yet).
+ */
+ object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
+ }
+ object->ref_count--;
+ vp->v_usecount--;
+ } else {
+ if (object->flags & OBJ_DEAD) {
+ VOP_UNLOCK(vp, 0, p);
+ tsleep(object, PVM, "vodead", 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ goto retry;
+ }
+ }
+
+ if (vp->v_object)
+ vp->v_flag |= VOBJBUF;
+
+retn:
+ return error;
+}
+
+static void
+vfree(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ }
+ if (vp->v_flag & VAGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ freevnodes++;
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_flag &= ~VAGE;
+ vp->v_flag |= VFREE;
+ splx(s);
+}
+
+void
+vbusy(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ } else {
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ freevnodes--;
+ }
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_flag &= ~(VFREE|VAGE);
+ splx(s);
+}
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+ struct vnode *vp;
+ struct proc *p;
+ short events;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+ * in available for the other process which
+		 * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo.vpi_revents;
+ vp->v_pollinfo.vpi_revents &= ~events;
+
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+ return events;
+ }
+ vp->v_pollinfo.vpi_events |= events;
+ selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+ return 0;
+}
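+
+/*
+ * Illustrative sketch (hypothetical filesystem, not from the original
+ * code): a VOP_POLL implementation can simply delegate here:
+ */
+#if 0
+static int
+examplefs_poll(ap)
+	struct vop_poll_args *ap;
+{
+	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
+}
+#endif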
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+ struct vnode *vp;
+ short events;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_events & events) {
+ /*
+ * We clear vpi_events so that we don't
+ * call selwakeup() twice if two events are
+ * posted before the polling process(es) is
+ * awakened. This also ensures that we take at
+ * most one selwakeup() if the polling process
+ * is no longer interested. However, it does
+ * mean that only one event can be noticed at
+ * a time. (Perhaps we should only clear those
+ * event bits which we note?) XXX
+ */
+ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
+ vp->v_pollinfo.vpi_revents |= events;
+ selwakeup(&vp->v_pollinfo.vpi_selinfo);
+ }
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+ struct vnode *vp;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_events) {
+ vp->v_pollinfo.vpi_events = 0;
+ selwakeup(&vp->v_pollinfo.vpi_selinfo);
+ }
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
+static int sync_fsync __P((struct vop_fsync_args *));
+static int sync_inactive __P((struct vop_inactive_args *));
+static int sync_reclaim __P((struct vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
+static int sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+static vop_t **sync_vnodeop_p;
+static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+ { &vop_default_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_close_desc, (vop_t *) sync_close }, /* close */
+ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
+ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
+ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
+ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
+ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
+ { &vop_print_desc, (vop_t *) sync_print }, /* print */
+ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
+ { NULL, NULL }
+};
+static struct vnodeopv_desc sync_vnodeop_opv_desc =
+ { &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+ struct mount *mp;
+{
+ struct vnode *vp;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+ mp->mnt_syncer = NULL;
+ return (error);
+ }
+ vp->v_type = VNON;
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+ mp->mnt_syncer = vp;
+ return (0);
+}
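+
+/*
+ * Worked example (illustrative): with syncer_maxdelay = 32, successive
+ * calls compute next = 16, 8, 24, 4, 12, 20, 28, ... so syncer vnodes
+ * spread across the wheel instead of clustering on one second.
+ */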
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ struct ucred *a_cred;
+ int a_waitfor;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ struct proc *p = ap->a_p;
+ int asyncflag;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ simple_lock(&mountlist_slock);
+ if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
+ simple_unlock(&mountlist_slock);
+ return (0);
+ }
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+ if (asyncflag)
+ mp->mnt_flag |= MNT_ASYNC;
+ vfs_unbusy(mp, p);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct proc *a_p;
+ } */ *ap;
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ */
+static int
+sync_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ vp->v_mount->mnt_syncer = NULL;
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ vp->v_flag &= ~VONWORKLST;
+ }
+
+ return (0);
+}
+
+/*
+ * Print out a syncer vnode.
+ */
+static int
+sync_print(ap)
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ printf("syncer vnode");
+ if (vp->v_vnlock != NULL)
+ lockmgr_printinfo(vp->v_vnlock);
+ printf("\n");
+ return (0);
+}
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..18e39d6
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,3034 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $
+ */
+
+/* For 4.3 integer FS ID compatibility */
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/linker.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+
+#include <miscfs/union/union.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
+#include <sys/sysctl.h>
+
+static int change_dir __P((struct nameidata *ndp, struct proc *p));
+static void checkdirs __P((struct vnode *olddp));
+static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t));
+static int setfmode __P((struct proc *, struct vnode *, int));
+static int setfflags __P((struct proc *, struct vnode *, int));
+static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int));
+static int usermount = 0; /* if 1, non-root can mount fs. */
+
+int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *));
+
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
+
+/*
+ * Virtual File System System Calls
+ */
+
+/*
+ * Mount a file system.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+mount(p, uap)
+ struct proc *p;
+ register struct mount_args /* {
+ syscallarg(char *) type;
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(caddr_t) data;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ int error, flag = 0, flag2 = 0;
+ struct vattr va;
+ u_long fstypenum;
+ struct nameidata nd;
+ char fstypename[MFSNAMELEN];
+
+ if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+	if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (SCARG(uap, flags) & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ flag2 = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((SCARG(uap, flags) & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ mp->mnt_flag |=
+ SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, p)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) ||
+ (va.va_uid != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+	if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) {
+		vput(vp);
+		return (error);
+	}
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ return (ENOTDIR);
+ }
+#ifdef COMPAT_43
+ /*
+ * Historically filesystem types were identified by number. If we
+ * get an integer for the filesystem type instead of a string, we
+ * check to see if it matches one of the historic filesystem types.
+ */
+ fstypenum = (uintptr_t)SCARG(uap, type);
+ if (fstypenum < maxvfsconf) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == fstypenum)
+ break;
+ if (vfsp == NULL) {
+ vput(vp);
+ return (ENODEV);
+ }
+ strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN);
+ } else
+#endif /* COMPAT_43 */
+ if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) {
+ vput(vp);
+ return (error);
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL) {
+ linker_file_t lf;
+
+ /* Refuse to load modules if securelevel raised */
+ if (securelevel > 0) {
+ vput(vp);
+ return EPERM;
+ }
+ /* Only load modules for root (very important!) */
+ if (error = suser(p->p_ucred, &p->p_acflag)) {
+ vput(vp);
+ return error;
+ }
+ error = linker_load_file(fstypename, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ return error;
+ }
+ lf->userrefs++;
+ /* lookup again, see if the VFS was loaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ return (ENODEV);
+ }
+ }
+ simple_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ simple_unlock(&vp->v_interlock);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ simple_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = (struct mount *)malloc((u_long)sizeof(struct mount),
+ M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = p->p_ucred->cr_uid;
+ VOP_UNLOCK(vp, 0, p);
+update:
+ /*
+ * Set the mount level flags.
+ */
+ if (SCARG(uap, flags) & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
+ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
+ MNT_NOSYMFOLLOW |
+ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
+ mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC |
+ MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
+ MNT_NOSYMFOLLOW |
+ MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
+ /*
+ * Mount the filesystem.
+ */
+ error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ vrele(vp);
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = flag2;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ simple_unlock(&vp->v_interlock);
+ simple_lock(&mountlist_slock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ simple_unlock(&mountlist_slock);
+ checkdirs(vp);
+ VOP_UNLOCK(vp, 0, p);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, p);
+ if (error = VFS_START(mp, 0, p))
+ vrele(vp);
+ } else {
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ simple_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, p);
+ free((caddr_t)mp, M_MOUNT);
+ vput(vp);
+ }
+ return (error);
+}
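+
+/*
+ * Illustrative userland sketch (not part of this file): mounting a UFS
+ * volume read-only with mount(2).  The device path and the ufs_args
+ * header shown are assumptions made for the example only.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/mount.h>
+ *	#include <ufs/ufs/ufsmount.h>
+ *	#include <string.h>
+ *	#include <err.h>
+ *
+ *	struct ufs_args args;
+ *
+ *	memset(&args, 0, sizeof(args));
+ *	args.fspec = "/dev/da0s1e";
+ *	if (mount("ufs", "/mnt", MNT_RDONLY, &args) == -1)
+ *		err(1, "mount");
+ */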
+
+/*
+ * Scan all active processes to see if any of them have a current
+ * or root directory onto which the new filesystem has just been
+ * mounted. If so, replace them with the new mount point.
+ */
+static void
+checkdirs(olddp)
+ struct vnode *olddp;
+{
+ struct filedesc *fdp;
+ struct vnode *newdp;
+ struct proc *p;
+
+ if (olddp->v_usecount == 1)
+ return;
+ if (VFS_ROOT(olddp->v_mountedhere, &newdp))
+ panic("mount: lost mount");
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ fdp = p->p_fd;
+ if (fdp->fd_cdir == olddp) {
+ vrele(fdp->fd_cdir);
+ VREF(newdp);
+ fdp->fd_cdir = newdp;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrele(fdp->fd_rdir);
+ VREF(newdp);
+ fdp->fd_rdir = newdp;
+ }
+ }
+ if (rootvnode == olddp) {
+ vrele(rootvnode);
+ VREF(newdp);
+ rootvnode = newdp;
+ }
+ vput(newdp);
+}
+
+/*
+ * Unmount a file system.
+ *
+ * Note: unmount takes a path to the vnode mounted on as its argument,
+ * not the special file (as it did historically).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+unmount(p, uap)
+ struct proc *p;
+ register struct unmount_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ mp = vp->v_mount;
+
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to unmount this filesystem.
+ */
+ if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+
+ /*
+ * Don't allow unmounting the root file system.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vput(vp);
+ return (EINVAL);
+ }
+
+ /*
+ * Must be the root of the filesystem
+ */
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ vput(vp);
+ return (dounmount(mp, SCARG(uap, flags), p));
+}
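+
+/*
+ * Illustrative userland sketch (not part of this file): per the note
+ * above, unmount(2) is passed the mount point, not the device; the
+ * path is a placeholder.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/mount.h>
+ *	#include <err.h>
+ *
+ *	if (unmount("/mnt", 0) == -1)
+ *		err(1, "unmount");
+ */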
+
+/*
+ * Do the actual file system unmount.
+ */
+int
+dounmount(mp, flags, p)
+ register struct mount *mp;
+ int flags;
+ struct proc *p;
+{
+ struct vnode *coveredvp;
+ int error;
+ int async_flag;
+
+ simple_lock(&mountlist_slock);
+ mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &=~ MNT_ASYNC;
+ cache_purgevfs(mp); /* remove cache entries for this file sys */
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
+ (flags & MNT_FORCE))
+ error = VFS_UNMOUNT(mp, flags, p);
+ simple_lock(&mountlist_slock);
+ if (error) {
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+ (void) vfs_allocate_syncvnode(mp);
+ mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
+ mp->mnt_flag |= async_flag;
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
+ &mountlist_slock, p);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup((caddr_t)mp);
+ return (error);
+ }
+ CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
+ coveredvp->v_mountedhere = (struct mount *)0;
+ vrele(coveredvp);
+ }
+ mp->mnt_vfc->vfc_refcount--;
+ if (mp->mnt_vnodelist.lh_first != NULL)
+ panic("unmount: dangling vnode");
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup((caddr_t)mp);
+ free((caddr_t)mp, M_MOUNT);
+ return (0);
+}
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/* ARGSUSED */
+int
+sync(p, uap)
+ struct proc *p;
+ struct sync_args *uap;
+{
+ register struct mount *mp, *nmp;
+ int asyncflag;
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT,
+ ((p != NULL) ? p->p_ucred : NOCRED), p);
+ mp->mnt_flag |= asyncflag;
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
+#ifdef DIAGNOSTIC
+ if (syncprt)
+ vfs_bufstats();
+#endif /* DIAGNOSTIC */
+#endif
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+/* ARGSUSED */
+int
+quotactl(p, uap)
+ struct proc *p;
+ register struct quotactl_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) cmd;
+ syscallarg(int) uid;
+ syscallarg(caddr_t) arg;
+ } */ *uap;
+{
+ register struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vrele(nd.ni_vp);
+ return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
+ SCARG(uap, arg), p));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+statfs(p, uap)
+ struct proc *p;
+ register struct statfs_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ register struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct nameidata nd;
+ struct statfs sb;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ sp = &mp->mnt_stat;
+ vrele(nd.ni_vp);
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
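+
+/*
+ * Illustrative userland sketch (not part of this file): statfs(2) on a
+ * path; as implemented above, a non-root caller gets f_fsid zeroed in
+ * the returned copy.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/mount.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *
+ *	struct statfs fs;
+ *
+ *	if (statfs("/usr", &fs) == -1)
+ *		err(1, "statfs");
+ *	printf("%s on %s type %s\n", fs.f_mntfromname, fs.f_mntonname,
+ *	    fs.f_fstypename);
+ */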
+
+/*
+ * Get filesystem statistics given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+fstatfs(p, uap)
+ struct proc *p;
+ register struct fstatfs_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ struct file *fp;
+ struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct statfs sb;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ mp = ((struct vnode *)fp->f_data)->v_mount;
+ sp = &mp->mnt_stat;
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+getfsstat(p, uap)
+ struct proc *p;
+ register struct getfsstat_args /* {
+ syscallarg(struct statfs *) buf;
+ syscallarg(long) bufsize;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct mount *mp, *nmp;
+ register struct statfs *sp;
+ caddr_t sfsp;
+ long count, maxcount, error;
+
+ maxcount = SCARG(uap, bufsize) / sizeof(struct statfs);
+ sfsp = (caddr_t)SCARG(uap, buf);
+ count = 0;
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+			/*
+			 * Refresh the fsstat cache unless MNT_NOWAIT or
+			 * MNT_LAZY is specified and MNT_WAIT is not; that
+			 * is, MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY.
+			 */
+ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (SCARG(uap, flags) & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp, p))) {
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ continue;
+ }
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = copyout((caddr_t)sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ sfsp += sizeof(*sp);
+ }
+ count++;
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+ if (sfsp && count > maxcount)
+ p->p_retval[0] = maxcount;
+ else
+ p->p_retval[0] = count;
+ return (0);
+}
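+
+/*
+ * Illustrative userland sketch (not part of this file; error handling
+ * omitted): enumerating mounts with getfsstat(2).  A NULL buffer just
+ * returns the mount count, and MNT_NOWAIT skips the statistics
+ * refresh, per the logic above.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/ucred.h>
+ *	#include <sys/mount.h>
+ *	#include <stdio.h>
+ *	#include <stdlib.h>
+ *
+ *	struct statfs *fs;
+ *	int i, n;
+ *
+ *	n = getfsstat(NULL, 0, MNT_NOWAIT);
+ *	fs = malloc(n * sizeof(*fs));
+ *	n = getfsstat(fs, n * sizeof(*fs), MNT_NOWAIT);
+ *	for (i = 0; i < n; i++)
+ *		printf("%s\n", fs[i].f_mntonname);
+ */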
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fchdir(p, uap)
+ struct proc *p;
+ struct fchdir_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct vnode *vp, *tdp;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(fdp, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VREF(vp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0, 0, p))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, p);
+ if (error)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = vp;
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chdir(p, uap)
+ struct proc *p;
+ struct chdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = nd.ni_vp;
+ return (0);
+}
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chroot(p, uap)
+ struct proc *p;
+ struct chroot_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ vrele(fdp->fd_rdir);
+ fdp->fd_rdir = nd.ni_vp;
+ return (0);
+}
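+
+/*
+ * Illustrative userland sketch (not part of this file): the usual
+ * chroot(2) idiom.  chroot does not change the working directory, so
+ * callers chdir(2) to the new root as well; the path is a placeholder.
+ *
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (chdir("/var/jail") == -1 || chroot("/var/jail") == -1)
+ *		err(1, "chroot");
+ */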
+
+/*
+ * Common routine for chroot and chdir.
+ */
+static int
+change_dir(ndp, p)
+ register struct nameidata *ndp;
+ struct proc *p;
+{
+ struct vnode *vp;
+ int error;
+
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ vput(vp);
+ else
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+open(p, uap)
+ struct proc *p;
+ register struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ register struct vnode *vp;
+ int cmode, flags, oflags;
+ struct file *nfp;
+ int type, indx, error;
+ struct flock lf;
+ struct nameidata nd;
+
+ oflags = SCARG(uap, flags);
+ if ((oflags & O_ACCMODE) == O_ACCMODE)
+ return (EINVAL);
+ flags = FFLAGS(oflags);
+ error = falloc(p, &nfp, &indx);
+ if (error)
+ return (error);
+ fp = nfp;
+ cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ p->p_dupfd = -indx - 1; /* XXX check for fdopen */
+ error = vn_open(&nd, flags, cmode);
+ if (error) {
+ ffree(fp);
+ if ((error == ENODEV || error == ENXIO) &&
+ p->p_dupfd >= 0 && /* XXX from fdopen */
+ (error =
+ dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) {
+ p->p_retval[0] = indx;
+ return (0);
+ }
+ if (error == ERESTART)
+ error = EINTR;
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ p->p_dupfd = 0;
+ vp = nd.ni_vp;
+
+ fp->f_flag = flags & FMASK;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ fp->f_ops = &vnops;
+ fp->f_data = (caddr_t)vp;
+ if (flags & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (flags & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((flags & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ VOP_UNLOCK(vp, 0, p);
+ if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) {
+ (void) vn_close(vp, fp->f_flag, fp->f_cred, p);
+ ffree(fp);
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ fp->f_flag |= FHASLOCK;
+ }
+ if ((vp->v_type == VREG) && (vp->v_object == NULL))
+ vfs_object_create(vp, p, p->p_ucred);
+ VOP_UNLOCK(vp, 0, p);
+ p->p_retval[0] = indx;
+ return (0);
+}
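+
+/*
+ * Illustrative userland sketch (not part of this file): taking an
+ * advisory lock at open time with O_EXLOCK, handled above.  With
+ * O_NONBLOCK the open fails rather than sleeping if the lock is
+ * contested; the path is a placeholder.
+ *
+ *	#include <fcntl.h>
+ *	#include <err.h>
+ *
+ *	int fd;
+ *
+ *	fd = open("/var/run/example.pid",
+ *	    O_RDWR | O_CREAT | O_EXLOCK | O_NONBLOCK, 0644);
+ *	if (fd == -1)
+ *		err(1, "open");
+ */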
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(p, uap)
+ struct proc *p;
+ register struct ocreat_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, mode) = SCARG(uap, mode);
+ SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC;
+ return (open(p, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+/* ARGSUSED */
+int
+mknod(p, uap)
+ struct proc *p;
+ register struct mknod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ syscallarg(int) dev;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ int whiteout = 0;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL)
+ error = EEXIST;
+ else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ vattr.va_rdev = SCARG(uap, dev);
+ whiteout = 0;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (whiteout) {
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ if (error)
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ } else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ }
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp)
+ vrele(vp);
+ }
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod");
+ return (error);
+}
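+
+/*
+ * Illustrative userland sketch (not part of this file): creating a
+ * character device node with mknod(2); as enforced above, this is a
+ * superuser operation.  The path and device numbers are placeholders.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/stat.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (mknod("/dev/example0", S_IFCHR | 0600, makedev(99, 0)) == -1)
+ *		err(1, "mknod");
+ */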
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkfifo(p, uap)
+ struct proc *p;
+ register struct mkfifo_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+link(p, uap)
+ struct proc *p;
+ register struct link_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p);
+ error = namei(&nd);
+ if (!error) {
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred,
+ LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ }
+ vrele(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "link");
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+symlink(p, uap)
+ struct proc *p;
+ register struct symlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct vattr vattr;
+ char *path;
+ int error;
+ struct nameidata nd;
+
+ path = zalloc(namei_zone);
+ if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL))
+ goto out;
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p);
+ if (error = namei(&nd))
+ goto out;
+ if (nd.ni_vp) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+ vput(nd.ni_dvp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+out:
+ zfree(namei_zone, path);
+ return (error);
+}
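+
+/*
+ * Illustrative userland sketch (not part of this file): symlink(2)
+ * stores the target string verbatim, so the target need not exist;
+ * the paths are placeholders.
+ *
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (symlink("../share/doc", "/usr/local/doc") == -1)
+ *		err(1, "symlink");
+ */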
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+/* ARGSUSED */
+int
+undelete(p, uap)
+ struct proc *p;
+ register struct undelete_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ error = namei(&nd);
+ if (error)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE))
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete");
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+unlink(p, uap)
+ struct proc *p;
+ struct unlink_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+ }
+
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULLVP)
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink");
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+lseek(p, uap)
+ struct proc *p;
+ register struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct ucred *cred = p->p_ucred;
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct vattr vattr;
+ int error;
+
+ if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE)
+ return (ESPIPE);
+ switch (SCARG(uap, whence)) {
+ case L_INCR:
+ fp->f_offset += SCARG(uap, offset);
+ break;
+ case L_XTND:
+ error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p);
+ if (error)
+ return (error);
+ fp->f_offset = SCARG(uap, offset) + vattr.va_size;
+ break;
+ case L_SET:
+ fp->f_offset = SCARG(uap, offset);
+ break;
+ default:
+ return (EINVAL);
+ }
+ *(off_t *)(p->p_retval) = fp->f_offset;
+ return (0);
+}
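+
+/*
+ * Illustrative userland sketch (not part of this file): L_SET, L_INCR
+ * and L_XTND above correspond to SEEK_SET, SEEK_CUR and SEEK_END.
+ * Seeking to the end of an already-open descriptor fd is a common way
+ * to learn a file's size.
+ *
+ *	#include <sys/types.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	off_t size;
+ *
+ *	if ((size = lseek(fd, (off_t)0, SEEK_END)) == -1)
+ *		err(1, "lseek");
+ */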
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(p, uap)
+ struct proc *p;
+ register struct olseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ nuap;
+ int error;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, offset) = SCARG(uap, offset);
+ SCARG(&nuap, whence) = SCARG(uap, whence);
+ error = lseek(p, &nuap);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+access(p, uap)
+ struct proc *p;
+ register struct access_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct ucred *cred = p->p_ucred;
+ register struct vnode *vp;
+ int error, flags, t_gid, t_uid;
+ struct nameidata nd;
+
+ t_uid = cred->cr_uid;
+ t_gid = cred->cr_groups[0];
+ cred->cr_uid = p->p_cred->p_ruid;
+ cred->cr_groups[0] = p->p_cred->p_rgid;
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ goto out1;
+ vp = nd.ni_vp;
+
+ /* Flags == 0 means only check for existence. */
+ if (SCARG(uap, flags)) {
+ flags = 0;
+ if (SCARG(uap, flags) & R_OK)
+ flags |= VREAD;
+ if (SCARG(uap, flags) & W_OK)
+ flags |= VWRITE;
+ if (SCARG(uap, flags) & X_OK)
+ flags |= VEXEC;
+ if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, flags, cred, p);
+ }
+ vput(vp);
+out1:
+ cred->cr_uid = t_uid;
+ cred->cr_groups[0] = t_gid;
+ return (error);
+}
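+
+/*
+ * Illustrative userland sketch (not part of this file): access(2)
+ * checks against the real uid/gid (hence the credential swap above),
+ * so a setuid program can ask whether its invoking user may read a
+ * file.  The answer is advisory and can race with the later open;
+ * the path is a placeholder.
+ *
+ *	#include <unistd.h>
+ *	#include <fcntl.h>
+ *
+ *	int fd = -1;
+ *
+ *	if (access("/etc/example.conf", R_OK) == 0)
+ *		fd = open("/etc/example.conf", O_RDONLY);
+ */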
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+ostat(p, uap)
+ struct proc *p;
+ register struct ostat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+olstat(p, uap)
+ struct proc *p;
+ register struct olstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atime = st->st_atime;
+ ost->st_mtime = st->st_mtime;
+ ost->st_ctime = st->st_ctime;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(p, uap)
+ struct proc *p;
+ register struct stat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(p, uap)
+ struct proc *p;
+ register struct lstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atimespec = sb->st_atimespec;
+ nsb->st_mtimespec = sb->st_mtimespec;
+ nsb->st_ctimespec = sb->st_ctimespec;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+}
+
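+/*
+ * Get file status; this version follows links and returns the larger
+ * nstat structure with full timespec resolution.
+ */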
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(p, uap)
+ struct proc *p;
+ register struct nstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(p, uap)
+ struct proc *p;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(p, uap)
+ struct proc *p;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(p, uap)
+ struct proc *p;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, p->p_ucred);
+ }
+ vput(vp);
+	if (error == 0)
+		p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
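+
+/*
+ * Illustrative userland sketch (not part of this file): readlink(2)
+ * returns the number of bytes placed in the buffer and, as above,
+ * does not NUL-terminate it, so the caller must; the path is a
+ * placeholder.
+ *
+ *	#include <sys/param.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	char buf[MAXPATHLEN];
+ *	int n;
+ *
+ *	if ((n = readlink("/sys", buf, sizeof(buf) - 1)) == -1)
+ *		err(1, "readlink");
+ *	buf[n] = '\0';
+ */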
+
+static int
+setfflags(p, vp, flags)
+ struct proc *p;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+chflags(p, uap)
+ struct proc *p;
+ register struct chflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfflags(p, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+fchflags(p, uap)
+ struct proc *p;
+ register struct fchflags_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags));
+}
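+
+/*
+ * Illustrative userland sketch (not part of this file): marking a file
+ * append-only with chflags(2); the path is a placeholder and the flag
+ * names come from <sys/stat.h>.
+ *
+ *	#include <sys/stat.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (chflags("/var/log/example", UF_APPEND) == -1)
+ *		err(1, "chflags");
+ */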
+
+static int
+setfmode(p, vp, mode)
+ struct proc *p;
+ struct vnode *vp;
+ int mode;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+chmod(p, uap)
+ struct proc *p;
+ register struct chmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfmode(p, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change mode of a file given a path name (don't follow links).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+lchmod(p, uap)
+ struct proc *p;
+ register struct lchmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfmode(p, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+fchmod(p, uap)
+ struct proc *p;
+ register struct fchmod_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode));
+}
+
+static int
+setfown(p, vp, uid, gid)
+ struct proc *p;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+chown(p, uap)
+ struct proc *p;
+ register struct chown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+
+ return (error);
+}
+
+/*
+ * Set ownership given a path name; do not follow symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+lchown(p, uap)
+ struct proc *p;
+ register struct lchown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(p, uap)
+ struct proc *p;
+ register struct fchown_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfown(p, (struct vnode *)fp->f_data,
+ SCARG(uap, uid), SCARG(uap, gid));
+}
+
+static int
+setutimes(p, vp, tv, nullflag)
+ struct proc *p;
+ struct vnode *vp;
+ struct timeval *tv;
+ int nullflag;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_atime.tv_sec = tv[0].tv_sec;
+ vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+ vattr.va_mtime.tv_sec = tv[1].tv_sec;
+ vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(p, uap)
+ struct proc *p;
+ register struct utimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ int error;
+ struct nameidata nd;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setutimes(p, nd.ni_vp, tv, nullflag);
+ vrele(nd.ni_vp);
+ return (error);
+}
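+
+/*
+ * Illustrative userland sketch (not part of this file): a NULL tptr
+ * sets both timestamps to the current time, and the VA_UTIMES_NULL
+ * flag set above lets filesystems apply the relaxed owner-or-writable
+ * permission check; the path is a placeholder.
+ *
+ *	#include <sys/time.h>
+ *	#include <err.h>
+ *
+ *	if (utimes("/tmp/example", NULL) == -1)
+ *		err(1, "utimes");
+ */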
+
+/*
+ * Set the access and modification times of a file; do not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(p, uap)
+ struct proc *p;
+ register struct lutimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ int error;
+ struct nameidata nd;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+
+ error = setutimes(p, nd.ni_vp, tv, nullflag);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(p, uap)
+ struct proc *p;
+ register struct futimes_args /* {
+		syscallarg(int) fd;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ struct file *fp;
+ int error;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+truncate(p, uap)
+ struct proc *p;
+ register struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ }
+ vput(vp);
+ return (error);
+}
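+
+/*
+ * Illustrative userland sketch (not part of this file): truncate(2)
+ * both shrinks and extends; extending zero-fills.  The path is a
+ * placeholder.
+ *
+ *	#include <sys/types.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (truncate("/tmp/example.log", (off_t)0) == -1)
+ *		err(1, "truncate");
+ */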
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+ftruncate(p, uap)
+ struct proc *p;
+ register struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FWRITE) == 0)
+ return (EINVAL);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred, p);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+otruncate(p, uap)
+ struct proc *p;
+ register struct otruncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (truncate(p, &nuap));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+oftruncate(p, uap)
+ struct proc *p;
+ register struct oftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (ftruncate(p, &nuap));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fsync(p, uap)
+ struct proc *p;
+ struct fsync_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_object)
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 &&
+ vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
+ bioops.io_fsync)
+ error = (*bioops.io_fsync)(vp);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
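+
+/*
+ * Illustrative userland sketch (not part of this file): the usual
+ * write-then-fsync(2) ordering to force data to stable storage; fd,
+ * buf and len stand for an already-open descriptor and its pending
+ * data.
+ *
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (write(fd, buf, len) == -1 || fsync(fd) == -1)
+ *		err(1, "write/fsync");
+ */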
+
+/*
+ * Rename files. Source and destination must either both be directories,
+ * or both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+/* ARGSUSED */
+int
+rename(p, uap)
+ struct proc *p;
+ register struct rename_args /* {
+ syscallarg(char *) from;
+ syscallarg(char *) to;
+ } */ *uap;
+{
+ register struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+ SCARG(uap, from), p);
+ if (error = namei(&fromnd))
+ return (error);
+ fvp = fromnd.ni_vp;
+ NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, to), p);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&tond)) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ if (fvp == tdvp)
+ error = EINVAL;
+ /*
+ * If source is the same as the destination (that is the
+ * same inode number with the same name in the same directory),
+ * then there is nothing to do.
+ */
+ if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+ fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+ !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+ fromnd.ni_cnd.cn_namelen))
+ error = -1;
+out:
+ if (!error) {
+ VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE);
+ if (fromnd.ni_dvp != tdvp) {
+ VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ }
+ if (tvp) {
+ VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE);
+ }
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ } else {
+ VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename");
+ zfree(namei_zone, tond.ni_cnd.cn_pnbuf);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf);
+ if (error == -1)
+ return (0);
+ return (error);
+}
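+
+/*
+ * Illustrative userland sketch (not part of this file): rename(2)
+ * atomically replaces an existing target, the standard idiom for
+ * rewriting a file safely; the paths are placeholders.
+ *
+ *	#include <stdio.h>
+ *	#include <err.h>
+ *
+ *	if (rename("/etc/example.conf.tmp", "/etc/example.conf") == -1)
+ *		err(1, "rename");
+ */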
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkdir(p, uap)
+ struct proc *p;
+ register struct mkdir_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ if (!error)
+ vput(nd.ni_vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir");
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+rmdir(p, uap)
+ struct proc *p;
+ struct rmdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+out:
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULLVP)
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir");
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(p, uap)
+ struct proc *p;
+ register struct ogetdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = SCARG(uap, count);
+ MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = SCARG(uap, count) - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ FREE(dirbuf, M_TEMP);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+ if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) {
+ error = union_dircheckp(p, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error)
+ return (error);
+ }
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+getdirentries(p, uap)
+ struct proc *p;
+ register struct getdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ long loff;
+ int error, eofflag;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+ if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) {
+ error = union_dircheckp(p, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error)
+ return (error);
+ }
+ if (SCARG(uap, basep) != NULL) {
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ }
+ p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+getdents(p, uap)
+ struct proc *p;
+ register struct getdents_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return getdirentries(p, &ap);
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+umask(p, uap)
+ struct proc *p;
+ struct umask_args /* {
+ syscallarg(int) newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ fdp = p->p_fd;
+ p->p_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS;
+ return (0);
+}
+
+/*
+ * Void all references to file by ripping underlying filesystem
+ * away from vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+revoke(p, uap)
+ struct proc *p;
+ register struct revoke_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))
+ goto out;
+ if (p->p_ucred->cr_uid != vattr.va_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))
+ goto out;
+ if (vp->v_usecount > 1 || (vp->v_flag & VALIASED))
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ */
+int
+getvnode(fdp, fd, fpp)
+ struct filedesc *fdp;
+ int fd;
+ struct file **fpp;
+{
+ struct file *fp;
+
+ if ((u_int)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
+ return (EINVAL);
+ *fpp = fp;
+ return (0);
+}
+#ifndef _SYS_SYSPROTO_H_
+struct __getcwd_args {
+ u_char *buf;
+ u_int buflen;
+};
+#endif
+#define STATNODE(mode, name, var) \
+ SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
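+
+/*
+ * For reference, STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls)
+ * below expands to:
+ *
+ *	SYSCTL_INT(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD,
+ *	    &numcwdcalls, 0, "");
+ */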
+
+static int disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
+
+static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
+static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
+static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
+static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
+static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
+static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
+int
+__getcwd(p, uap)
+ struct proc *p;
+ struct __getcwd_args *uap;
+{
+ char *bp, *buf;
+ int error, i, slash_prefixed;
+ struct filedesc *fdp;
+ struct namecache *ncp;
+ struct vnode *vp;
+
+ numcwdcalls++;
+ if (disablecwd)
+ return (ENODEV);
+ if (uap->buflen < 2)
+ return (EINVAL);
+ if (uap->buflen > MAXPATHLEN)
+ uap->buflen = MAXPATHLEN;
+ buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
+ bp += uap->buflen - 1;
+ *bp = '\0';
+ fdp = p->p_fd;
+ slash_prefixed = 0;
+ for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
+ if (vp->v_flag & VROOT) {
+ vp = vp->v_mount->mnt_vnodecovered;
+ continue;
+ }
+ if (vp->v_dd->v_id != vp->v_ddid) {
+ numcwdfail1++;
+ free(buf, M_TEMP);
+ return (ENOTDIR);
+ }
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (!ncp) {
+ numcwdfail2++;
+ free(buf, M_TEMP);
+ return (ENOENT);
+ }
+ if (ncp->nc_dvp != vp->v_dd) {
+ numcwdfail3++;
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ for (i = ncp->nc_nlen - 1; i >= 0; i--) {
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = ncp->nc_name[i];
+ }
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ slash_prefixed = 1;
+ vp = vp->v_dd;
+ }
+ if (!slash_prefixed) {
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ }
+ numcwdfound++;
+ error = copyout(bp, uap->buf, strlen(bp) + 1);
+ free(buf, M_TEMP);
+ return (error);
+}
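+
+/*
+ * Illustrative sketch (assumed usage, not part of this file): the
+ * intended consumer is the getcwd(3) wrapper in libc, which tries
+ * this fast syscall first and falls back to the traditional
+ * readdir()-based ".." walk when the name cache cannot supply the
+ * whole path (ENOENT, ENOMEM, ...):
+ *
+ *	char buf[MAXPATHLEN];
+ *
+ *	if (__getcwd(buf, sizeof(buf)) == 0)
+ *		return (buf);
+ *	... fall back to the slow lookup ...
+ */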
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
new file mode 100644
index 0000000..43589c74
--- /dev/null
+++ b/sys/kern/vfs_init.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ * $Id: vfs_init.c,v 1.40 1998/11/15 15:18:30 bde Exp $
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <vm/vm_zone.h>
+
+
+MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
+
+/*
+ * XXX this bloat just expands the sysctl__vfs linker set a little so that
+ * we can attach sysctls for VFS modules without expanding the linker set.
+ * Currently (1998/09/06), only one VFS uses sysctls, so 2 extra linker
+ * set slots are more than sufficient.
+ */
+extern struct linker_set sysctl__vfs;
+static int mod_xx;
+SYSCTL_INT(_vfs, OID_AUTO, mod0, CTLFLAG_RD, &mod_xx, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, mod1, CTLFLAG_RD, &mod_xx, 0, "");
+
+/*
+ * Zone for namei
+ */
+struct vm_zone *namei_zone;
+
+/*
+ * vfs_init.c
+ *
+ * Allocate and fill in operations vectors.
+ *
+ * An undocumented feature of this approach to defining operations is that
+ * there can be multiple entries in vfs_opv_descs for the same operations
+ * vector. This allows third parties to extend the set of operations
+ * supported by another layer in a binary compatible way. For example,
+ * assume that NFS needed to be modified to support Ficus. NFS has an entry
+ * (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
+ * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
+ * listing those new operations Ficus adds to NFS, all without modifying the
+ * NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
+ * that is a (whole) other story.) This is a feature.
+ */
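+
+/*
+ * Illustrative sketch of the registration pattern described above,
+ * for a hypothetical "foofs" (names invented; real filesystems
+ * normally go through the VNODEOP_SET() macro, which arranges for
+ * vfs_add_vnodeops() below to run at boot or module load):
+ *
+ *	vop_t **foofs_vnodeop_p;
+ *	static struct vnodeopv_entry_desc foofs_vnodeop_entries[] = {
+ *		{ &vop_default_desc,	(vop_t *) vop_defaultop },
+ *		{ &vop_lookup_desc,	(vop_t *) foofs_lookup },
+ *		{ NULL, NULL }
+ *	};
+ *	static struct vnodeopv_desc foofs_vnodeop_opv_desc =
+ *		{ &foofs_vnodeop_p, foofs_vnodeop_entries };
+ *	VNODEOP_SET(foofs_vnodeop_opv_desc);
+ */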
+
+/* Table of known vnodeop vectors (list of VFS vnode vectors) */
+static struct vnodeopv_desc **vnodeopv_descs;
+static int vnodeopv_num;
+
+/* Table of known descs (list of vnode op handlers "vop_access_desc") */
+static struct vnodeop_desc **vfs_op_descs;
+static int *vfs_op_desc_refs; /* reference counts */
+static int num_op_descs;
+static int vfs_opv_numops;
+
+static void
+vfs_opv_recalc(void)
+{
+ int i, j;
+ vop_t ***opv_desc_vector_p;
+ vop_t **opv_desc_vector;
+ struct vnodeopv_entry_desc *opve_descp;
+ struct vnodeopv_desc *opv;
+
+ if (vfs_op_descs == NULL)
+ panic("vfs_opv_recalc called with null vfs_op_descs");
+
+ /*
+ * Run through and make sure all known descs have an offset
+ *
+ * vop_default_desc is hardwired at offset 1, and offset 0
+ * is a panic sanity check.
+ */
+ vfs_opv_numops = 0;
+ for (i = 0; i < num_op_descs; i++)
+ if (vfs_opv_numops < (vfs_op_descs[i]->vdesc_offset + 1))
+ vfs_opv_numops = vfs_op_descs[i]->vdesc_offset + 1;
+ for (i = 0; i < num_op_descs; i++)
+ if (vfs_op_descs[i]->vdesc_offset == 0)
+ vfs_op_descs[i]->vdesc_offset = vfs_opv_numops++;
+ /*
+ * Allocate and fill in the vectors
+ */
+ for (i = 0; i < vnodeopv_num; i++) {
+ opv = vnodeopv_descs[i];
+ opv_desc_vector_p = opv->opv_desc_vector_p;
+ if (*opv_desc_vector_p)
+ FREE(*opv_desc_vector_p, M_VNODE);
+ MALLOC(*opv_desc_vector_p, vop_t **,
+ vfs_opv_numops * sizeof(vop_t *), M_VNODE, M_WAITOK);
+ if (*opv_desc_vector_p == NULL)
+ panic("no memory for vop_t ** vector");
+ bzero(*opv_desc_vector_p, vfs_opv_numops * sizeof(vop_t *));
+
+ /* Fill in, with slot 0 being panic */
+ opv_desc_vector = *opv_desc_vector_p;
+ opv_desc_vector[0] = (vop_t *)vop_panic;
+ for (j = 0; opv->opv_desc_ops[j].opve_op; j++) {
+ opve_descp = &(opv->opv_desc_ops[j]);
+ opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
+ opve_descp->opve_impl;
+ }
+
+ /* Replace unfilled routines with their default (slot 1). */
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ if (opv_desc_vector[1] == NULL)
+ panic("vfs_opv_recalc: vector without a default.");
+ for (j = 0; j < vfs_opv_numops; j++)
+ if (opv_desc_vector[j] == NULL)
+ opv_desc_vector[j] = opv_desc_vector[1];
+ }
+}
+
+void
+vfs_add_vnodeops(void *data)
+{
+ struct vnodeopv_desc *opv;
+ struct vnodeopv_desc **newopv;
+ struct vnodeop_desc **newop;
+ int *newref;
+ vop_t **opv_desc_vector;
+ struct vnodeop_desc *desc;
+ int i, j;
+
+ opv = (struct vnodeopv_desc *)data;
+ MALLOC(newopv, struct vnodeopv_desc **,
+ (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK);
+ if (newopv == NULL)
+ panic("vfs_add_vnodeops: no memory");
+ if (vnodeopv_descs) {
+ bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv));
+ FREE(vnodeopv_descs, M_VNODE);
+ }
+ newopv[vnodeopv_num] = opv;
+ vnodeopv_descs = newopv;
+ vnodeopv_num++;
+
+ /* See if we have turned up a new vnode op desc */
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) {
+ for (j = 0; j < num_op_descs; j++) {
+ if (desc == vfs_op_descs[j]) {
+ /* found it, increase reference count */
+ vfs_op_desc_refs[j]++;
+ break;
+ }
+ }
+ if (j == num_op_descs) {
+ /* not found, new entry */
+ MALLOC(newop, struct vnodeop_desc **,
+ (num_op_descs + 1) * sizeof(*newop),
+ M_VNODE, M_WAITOK);
+ if (newop == NULL)
+ panic("vfs_add_vnodeops: no memory for desc");
+ /* new reference count (for unload) */
+ MALLOC(newref, int *,
+ (num_op_descs + 1) * sizeof(*newref),
+ M_VNODE, M_WAITOK);
+ if (newref == NULL)
+ panic("vfs_add_vnodeops: no memory for refs");
+ if (vfs_op_descs) {
+ bcopy(vfs_op_descs, newop,
+ num_op_descs * sizeof(*newop));
+ FREE(vfs_op_descs, M_VNODE);
+ }
+ if (vfs_op_desc_refs) {
+ bcopy(vfs_op_desc_refs, newref,
+ num_op_descs * sizeof(*newref));
+ FREE(vfs_op_desc_refs, M_VNODE);
+ }
+ newop[num_op_descs] = desc;
+ newref[num_op_descs] = 1;
+ vfs_op_descs = newop;
+ vfs_op_desc_refs = newref;
+ num_op_descs++;
+ }
+ }
+ vfs_opv_recalc();
+}
+
+void
+vfs_rm_vnodeops(void *data)
+{
+ struct vnodeopv_desc *opv;
+ struct vnodeopv_desc **newopv;
+ struct vnodeop_desc **newop;
+ int *newref;
+ vop_t **opv_desc_vector;
+ struct vnodeop_desc *desc;
+ int i, j, k;
+
+ opv = (struct vnodeopv_desc *)data;
+ /* Lower ref counts on descs in the table and release if zero */
+ opv_desc_vector = *(opv->opv_desc_vector_p);
+ for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) {
+ for (j = 0; j < num_op_descs; j++) {
+ if (desc == vfs_op_descs[j]) {
+ /* found it, decrease reference count */
+ vfs_op_desc_refs[j]--;
+ break;
+ }
+ }
+ for (j = 0; j < num_op_descs; j++) {
+ if (vfs_op_desc_refs[j] > 0)
+ continue;
+ if (vfs_op_desc_refs[j] < 0)
+ panic("vfs_remove_vnodeops: negative refcnt");
+ MALLOC(newop, struct vnodeop_desc **,
+ (num_op_descs - 1) * sizeof(*newop),
+ M_VNODE, M_WAITOK);
+ if (newop == NULL)
+ panic("vfs_remove_vnodeops: no memory for desc");
+ /* new reference count (for unload) */
+ MALLOC(newref, int *,
+ (num_op_descs - 1) * sizeof(*newref),
+ M_VNODE, M_WAITOK);
+ if (newref == NULL)
+ panic("vfs_remove_vnodeops: no memory for refs");
+ for (k = j; k < (num_op_descs - 1); k++) {
+ vfs_op_descs[k] = vfs_op_descs[k + 1];
+ vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1];
+ }
+ bcopy(vfs_op_descs, newop,
+ (num_op_descs - 1) * sizeof(*newop));
+ bcopy(vfs_op_desc_refs, newref,
+ (num_op_descs - 1) * sizeof(*newref));
+ FREE(vfs_op_descs, M_VNODE);
+ FREE(vfs_op_desc_refs, M_VNODE);
+ vfs_op_descs = newop;
+ vfs_op_desc_refs = newref;
+ num_op_descs--;
+ }
+ }
+
+ for (i = 0; i < vnodeopv_num; i++) {
+ if (vnodeopv_descs[i] == opv) {
+ for (j = i; j < (vnodeopv_num - 1); j++)
+ vnodeopv_descs[j] = vnodeopv_descs[j + 1];
+ break;
+ }
+ }
+ if (i == vnodeopv_num)
+ panic("vfs_remove_vnodeops: opv not found");
+ MALLOC(newopv, struct vnodeopv_desc **,
+ (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK);
+ if (newopv == NULL)
+ panic("vfs_remove_vnodeops: no memory");
+ bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv));
+ FREE(vnodeopv_descs, M_VNODE);
+ vnodeopv_descs = newopv;
+ vnodeopv_num--;
+
+ vfs_opv_recalc();
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+struct vattr va_null;
+
+/*
+ * Initialize the vnode structures and initialize each file system type.
+ */
+/* ARGSUSED*/
+static void
+vfsinit(void *dummy)
+{
+
+ namei_zone = zinit("NAMEI", MAXPATHLEN, 0, 0, 2);
+
+ /*
+ * Initialize the vnode table
+ */
+ vntblinit();
+ /*
+ * Initialize the vnode name cache
+ */
+ nchinit();
+ /*
+ * Initialize each file system type.
+ * Vfs type numbers must be distinct from VFS_GENERIC (and VFS_VFSCONF).
+ */
+ vattr_null(&va_null);
+ maxvfsconf = VFS_GENERIC + 1;
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL)
+
+int
+vfs_register(struct vfsconf *vfc)
+{
+ struct linker_set *l;
+ struct sysctl_oid **oidpp;
+ struct vfsconf *vfsp;
+ int i, exists;
+
+ vfsp = NULL;
+ l = &sysctl__vfs;
+ if (vfsconf)
+ for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfc->vfc_name, vfsp->vfc_name))
+ return EEXIST;
+
+ vfc->vfc_typenum = maxvfsconf++;
+ if (vfc->vfc_vfsops->vfs_oid != NULL) {
+ /*
+ * Attach the oid to the "vfs" node of the sysctl tree if
+ * it isn't already there (it will be there for statically
+ * configured vfs's).
+ */
+ exists = 0;
+ for (i = l->ls_length,
+ oidpp = (struct sysctl_oid **)l->ls_items;
+ i-- != 0; oidpp++)
+ if (*oidpp == vfc->vfc_vfsops->vfs_oid) {
+ exists = 1;
+ break;
+ }
+ if (exists == 0)
+ for (i = l->ls_length,
+ oidpp = (struct sysctl_oid **)l->ls_items;
+ i-- != 0; oidpp++) {
+ if (*oidpp == NULL ||
+ *oidpp == &sysctl___vfs_mod0 ||
+ *oidpp == &sysctl___vfs_mod1) {
+ *oidpp = vfc->vfc_vfsops->vfs_oid;
+ break;
+ }
+ }
+
+ vfc->vfc_vfsops->vfs_oid->oid_number = vfc->vfc_typenum;
+ sysctl_order_all();
+ }
+ if (vfsp)
+ vfsp->vfc_next = vfc;
+ else
+ vfsconf = vfc;
+ vfc->vfc_next = NULL;
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+
+ return 0;
+}
+
+
+int
+vfs_unregister(struct vfsconf *vfc)
+{
+ struct linker_set *l;
+ struct sysctl_oid **oidpp;
+ struct vfsconf *vfsp, *prev_vfsp;
+ int error, i, maxtypenum;
+
+ i = vfc->vfc_typenum;
+
+ prev_vfsp = NULL;
+ for (vfsp = vfsconf; vfsp;
+ prev_vfsp = vfsp, vfsp = vfsp->vfc_next) {
+ if (!strcmp(vfc->vfc_name, vfsp->vfc_name))
+ break;
+ }
+ if (vfsp == NULL)
+ return EINVAL;
+ if (vfsp->vfc_refcount)
+ return EBUSY;
+ if (vfc->vfc_vfsops->vfs_uninit != NULL) {
+ error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
+ if (error)
+ return (error);
+ }
+ if (prev_vfsp)
+ prev_vfsp->vfc_next = vfsp->vfc_next;
+ else
+ vfsconf = vfsp->vfc_next;
+ if (vfsp->vfc_vfsops->vfs_oid != NULL) {
+ l = &sysctl__vfs;
+ for (i = l->ls_length,
+ oidpp = (struct sysctl_oid **)l->ls_items;
+ i--; oidpp++) {
+ if (*oidpp == vfsp->vfc_vfsops->vfs_oid) {
+ *oidpp = NULL;
+ sysctl_order_all();
+ break;
+ }
+ }
+ }
+ maxtypenum = VFS_GENERIC;
+ for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+ return 0;
+}
+
+int
+vfs_modevent(module_t mod, int type, void *data)
+{
+ struct vfsconf *vfc;
+ int error = 0;
+
+ vfc = (struct vfsconf *)data;
+
+ switch (type) {
+ case MOD_LOAD:
+ if (vfc)
+ error = vfs_register(vfc);
+ break;
+
+ case MOD_UNLOAD:
+ if (vfc)
+ error = vfs_unregister(vfc);
+ break;
+ default: /* including MOD_SHUTDOWN */
+ break;
+ }
+ return (error);
+}
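+
+/*
+ * Illustrative sketch (hypothetical "foofs"): a loadable filesystem
+ * normally reaches vfs_modevent() through the VFS_SET() macro in
+ * <sys/mount.h>, which expands to roughly:
+ *
+ *	static struct vfsconf foofs_vfsconf = {
+ *		&foofs_vfsops, "foofs", -1, 0, 0
+ *	};
+ *	static moduledata_t foofs_mod = {
+ *		"foofs", vfs_modevent, &foofs_vfsconf
+ *	};
+ *	DECLARE_MODULE(foofs, foofs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
+ */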
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
new file mode 100644
index 0000000..67efd52
--- /dev/null
+++ b/sys/kern/vfs_lookup.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ * $Id: vfs_lookup.c,v 1.30 1999/01/08 17:31:16 eivind Exp $
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm_zone.h>
+
+/*
+ * Convert a pathname into a pointer to a locked inode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
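+
+/*
+ * Typical caller sketch (cf. the syscalls in vfs_syscalls.c):
+ *
+ *	struct nameidata nd;
+ *	int error;
+ *
+ *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
+ *	if ((error = namei(&nd)) != 0)
+ *		return (error);
+ *	... operate on the locked vnode nd.ni_vp ...
+ *	vput(nd.ni_vp);
+ */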
+int
+namei(ndp)
+ register struct nameidata *ndp;
+{
+ register struct filedesc *fdp; /* pointer to file descriptor state */
+ register char *cp; /* pointer into pathname argument */
+ register struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct uio auio;
+ int error, linklen;
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct proc *p = cnp->cn_proc;
+
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred;
+ KASSERT(cnp->cn_cred && cnp->cn_proc, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ fdp = cnp->cn_proc->p_fd;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = zalloc(namei_zone);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (!error && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
+ if (error) {
+ zfree(namei_zone, cnp->cn_pnbuf);
+ ndp->ni_vp = NULL;
+ return (error);
+ }
+ ndp->ni_loopcnt = 0;
+#ifdef KTRACE
+ if (KTRPOINT(cnp->cn_proc, KTR_NAMEI))
+ ktrnamei(cnp->cn_proc->p_tracep, cnp->cn_pnbuf);
+#endif
+
+ /*
+ * Get starting point for the translation.
+ */
+ ndp->ni_rootdir = fdp->fd_rdir;
+
+ dp = fdp->fd_cdir;
+ VREF(dp);
+ for (;;) {
+ /*
+ * Check if root directory should replace current directory.
+ * Done at start of translation and after symbolic link.
+ */
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (*(cnp->cn_nameptr) == '/') {
+ vrele(dp);
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ dp = ndp->ni_rootdir;
+ VREF(dp);
+ }
+ ndp->ni_startdir = dp;
+ error = lookup(ndp);
+ if (error) {
+ zfree(namei_zone, cnp->cn_pnbuf);
+ return (error);
+ }
+ /*
+ * Check for symbolic link
+ */
+ if ((cnp->cn_flags & ISSYMLINK) == 0) {
+ if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
+ zfree(namei_zone, cnp->cn_pnbuf);
+ else
+ cnp->cn_flags |= HASBUF;
+
+ if (ndp->ni_vp && ndp->ni_vp->v_type == VREG &&
+ (cnp->cn_nameiop != DELETE) &&
+ ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) ==
+ LOCKLEAF))
+ vfs_object_create(ndp->ni_vp,
+ ndp->ni_cnd.cn_proc,
+ ndp->ni_cnd.cn_cred);
+
+ return (0);
+ }
+ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
+ VOP_UNLOCK(ndp->ni_dvp, 0, p);
+ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+ error = ELOOP;
+ break;
+ }
+ if (ndp->ni_pathlen > 1)
+ cp = zalloc(namei_zone);
+ else
+ cp = cnp->cn_pnbuf;
+ aiov.iov_base = cp;
+ aiov.iov_len = MAXPATHLEN;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_procp = (struct proc *)0;
+ auio.uio_resid = MAXPATHLEN;
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error) {
+ if (ndp->ni_pathlen > 1)
+ zfree(namei_zone, cp);
+ break;
+ }
+ linklen = MAXPATHLEN - auio.uio_resid;
+ if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
+ if (ndp->ni_pathlen > 1)
+ zfree(namei_zone, cp);
+ error = ENAMETOOLONG;
+ break;
+ }
+ if (ndp->ni_pathlen > 1) {
+ bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+ zfree(namei_zone, cnp->cn_pnbuf);
+ cnp->cn_pnbuf = cp;
+ } else
+ cnp->cn_pnbuf[linklen] = '\0';
+ ndp->ni_pathlen += linklen;
+ vput(ndp->ni_vp);
+ dp = ndp->ni_dvp;
+ }
+ zfree(namei_zone, cnp->cn_pnbuf);
+ vrele(ndp->ni_dvp);
+ vput(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
+ * The starting directory is taken from ni_startdir. The pathname is
+ * descended until done, or a symbolic link is encountered. The variable
+ * ni_more is clear if the path is completed; it is set to one if a
+ * symbolic link needing interpretation is encountered.
+ *
+ * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
+ * whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If flag has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into the flag
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".". When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ * identify next component of name at ndp->ni_ptr
+ * handle degenerate case where name is null string
+ * if .. and crossing mount points and on mounted filesys, find parent
+ * call VOP_LOOKUP routine for next component name
+ * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ * component vnode returned in ni_vp (if it exists), locked.
+ * if result vnode is mounted on and crossing mount points,
+ * find mounted on vnode
+ * if more components of name, do next level at dirloop
+ * return the answer in ni_vp, locked if LOCKLEAF set
+ * if LOCKPARENT set, return locked parent in ni_dvp
+ * if WANTPARENT set, return unlocked parent in ni_dvp
+ */
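+
+/*
+ * Example of the flag conventions above (sketch): a creating caller,
+ * e.g. open(2) with O_CREAT, typically passes
+ *
+ *	NDINIT(&nd, CREATE, LOCKPARENT | FOLLOW, UIO_USERSPACE, path, p);
+ *
+ * and, when the last component does not exist, gets back ni_vp == NULL
+ * with the parent directory locked in ni_dvp, ready for VOP_CREATE().
+ */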
+int
+lookup(ndp)
+ register struct nameidata *ndp;
+{
+ register char *cp; /* pointer into pathname argument */
+ register struct vnode *dp = 0; /* the directory we are searching */
+ struct vnode *tdp; /* saved dp */
+ struct mount *mp; /* mount table entry */
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int trailing_slash;
+ int error = 0;
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct proc *p = cnp->cn_proc;
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ ndp->ni_dvp = NULL;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = ndp->ni_startdir;
+ ndp->ni_startdir = NULLVP;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p);
+
+dirloop:
+ /*
+ * Search a new directory.
+ *
+ * The cn_hash value is for use by vfs_cache.
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ cnp->cn_consume = 0;
+ cnp->cn_hash = 0;
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ cnp->cn_hash += (unsigned char)*cp;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ { char c = *cp;
+ *cp = '\0';
+ printf("{%s}: ", cnp->cn_nameptr);
+ *cp = c; }
+#endif
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ trailing_slash = 0;
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ trailing_slash = 1;
+ *ndp->ni_next = '\0'; /* XXX for direnter() ... */
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+ if (*cp == '\0' && docache == 0)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (wantparent) {
+ ndp->ni_dvp = dp;
+ VREF(dp);
+ }
+ ndp->ni_vp = dp;
+ if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+ VOP_UNLOCK(dp, 0, p);
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ /*
+ * Handle "..": two special cases.
+ * 1. If at root directory (e.g. after chroot)
+ * or at absolute root directory
+ * then ignore it so can't get out.
+ * 2. If this vnode is the root of a mounted
+ * filesystem, then replace it with the
+ * vnode which was mounted on so we take the
+ * .. in the other file system.
+ */
+ if (cnp->cn_flags & ISDOTDOT) {
+ for (;;) {
+ if (dp == ndp->ni_rootdir || dp == rootvnode) {
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = dp;
+ VREF(dp);
+ goto nextname;
+ }
+ if ((dp->v_flag & VROOT) == 0 ||
+ (cnp->cn_flags & NOCROSSMOUNT))
+ break;
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ vput(tdp);
+ VREF(dp);
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p);
+ }
+ }
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+unionlookup:
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = NULL;
+ ASSERT_VOP_LOCKED(dp, "lookup");
+ if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) {
+ KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
+#ifdef NAMEI_DIAGNOSTIC
+ printf("not found\n");
+#endif
+ if ((error == ENOENT) &&
+ (dp->v_flag & VROOT) &&
+ (dp->v_mount->mnt_flag & MNT_UNION)) {
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ vput(tdp);
+ VREF(dp);
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p);
+ goto unionlookup;
+ }
+
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ if (*cp == '\0' && trailing_slash &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory inode in ndp->ni_dvp.
+ */
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ return (0);
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ printf("found\n");
+#endif
+
+ ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup");
+
+ /*
+ * Take into account any additional components consumed by
+ * the underlying filesystem.
+ */
+ if (cnp->cn_consume > 0) {
+ cnp->cn_nameptr += cnp->cn_consume;
+ ndp->ni_next += cnp->cn_consume;
+ ndp->ni_pathlen -= cnp->cn_consume;
+ cnp->cn_consume = 0;
+ }
+
+ dp = ndp->ni_vp;
+
+ /*
+ * Check to see if the vnode has been mounted on;
+ * if so find the root of the mounted file system.
+ */
+ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+ (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+ if (vfs_busy(mp, 0, 0, p))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, p);
+ if (error)
+ goto bad2;
+ vput(dp);
+ ndp->ni_vp = dp = tdp;
+ }
+
+ /*
+ * Check for symbolic link
+ */
+ if ((dp->v_type == VLNK) &&
+ ((cnp->cn_flags & FOLLOW) || trailing_slash ||
+ *ndp->ni_next == '/')) {
+ cnp->cn_flags |= ISSYMLINK;
+ if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
+ error = EACCES;
+ goto bad2;
+ }
+ return (0);
+ }
+
+ /*
+ * Check for bogus trailing slashes.
+ */
+ if (trailing_slash && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+
+nextname:
+ /*
+ * Not a symbolic link. If more pathname,
+ * continue at next component, else return.
+ */
+ if (*ndp->ni_next == '/') {
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ if (ndp->ni_dvp != ndp->ni_vp) {
+ ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup");
+ }
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ /*
+ * Disallow directory write attempts on read-only file systems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ if (!wantparent)
+ vrele(ndp->ni_dvp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0, p);
+ return (0);
+
+bad2:
+ if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0')
+ VOP_UNLOCK(ndp->ni_dvp, 0, p);
+ vrele(ndp->ni_dvp);
+bad:
+ vput(dp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * relookup - lookup a path name component
+ * Used by lookup to re-acquire things.
+ */
+int
+relookup(dvp, vpp, cnp)
+ struct vnode *dvp, **vpp;
+ struct componentname *cnp;
+{
+ struct proc *p = cnp->cn_proc;
+ struct vnode *dp = 0; /* the directory we are searching */
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+#ifdef NAMEI_DIAGNOSTIC
+ int newhash; /* DEBUG: check name hash */
+ char *cp; /* DEBUG: check name ptr/len */
+#endif
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = dvp;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p);
+
+/* dirloop: */
+ /*
+ * Search a new directory.
+ *
+ * The cn_hash value is for use by vfs_cache.
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ newhash += (unsigned char)*cp;
+ if (newhash != cnp->cn_hash)
+ panic("relookup: bad hash");
+ if (cnp->cn_namelen != cp - cnp->cn_nameptr)
+ panic ("relookup: bad len");
+ if (*cp != 0)
+ panic("relookup: not last component");
+ printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (cnp->cn_nameiop != LOOKUP || wantparent) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (!(cnp->cn_flags & LOCKLEAF))
+ VOP_UNLOCK(dp, 0, p);
+ *vpp = dp;
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ if (cnp->cn_flags & ISDOTDOT)
+ panic ("relookup: lookup on dot-dot");
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+ if (error = VOP_LOOKUP(dp, vpp, cnp)) {
+ KASSERT(*vpp == NULL, ("leaf should be empty"));
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory inode in ndp->ni_dvp.
+ */
+ return (0);
+ }
+ dp = *vpp;
+
+ /*
+ * Check for symbolic link
+ */
+ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
+ ("relookup: symlink found.\n"));
+
+ /*
+ * Disallow directory write attempts on read-only file systems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+
+ if (!wantparent)
+ vrele(dvp);
+
+ if (dp->v_type == VREG &&
+ ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF))
+ vfs_object_create(dp, cnp->cn_proc, cnp->cn_cred);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0, p);
+ return (0);
+
+bad2:
+ if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
+ VOP_UNLOCK(dvp, 0, p);
+ vrele(dvp);
+bad:
+ vput(dp);
+ *vpp = NULL;
+ return (error);
+}
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 0000000..a7a830f
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94
+ * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $
+ */
+
+/*
+ * PURPOSE: This file abstracts the root mounting interface from
+ * the per file system semantics for handling mounts,
+ * the overall intent of which is to move the BSD
+ * internals dependence out of the FS code, both to
+ * make the FS code more portable and to free up some
+ * of the BSD internals so that they may more easily
+ * be changed.
+ *
+ * NOTE1: Code is single entry/single exit to aid debugging
+ * and conversion for kernel multithreading.
+ *
+ * NOTE2: Code notes lock state in headers on entry and exit
+ * as an aid to conversion for kernel multithreading
+ * and SMP reentrancy.
+ */
+#include "opt_bootp.h"
+
+#include <sys/param.h> /* dev_t (types.h)*/
+#include <sys/kernel.h>
+#include <sys/systm.h> /* rootvp*/
+#include <sys/proc.h> /* curproc*/
+#include <sys/vnode.h> /* NULLVP*/
+#include <sys/mount.h> /* struct mount*/
+#include <sys/malloc.h> /* M_MOUNT*/
+
+/*
+ * GLOBALS
+ */
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
+
+/*
+ * These define the root filesystem, device, and root filesystem type.
+ */
+dev_t rootdevs[] = { NODEV, NODEV };
+char *rootdevnames[2];
+struct vnode *rootvnode;
+char *mountrootfsname;
+#ifdef BOOTP
+extern void bootpc_init __P((void));
+#endif
+
+/*
+ * vfs_init() will set maxvfsconf
+ * to the highest defined type number.
+ */
+int maxvfsconf;
+struct vfsconf *vfsconf;
+
+/*
+ * Common root mount code shared by all filesystems
+ */
+#define ROOTNAME "root_device"
+
+/*
+ * vfs_mountrootfs
+ *
+ * Common entry point for root mounts
+ *
+ * PARAMETERS:
+ * NONE
+ *
+ * RETURNS: 0 Success
+ * !0 error number (errno.h)
+ *
+ * LOCK STATE:
+ * ENTRY
+ * <no locks held>
+ * EXIT
+ * <no locks held>
+ *
+ * NOTES:
+ * This code is currently supported only for use for
+ * the FFS file system type. This is a matter of
+ * fixing the other file systems, not this code!
+ */
+static void
+vfs_mountrootfs(void *unused)
+{
+ struct mount *mp;
+ int i, err;
+ struct proc *p = curproc; /* XXX */
+ dev_t orootdev;
+
+#ifdef BOOTP
+ bootpc_init();
+#endif
+ /*
+ * New root mount structure
+ */
+ if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) {
+ printf("error %d: ", err);
+ panic("cannot mount root\n");
+ return ;
+ }
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /*
+ * Attempt the mount
+ */
+ err = ENXIO;
+ orootdev = rootdev;
+ if (rootdevs[0] == NODEV)
+ rootdevs[0] = rootdev;
+ for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) {
+ if (rootdevs[i] == NODEV)
+ break;
+ rootdev = rootdevs[i];
+ if (rootdev != orootdev) {
+ printf("changing root device to %s\n", rootdevnames[i]);
+ orootdev = rootdev;
+ }
+ strncpy(mp->mnt_stat.f_mntfromname,
+ rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1);
+ err = VFS_MOUNT(mp, NULL, NULL, NULL, p);
+ if (err != ENXIO)
+ break;
+ }
+ if (err) {
+ /*
+ * XXX should ask the user for the name in some cases.
+ * Why do we call vfs_unbusy() here and not after ENXIO
+ * is returned above?
+ */
+ vfs_unbusy(mp, p);
+ /*
+ * free mount struct before failing
+ * (hardly worthwhile with the PANIC eh?)
+ */
+ free(mp, M_MOUNT);
+ printf("error %d: ", err);
+ panic("cannot mount root (2)\n");
+ return;
+ }
+
+ simple_lock(&mountlist_slock);
+
+ /*
+ * Add fs to list of mounted file systems
+ */
+ CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+
+ simple_unlock(&mountlist_slock);
+ vfs_unbusy(mp, p);
+
+ /* root mount, update system time from FS specific data*/
+ inittodr(mp->mnt_time);
+ return;
+}
+
+SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL)
+
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
new file mode 100644
index 0000000..44b1698
--- /dev/null
+++ b/sys/kern/vfs_subr.c
@@ -0,0 +1,2872 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/domain.h>
+#include <sys/dirent.h>
+#include <sys/vmmeter.h>
+
+#include <machine/limits.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+#include <vm/vm_zone.h>
+#include <sys/sysctl.h>
+
+#include <miscfs/specfs/specdev.h>
+
+static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+
+static void insmntque __P((struct vnode *vp, struct mount *mp));
+static void vclean __P((struct vnode *vp, int flags, struct proc *p));
+static void vfree __P((struct vnode *));
+static void vgonel __P((struct vnode *vp, struct proc *p));
+static unsigned long numvnodes;
+SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
+
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[9] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT,
+};
+
+static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
+struct tobefreelist vnode_tobefree_list; /* vnode free list */
+
+static u_long wantfreevnodes = 25;
+SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+static u_long freevnodes = 0;
+SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
+
+int vfs_ioopt = 0;
+#ifdef ENABLE_VFS_IOOPT
+SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
+#endif
+
+struct mntlist mountlist; /* mounted filesystem list */
+struct simplelock mountlist_slock;
+struct simplelock mntvnode_slock;
+int nfs_mount_type = -1;
+#ifndef NULL_SIMPLELOCKS
+static struct simplelock mntid_slock;
+static struct simplelock vnode_free_list_slock;
+static struct simplelock spechash_slock;
+#endif
+struct nfs_public nfs_pub; /* publicly exported FS */
+static vm_zone_t vnode_zone;
+
+/*
+ * The workitem queue.
+ */
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+time_t syncdelay = 30;
+int rushjob; /* number of slots to run ASAP */
+
+static int syncer_delayno = 0;
+static long syncer_mask;
+LIST_HEAD(synclist, vnode);
+static struct synclist *syncer_workitem_pending;
+
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
+
+static void vfs_free_addrlist __P((struct netexport *nep));
+static int vfs_free_netcred __P((struct radix_node *rn, void *w));
+static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
+ struct export_args *argp));
+
+/*
+ * Initialize the vnode management data structures.
+ */
+void
+vntblinit()
+{
+
+ desiredvnodes = maxproc + cnt.v_page_count / 4;
+ simple_lock_init(&mntvnode_slock);
+ simple_lock_init(&mntid_slock);
+ simple_lock_init(&spechash_slock);
+ TAILQ_INIT(&vnode_free_list);
+ TAILQ_INIT(&vnode_tobefree_list);
+ simple_lock_init(&vnode_free_list_slock);
+ CIRCLEQ_INIT(&mountlist);
+ vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+}
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Interlock is not released on failure.
+ */
+int
+vfs_busy(mp, flags, interlkp, p)
+ struct mount *mp;
+ int flags;
+ struct simplelock *interlkp;
+ struct proc *p;
+{
+ int lkflags;
+
+ if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & LK_NOWAIT)
+ return (ENOENT);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ if (interlkp) {
+ simple_unlock(interlkp);
+ }
+ /*
+ * Since all busy locks are shared except the exclusive
+ * lock granted when unmounting, the only place that a
+ * wakeup needs to be done is at the release of the
+ * exclusive lock at the end of dounmount.
+ */
+ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
+ if (interlkp) {
+ simple_lock(interlkp);
+ }
+ return (ENOENT);
+ }
+ lkflags = LK_SHARED | LK_NOPAUSE;
+ if (interlkp)
+ lkflags |= LK_INTERLOCK;
+ if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
+ panic("vfs_busy: unexpected lock failure");
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(mp, p)
+ struct mount *mp;
+ struct proc *p;
+{
+
+ lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
+}
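+
+/*
+ * The usual pairing, as in the mount point crossing in lookup():
+ *
+ *	if (vfs_busy(mp, 0, 0, p))
+ *		continue;
+ *	error = VFS_ROOT(mp, &tdp);
+ *	vfs_unbusy(mp, p);
+ */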
+
+/*
+ * Lookup a filesystem type, and if found allocate and initialize
+ * a mount structure for it.
+ *
+ * Devname is usually updated by mount(8) after booting.
+ */
+int
+vfs_rootmountalloc(fstypename, devname, mpp)
+ char *fstypename;
+ char *devname;
+ struct mount **mpp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vfsconf *vfsp;
+ struct mount *mp;
+
+ if (fstypename == NULL)
+ return (ENODEV);
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL)
+ return (ENODEV);
+ mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ LIST_INIT(&mp->mnt_vnodelist);
+ mp->mnt_vfc = vfsp;
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_flag = MNT_RDONLY;
+ mp->mnt_vnodecovered = NULLVP;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_stat.f_mntonname[0] = '/';
+ mp->mnt_stat.f_mntonname[1] = 0;
+ (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Find an appropriate filesystem to use for the root. If a filesystem
+ * has not been preselected, walk through the list of known filesystems
+ * trying those that have mountroot routines, and try them until one
+ * works or we have tried them all.
+ */
+#ifdef notdef /* XXX JH */
+int
+lite2_vfs_mountroot()
+{
+ struct vfsconf *vfsp;
+ extern int (*lite2_mountroot) __P((void));
+ int error;
+
+ if (lite2_mountroot != NULL)
+ return ((*lite2_mountroot)());
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ if (vfsp->vfc_mountroot == NULL)
+ continue;
+ if ((error = (*vfsp->vfc_mountroot)()) == 0)
+ return (0);
+ printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
+ }
+ return (ENODEV);
+}
+#endif
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid)
+ fsid_t *fsid;
+{
+ register struct mount *mp;
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
+ mp = mp->mnt_list.cqe_next) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ simple_unlock(&mountlist_slock);
+ return (mp);
+ }
+ }
+ simple_unlock(&mountlist_slock);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Get a new unique fsid
+ */
+void
+vfs_getnewfsid(mp)
+ struct mount *mp;
+{
+ static u_short xxxfs_mntid;
+
+ fsid_t tfsid;
+ int mtype;
+
+ simple_lock(&mntid_slock);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
+ mp->mnt_stat.f_fsid.val[1] = mtype;
+ if (xxxfs_mntid == 0)
+ ++xxxfs_mntid;
+ tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
+ tfsid.val[1] = mtype;
+ if (mountlist.cqh_first != (void *)&mountlist) {
+ while (vfs_getvfs(&tfsid)) {
+ tfsid.val[0]++;
+ xxxfs_mntid++;
+ }
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ simple_unlock(&mntid_slock);
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(vap)
+ register struct vattr *vap;
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
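+
+/*
+ * Sketch of the usual idiom: a caller changing only some attributes
+ * clears the rest to VNOVAL first, so the filesystem can distinguish
+ * "leave alone" from "set to zero".  E.g. truncating to zero length:
+ *
+ *	struct vattr va;
+ *
+ *	vattr_null(&va);
+ *	va.va_size = 0;
+ *	error = VOP_SETATTR(vp, &va, cred, p);
+ */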
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+extern vop_t **dead_vnodeop_p;
+
+/*
+ * Return the next vnode from the free list.
+ */
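+
+/*
+ * Typical caller sketch (cf. the VFS_VGET/ffs_vget path; the tag and
+ * ops vector names here are assumptions):
+ *
+ *	struct vnode *vp;
+ *
+ *	error = getnewvnode(VT_UFS, mp, ufs_vnodeop_p, &vp);
+ *	if (error)
+ *		return (error);
+ */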
+int
+getnewvnode(tag, mp, vops, vpp)
+ enum vtagtype tag;
+ struct mount *mp;
+ vop_t **vops;
+ struct vnode **vpp;
+{
+ int s;
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp, *tvp, *nvp;
+ vm_object_t object;
+ TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
+
+ /*
+ * We take the least recently used vnode from the freelist
+ * if we can get it and it has no cached pages, and no
+ * namecache entries are relative to it.
+ * Otherwise we allocate a new vnode.
+ */
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ TAILQ_INIT(&vnode_tmp_list);
+
+ for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
+ nvp = TAILQ_NEXT(vp, v_freelist);
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ if (vp->v_flag & VAGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ vp->v_flag &= ~(VTBFREE|VAGE);
+ vp->v_flag |= VFREE;
+ if (vp->v_usecount)
+ panic("tobe free vnode isn't");
+ freevnodes++;
+ }
+
+ if (wantfreevnodes && freevnodes < wantfreevnodes) {
+ vp = NULL;
+ } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
+ /*
+ * XXX: this is only here to be backwards compatible
+ */
+ vp = NULL;
+ } else {
+ for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
+ nvp = TAILQ_NEXT(vp, v_freelist);
+ if (!simple_lock_try(&vp->v_interlock))
+ continue;
+ if (vp->v_usecount)
+ panic("free vnode isn't");
+
+ object = vp->v_object;
+ if (object && (object->resident_page_count || object->ref_count)) {
+ printf("object inconsistant state: RPC: %d, RC: %d\n",
+ object->resident_page_count, object->ref_count);
+ /* Don't recycle if it's caching some pages */
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
+ continue;
+ } else if (LIST_FIRST(&vp->v_cache_src)) {
+ /* Don't recycle if active in the namecache */
+ simple_unlock(&vp->v_interlock);
+ continue;
+ } else {
+ break;
+ }
+ }
+ }
+
+ for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
+ nvp = TAILQ_NEXT(tvp, v_freelist);
+ TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
+ TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
+ simple_unlock(&tvp->v_interlock);
+ }
+
+ if (vp) {
+ vp->v_flag |= VDOOMED;
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ freevnodes--;
+ simple_unlock(&vnode_free_list_slock);
+ cache_purge(vp);
+ vp->v_lease = NULL;
+ if (vp->v_type != VBAD) {
+ vgonel(vp, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+
+#ifdef INVARIANTS
+ {
+ int s;
+
+ if (vp->v_data)
+ panic("cleaned vnode isn't");
+ s = splbio();
+ if (vp->v_numoutput)
+ panic("Clean vnode has pending I/O's");
+ splx(s);
+ }
+#endif
+ vp->v_flag = 0;
+ vp->v_lastr = 0;
+ vp->v_lastw = 0;
+ vp->v_lasta = 0;
+ vp->v_cstart = 0;
+ vp->v_clen = 0;
+ vp->v_socket = 0;
+ vp->v_writecount = 0; /* XXX */
+ vp->v_maxio = 0;
+ } else {
+ simple_unlock(&vnode_free_list_slock);
+ vp = (struct vnode *) zalloc(vnode_zone);
+ bzero((char *) vp, sizeof *vp);
+ simple_lock_init(&vp->v_interlock);
+ vp->v_dd = vp;
+ cache_purge(vp);
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ numvnodes++;
+ }
+
+ TAILQ_INIT(&vp->v_cleanblkhd);
+ TAILQ_INIT(&vp->v_dirtyblkhd);
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ insmntque(vp, mp);
+ *vpp = vp;
+ vp->v_usecount = 1;
+ vp->v_data = 0;
+ splx(s);
+
+ vfs_object_create(vp, p, p->p_ucred);
+ return (0);
+}
+
+/*
+ * Move a vnode from one mount queue to another.
+ */
+static void
+insmntque(vp, mp)
+ register struct vnode *vp;
+ register struct mount *mp;
+{
+
+ simple_lock(&mntvnode_slock);
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ LIST_REMOVE(vp, v_mntvnodes);
+ /*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+ if ((vp->v_mount = mp) == NULL) {
+ simple_unlock(&mntvnode_slock);
+ return;
+ }
+ LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+ simple_unlock(&mntvnode_slock);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ */
+void
+vwakeup(bp)
+ register struct buf *bp;
+{
+ register struct vnode *vp;
+
+ bp->b_flags &= ~B_WRITEINPROG;
+ if ((vp = bp->b_vp)) {
+ vp->v_numoutput--;
+ if (vp->v_numoutput < 0)
+ panic("vwakeup: neg numoutput");
+ if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
+ vp->v_flag &= ~VBWAIT;
+ wakeup((caddr_t) &vp->v_numoutput);
+ }
+ }
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct proc *p;
+ int slpflag, slptimeo;
+{
+ register struct buf *bp;
+ struct buf *nbp, *blist;
+ int s, error;
+ vm_object_t object;
+
+ if (flags & V_SAVE) {
+ s = splbio();
+ while (vp->v_numoutput) {
+ vp->v_flag |= VBWAIT;
+ error = tsleep((caddr_t)&vp->v_numoutput,
+ slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ }
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ splx(s);
+ if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
+ return (error);
+ s = splbio();
+ if (vp->v_numoutput > 0 ||
+ !TAILQ_EMPTY(&vp->v_dirtyblkhd))
+ panic("vinvalbuf: dirty bufs");
+ }
+ splx(s);
+ }
+ s = splbio();
+ for (;;) {
+ blist = TAILQ_FIRST(&vp->v_cleanblkhd);
+ if (!blist)
+ blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
+ if (!blist)
+ break;
+
+ for (bp = blist; bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ error = tsleep((caddr_t) bp,
+ slpflag | (PRIBIO + 4), "vinvalbuf",
+ slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ break;
+ }
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it. Note that vfs_bio_awrite expects
+ * buffers to reside on a queue, while VOP_BWRITE and
+ * brelse do not.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+
+ if (bp->b_vp == vp) {
+ if (bp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_ASYNC);
+ VOP_BWRITE(bp);
+ }
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ (void) VOP_BWRITE(bp);
+ }
+ break;
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+ }
+
+ splx(s);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ simple_lock(&vp->v_interlock);
+ object = vp->v_object;
+ if (object != NULL) {
+ vm_object_page_remove(object, 0, 0,
+ (flags & V_SAVE) ? TRUE : FALSE);
+ }
+ simple_unlock(&vp->v_interlock);
+
+ if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
+ panic("vinvalbuf: flush failed");
+ return (0);
+}
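+
+/*
+ * Usage sketch (illustrative): callers pass V_SAVE to have dirty
+ * buffers written out before everything is invalidated, or 0 to
+ * discard outright, e.g.
+ *
+ *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
+ *
+ * as vclean() does below before reclaiming a vnode.
+ */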
+
+/*
+ * Truncate a file's buffers and pages to a specified length.  This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(vp, cred, p, length, blksize)
+ register struct vnode *vp;
+ struct ucred *cred;
+ struct proc *p;
+ off_t length;
+ int blksize;
+{
+ register struct buf *bp;
+ struct buf *nbp;
+ int s, anyfreed;
+ int trunclbn;
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ trunclbn = (length + blksize - 1) / blksize;
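+	/*
+	 * Worked example: with blksize = 8192, a length of 20000 gives
+	 * trunclbn = (20000 + 8191) / 8192 = 3, so the buffers for
+	 * logical blocks 0-2 survive and block 3 onward is invalidated.
+	 */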
+
+ s = splbio();
+restart:
+ anyfreed = 1;
+ for (;anyfreed;) {
+ anyfreed = 0;
+ for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO + 4, "vtrb1", 0);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI))) {
+ goto restart;
+ }
+ }
+ }
+
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno >= trunclbn) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO + 4, "vtrb2", 0);
+ goto restart;
+ } else {
+ bremfree(bp);
+ bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+ }
+ if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0)) {
+ goto restart;
+ }
+ }
+ }
+ }
+
+ if (length > 0) {
+restartsync:
+ for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = TAILQ_NEXT(bp, b_vnbufs);
+ if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep(bp, PRIBIO, "vtrb3", 0);
+ } else {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ if (bp->b_vp == vp) {
+ bp->b_flags |= B_ASYNC;
+ } else {
+ bp->b_flags &= ~B_ASYNC;
+ }
+ VOP_BWRITE(bp);
+ }
+ goto restartsync;
+ }
+
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
+ }
+
+ splx(s);
+
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+ int s;
+
+ KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+ /*
+ * Insert onto list for new vnode.
+ */
+ s = splbio();
+ bp->b_xflags |= B_VNCLEAN;
+ bp->b_xflags &= ~B_VNDIRTY;
+ TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+ splx(s);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+ struct buflists *listheadp;
+ int s;
+
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp;
+ s = splbio();
+ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
+ if (bp->b_xflags & B_VNDIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
+ }
+ if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+ vp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(vp, v_synclist);
+ }
+ splx(s);
+ bp->b_vp = (struct vnode *) 0;
+ vdrop(vp);
+}
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, metadata on mounted
+ * block devices is delayed only about half the time that file data
+ * is delayed. Similarly, directory updates are more critical, so
+ * they are delayed only about a third of the time that file data is
+ * delayed. Thus, there are SYNCER_MAXDELAY queues that are processed
+ * round-robin at a rate of one each second (driven off the filesystem
+ * syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
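+/*
+ * Slot arithmetic sketch (illustrative): the pending array behaves as
+ * a hash wheel.  With syncer_maxdelay a power of two and syncer_mask
+ * equal to syncer_maxdelay - 1, a requested delay maps to a slot as
+ *
+ *	slot = (syncer_delayno + delay) & syncer_mask;
+ *
+ * the bitwise AND being a cheap modulo that wraps around the wheel.
+ */
+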
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+ struct vnode *vp;
+ int delay;
+{
+ int s, slot;
+
+ s = splbio();
+
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+ vp->v_flag |= VONWORKLST;
+ splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+ struct synclist *slp;
+ struct vnode *vp;
+ long starttime;
+ int s;
+ struct proc *p = updateproc;
+
+ for (;;) {
+ starttime = time_second;
+
+ /*
+ * Push files whose dirty time has expired.
+ */
+ s = splbio();
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ splx(s);
+
+ while ((vp = LIST_FIRST(slp)) != NULL) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+ VOP_UNLOCK(vp, 0, p);
+ if (LIST_FIRST(slp) == vp) {
+ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+ vp->v_type != VBLK)
+ panic("sched_sync: fsync failed");
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ LIST_REMOVE(vp, v_synclist);
+ vn_syncer_add_to_worklist(vp, syncdelay);
+ }
+ }
+
+ /*
+ * Do soft update processing.
+ */
+ if (bioops.io_sync)
+ (*bioops.io_sync)(NULL);
+
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (time_second == starttime)
+ tsleep(&lbolt, PPAUSE, "syncer", 0);
+ }
+}
+
+/*
+ * Associate a p-buffer with a vnode.
+ *
+ * Also sets B_PAGING flag to indicate that vnode is not fully associated
+ * with the buffer. i.e. the bp has not been linked into the vnode or
+ * ref-counted.
+ */
+void
+pbgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
+
+ bp->b_vp = vp;
+ bp->b_flags |= B_PAGING;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+}
+
+/*
+ * Disassociate a p-buffer from a vnode.
+ */
+void
+pbrelvp(bp)
+ register struct buf *bp;
+{
+
+ KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+
+#if !defined(MAX_PERF)
+ /* XXX REMOVE ME */
+ if (bp->b_vnbufs.tqe_next != NULL) {
+ panic(
+ "relpbuf(): b_vp was probably reassignbuf()d %p %x",
+ bp,
+ (int)bp->b_flags
+ );
+ }
+#endif
+ bp->b_vp = (struct vnode *) 0;
+ bp->b_flags &= ~B_PAGING;
+}
+
+void
+pbreassignbuf(bp, newvp)
+ struct buf *bp;
+ struct vnode *newvp;
+{
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_PAGING) == 0) {
+ panic(
+ "pbreassignbuf() on non phys bp %p",
+ bp
+ );
+ }
+#endif
+ bp->b_vp = newvp;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(bp, newvp)
+ register struct buf *bp;
+ register struct vnode *newvp;
+{
+ struct buflists *listheadp;
+ struct vnode *oldvp;
+ int delay;
+ int s;
+
+ if (newvp == NULL) {
+ printf("reassignbuf: NULL");
+ return;
+ }
+
+#if !defined(MAX_PERF)
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+#endif
+
+ s = splbio();
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
+ oldvp = bp->b_vp;
+ if (bp->b_xflags & B_VNDIRTY)
+ listheadp = &oldvp->v_dirtyblkhd;
+ else
+ listheadp = &oldvp->v_cleanblkhd;
+ TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+ bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
+ vdrop(oldvp);
+ }
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ struct buf *tbp;
+
+ listheadp = &newvp->v_dirtyblkhd;
+ if ((newvp->v_flag & VONWORKLST) == 0) {
+ switch (newvp->v_type) {
+ case VDIR:
+ delay = syncdelay / 3;
+ break;
+ case VBLK:
+ if (newvp->v_specmountpoint != NULL) {
+ delay = syncdelay / 2;
+ break;
+ }
+ /* fall through */
+ default:
+ delay = syncdelay;
+ }
+ vn_syncer_add_to_worklist(newvp, delay);
+ }
+ bp->b_xflags |= B_VNDIRTY;
+ tbp = TAILQ_FIRST(listheadp);
+ if (tbp == NULL ||
+ (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) {
+ TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
+ } else {
+ if (bp->b_lblkno >= 0) {
+ struct buf *ttbp;
+ while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
+ (ttbp->b_lblkno < bp->b_lblkno)) {
+ tbp = ttbp;
+ }
+ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
+ } else {
+ TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
+ }
+ }
+ } else {
+ bp->b_xflags |= B_VNCLEAN;
+ TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+ if ((newvp->v_flag & VONWORKLST) &&
+ TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
+ newvp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(newvp, v_synclist);
+ }
+ }
+ bp->b_vp = newvp;
+ vhold(bp->b_vp);
+ splx(s);
+}
+
+/*
+ * Create a vnode for a block device.
+ * Used for mounting the root file system.
+ */
+int
+bdevvp(dev, vpp)
+ dev_t dev;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ struct vnode *nvp;
+ int error;
+
+ /* XXX 255 is for mfs. */
+ if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev ||
+ bdevsw[major(dev)] == NULL))) {
+ *vpp = NULLVP;
+ return (ENXIO);
+ }
+ error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+ if (error) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ vp = nvp;
+ vp->v_type = VBLK;
+ if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) {
+ vput(vp);
+ vp = nvp;
+ }
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Check to see if the new vnode represents a special device
+ * for which we already have a vnode (either because of
+ * bdevvp() or because of a different vnode representing
+ * the same block device). If such an alias exists, deallocate
+ * the existing contents and return the aliased vnode. The
+ * caller is responsible for filling it with its new contents.
+ */
+struct vnode *
+checkalias(nvp, nvp_rdev, mp)
+ register struct vnode *nvp;
+ dev_t nvp_rdev;
+ struct mount *mp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp;
+ struct vnode **vpp;
+
+ if (nvp->v_type != VBLK && nvp->v_type != VCHR)
+ return (NULLVP);
+
+ vpp = &speclisth[SPECHASH(nvp_rdev)];
+loop:
+ simple_lock(&spechash_slock);
+ for (vp = *vpp; vp; vp = vp->v_specnext) {
+ if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ * Only alias active device nodes.
+ * Not sure why we don't re-use this like we do below.
+ */
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ simple_unlock(&spechash_slock);
+ vgonel(vp, p);
+ goto loop;
+ }
+ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
+ /*
+			 * It disappeared, and we may have slept.
+			 * Restart from the beginning.
+ */
+ simple_unlock(&spechash_slock);
+ goto loop;
+ }
+ break;
+ }
+ /*
+ * It would be a lot clearer what is going on here if
+ * this had been expressed as:
+	 *	if ( vp && (vp->v_tag == VT_NON))
+ * and the clauses had been swapped.
+ */
+ if (vp == NULL || vp->v_tag != VT_NON) {
+ /*
+ * Put the new vnode into the hash chain.
+ * and if there was an alias, connect them.
+ */
+ MALLOC(nvp->v_specinfo, struct specinfo *,
+ sizeof(struct specinfo), M_VNODE, M_WAITOK);
+ nvp->v_rdev = nvp_rdev;
+ nvp->v_hashchain = vpp;
+ nvp->v_specnext = *vpp;
+ nvp->v_specmountpoint = NULL;
+ simple_unlock(&spechash_slock);
+ *vpp = nvp;
+ if (vp != NULLVP) {
+ nvp->v_flag |= VALIASED;
+ vp->v_flag |= VALIASED;
+ vput(vp);
+ }
+ return (NULLVP);
+ }
+ /*
+ *	if ( vp && (vp->v_tag == VT_NON))
+ * We have a vnode alias, but it is trashed.
+ * Make it look like it is newly allocated (by getnewvnode()).
+ * The caller should use this instead.
+ */
+ simple_unlock(&spechash_slock);
+ VOP_UNLOCK(vp, 0, p);
+ simple_lock(&vp->v_interlock);
+ vclean(vp, 0, p);
+ vp->v_op = nvp->v_op;
+ vp->v_tag = nvp->v_tag;
+ nvp->v_type = VNON;
+ insmntque(vp, mp);
+ return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it.  If the vnode lock bit is set, the
+ * vnode is being eliminated in vgone.  The process is awakened
+ * when the transition is completed, and an error is returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ */
+int
+vget(vp, flags, p)
+ register struct vnode *vp;
+ int flags;
+ struct proc *p;
+{
+ int error;
+
+ /*
+ * If the vnode is in the process of being cleaned out for
+ * another use, we wait for the cleaning to finish and then
+ * return failure. Cleaning is determined by checking that
+ * the VXLOCK flag is set.
+ */
+ if ((flags & LK_INTERLOCK) == 0) {
+ simple_lock(&vp->v_interlock);
+ }
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vget", 0);
+ return (ENOENT);
+ }
+
+ vp->v_usecount++;
+
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ if (flags & LK_TYPE_MASK) {
+ if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
+ /*
+ * must expand vrele here because we do not want
+ * to call VOP_INACTIVE if the reference count
+ * drops back to zero since it was never really
+ * active. We must remove it from the free list
+ * before sleeping so that multiple processes do
+ * not try to recycle it.
+ */
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ simple_unlock(&vp->v_interlock);
+ }
+ return (error);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
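+
+/*
+ * Usage sketch (illustrative): a caller that found vp on some list
+ * typically takes a reference and an exclusive lock in one step,
+ * operates on the vnode, and then drops both with vput():
+ *
+ *	if (vget(vp, LK_EXCLUSIVE, curproc) == 0) {
+ *		... use the locked, referenced vnode ...
+ *		vput(vp);
+ *	}
+ *
+ * A nonzero return means the vnode was reclaimed while we slept and
+ * must not be touched.
+ */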
+
+void
+vref(struct vnode *vp)
+{
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount++;
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(vp)
+ struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT(vp != NULL, ("vrele: null vp"));
+
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_usecount > 1) {
+
+ vp->v_usecount--;
+ simple_unlock(&vp->v_interlock);
+
+ return;
+ }
+
+ if (vp->v_usecount == 1) {
+
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ /*
+ * If we are doing a vput, the node is already locked, and we must
+ * call VOP_INACTIVE with the node locked. So, in the case of
+ * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
+ VOP_INACTIVE(vp, p);
+ }
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vrele: negative ref count", vp);
+ simple_unlock(&vp->v_interlock);
+#endif
+ panic("vrele: negative ref cnt");
+ }
+}
+
+void
+vput(vp)
+ struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT(vp != NULL, ("vput: null vp"));
+
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_usecount > 1) {
+
+ vp->v_usecount--;
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ return;
+
+ }
+
+ if (vp->v_usecount == 1) {
+
+ vp->v_usecount--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+		/*
+		 * In the case of vput, the vnode is already locked, and we
+		 * must call VOP_INACTIVE with the node locked; VOP_INACTIVE
+		 * will unlock it.  (vrele, by contrast, locks the vnode
+		 * explicitly before calling VOP_INACTIVE.)
+		 */
+ simple_unlock(&vp->v_interlock);
+ VOP_INACTIVE(vp, p);
+
+ } else {
+#ifdef DIAGNOSTIC
+ vprint("vput: negative ref count", vp);
+#endif
+ panic("vput: negative ref cnt");
+ }
+}
+
+/*
+ * Somebody doesn't want the vnode recycled.
+ */
+void
+vhold(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ vp->v_holdcnt++;
+ if (VSHOULDBUSY(vp))
+ vbusy(vp);
+ splx(s);
+}
+
+/*
+ * One less who cares about this vnode.
+ */
+void
+vdrop(vp)
+ register struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ if (vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt");
+ vp->v_holdcnt--;
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+ splx(s);
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If FORCECLOSE is not specified, there should not be any active ones;
+ * return an error if any are found (nb: this is a user error, not a
+ * system error). If FORCECLOSE is specified, detach any active vnodes
+ * that are found.
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
+#endif
+
+int
+vflush(mp, skipvp, flags)
+ struct mount *mp;
+ struct vnode *skipvp;
+ int flags;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp, *nvp;
+ int busy = 0;
+
+ simple_lock(&mntvnode_slock);
+loop:
+ for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+ * Start over if it has (it won't be on the list anymore).
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = vp->v_mntvnodes.le_next;
+ /*
+ * Skip over a selected vnode.
+ */
+ if (vp == skipvp)
+ continue;
+
+ simple_lock(&vp->v_interlock);
+ /*
+		 * Skip over vnodes marked VSYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, only flush out regular file vnodes
+ * open for writing.
+ */
+ if ((flags & WRITECLOSE) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ */
+ if (vp->v_usecount == 0) {
+ simple_unlock(&mntvnode_slock);
+ vgonel(vp, p);
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+
+ /*
+ * If FORCECLOSE is set, forcibly close the vnode. For block
+ * or character devices, revert to an anonymous device. For
+ * all other files, just kill them.
+ */
+ if (flags & FORCECLOSE) {
+ simple_unlock(&mntvnode_slock);
+ if (vp->v_type != VBLK && vp->v_type != VCHR) {
+ vgonel(vp, p);
+ } else {
+ vclean(vp, 0, p);
+ vp->v_op = spec_vnodeop_p;
+ insmntque(vp, (struct mount *) 0);
+ }
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ simple_unlock(&vp->v_interlock);
+ busy++;
+ }
+ simple_unlock(&mntvnode_slock);
+ if (busy)
+ return (EBUSY);
+ return (0);
+}
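+
+/*
+ * Usage sketch (illustrative): a typical unmount path calls
+ *
+ *	error = vflush(mp, rootvp, (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
+ *
+ * where rootvp is the filesystem's root vnode (skipped so it can be
+ * released separately); EBUSY comes back if busy vnodes remain.
+ */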
+
+/*
+ * Disassociate the underlying file system from a vnode.
+ */
+static void
+vclean(vp, flags, p)
+ struct vnode *vp;
+ int flags;
+ struct proc *p;
+{
+ int active;
+ vm_object_t obj;
+
+ /*
+ * Check to see if the vnode is in use. If so we have to reference it
+ * before we clean it out so that its count cannot fall to zero and
+ * generate a race against ourselves to recycle it.
+ */
+ if ((active = vp->v_usecount))
+ vp->v_usecount++;
+
+ /*
+ * Prevent the vnode from being recycled or brought into use while we
+ * clean it out.
+ */
+ if (vp->v_flag & VXLOCK)
+ panic("vclean: deadlock");
+ vp->v_flag |= VXLOCK;
+ /*
+ * Even if the count is zero, the VOP_INACTIVE routine may still
+ * have the object locked while it cleans it out. The VOP_LOCK
+ * ensures that the VOP_INACTIVE routine is done with its work.
+ * For active vnodes, it ensures that no other activity can
+ * occur while the underlying object is being cleaned out.
+ */
+ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ */
+ vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
+	if ((obj = vp->v_object) != NULL) {
+ if (obj->ref_count == 0) {
+ /*
+ * This is a normal way of shutting down the object/vnode
+ * association.
+ */
+ vm_object_terminate(obj);
+ } else {
+ /*
+ * Woe to the process that tries to page now :-).
+ */
+ vm_pager_deallocate(obj);
+ }
+ }
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed. Note that the
+ * VOP_INACTIVE will unlock the vnode.
+ */
+ if (active) {
+ if (flags & DOCLOSE)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
+ VOP_INACTIVE(vp, p);
+ } else {
+ /*
+ * Any other processes trying to obtain this lock must first
+ * wait for VXLOCK to clear, then call the new lock operation.
+ */
+ VOP_UNLOCK(vp, 0, p);
+ }
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, p))
+ panic("vclean: cannot reclaim");
+
+ if (active)
+ vrele(vp);
+
+ cache_purge(vp);
+ if (vp->v_vnlock) {
+#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
+#ifdef DIAGNOSTIC
+ if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
+ vprint("vclean: lock not drained", vp);
+#endif
+#endif
+ FREE(vp->v_vnlock, M_VNODE);
+ vp->v_vnlock = NULL;
+ }
+
+ if (VSHOULDFREE(vp))
+ vfree(vp);
+
+ /*
+ * Done with purge, notify sleepers of the grim news.
+ */
+ vp->v_op = dead_vnodeop_p;
+ vn_pollgone(vp);
+ vp->v_tag = VT_NON;
+ vp->v_flag &= ~VXLOCK;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup((caddr_t) vp);
+ }
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+int
+vop_revoke(ap)
+ struct vop_revoke_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp, *vq;
+ struct proc *p = curproc; /* XXX */
+
+ KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
+
+ vp = ap->a_vp;
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_flag & VALIASED) {
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
+ return (0);
+ }
+ /*
+ * Ensure that vp will not be vgone'd while we
+ * are eliminating its aliases.
+ */
+ vp->v_flag |= VXLOCK;
+ simple_unlock(&vp->v_interlock);
+ while (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type || vp == vq)
+ continue;
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ break;
+ }
+ if (vq == NULLVP) {
+ simple_unlock(&spechash_slock);
+ }
+ }
+ /*
+ * Remove the lock so that vgone below will
+ * really eliminate the vnode after which time
+ * vgone will awaken any sleepers.
+ */
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VXLOCK;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup(vp);
+ }
+ }
+ vgonel(vp, p);
+ return (0);
+}
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ * Release the passed interlock if the vnode will be recycled.
+ */
+int
+vrecycle(vp, inter_lkp, p)
+ struct vnode *vp;
+ struct simplelock *inter_lkp;
+ struct proc *p;
+{
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ if (inter_lkp) {
+ simple_unlock(inter_lkp);
+ }
+ vgonel(vp, p);
+ return (1);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(vp)
+ register struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ simple_lock(&vp->v_interlock);
+ vgonel(vp, p);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+static void
+vgonel(vp, p)
+ struct vnode *vp;
+ struct proc *p;
+{
+ int s;
+ struct vnode *vq;
+ struct vnode *vx;
+
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vgone", 0);
+ return;
+ }
+
+ /*
+ * Clean out the filesystem specific data.
+ */
+ vclean(vp, DOCLOSE, p);
+ simple_lock(&vp->v_interlock);
+
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ insmntque(vp, (struct mount *)0);
+ /*
+ * If special device, remove it from special device alias list
+ * if it is on one.
+ */
+ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
+ simple_lock(&spechash_slock);
+ if (*vp->v_hashchain == vp) {
+ *vp->v_hashchain = vp->v_specnext;
+ } else {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_specnext != vp)
+ continue;
+ vq->v_specnext = vp->v_specnext;
+ break;
+ }
+ if (vq == NULL)
+ panic("missing bdev");
+ }
+ if (vp->v_flag & VALIASED) {
+ vx = NULL;
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vx)
+ break;
+ vx = vq;
+ }
+ if (vx == NULL)
+ panic("missing alias");
+ if (vq == NULL)
+ vx->v_flag &= ~VALIASED;
+ vp->v_flag &= ~VALIASED;
+ }
+ simple_unlock(&spechash_slock);
+ FREE(vp->v_specinfo, M_VNODE);
+ vp->v_specinfo = NULL;
+ }
+
+ /*
+ * If it is on the freelist and not already at the head,
+ * move it to the head of the list. The test of the back
+ * pointer and the reference count of zero is because
+ * it will be removed from the free list by getnewvnode,
+ * but will not have its reference count incremented until
+ * after calling vgone. If the reference count were
+ * incremented first, vgone would (incorrectly) try to
+ * close the previous instance of the underlying object.
+ */
+ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VFREE) {
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ } else if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ freevnodes++;
+ } else
+ freevnodes++;
+ vp->v_flag |= VFREE;
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ simple_unlock(&vnode_free_list_slock);
+ splx(s);
+ }
+
+ vp->v_type = VBAD;
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Lookup a vnode by device number.
+ */
+int
+vfinddev(dev, type, vpp)
+ dev_t dev;
+ enum vtype type;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ int rc = 0;
+
+ simple_lock(&spechash_slock);
+ for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
+ if (dev != vp->v_rdev || type != vp->v_type)
+ continue;
+ *vpp = vp;
+ rc = 1;
+ break;
+ }
+ simple_unlock(&spechash_slock);
+ return (rc);
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(vp)
+ register struct vnode *vp;
+{
+ struct vnode *vq, *vnext;
+ int count;
+
+loop:
+ if ((vp->v_flag & VALIASED) == 0)
+ return (vp->v_usecount);
+ simple_lock(&spechash_slock);
+ for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
+ vnext = vq->v_specnext;
+ if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ */
+ if (vq->v_usecount == 0 && vq != vp) {
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ goto loop;
+ }
+ count += vq->v_usecount;
+ }
+ simple_unlock(&spechash_slock);
+ return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
+
+void
+vprint(label, vp)
+ char *label;
+ register struct vnode *vp;
+{
+ char buf[96];
+
+ if (label != NULL)
+ printf("%s: %p: ", label, (void *)vp);
+ else
+ printf("%p: ", (void *)vp);
+ printf("type %s, usecount %d, writecount %d, refcount %d,",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+ vp->v_holdcnt);
+ buf[0] = '\0';
+ if (vp->v_flag & VROOT)
+ strcat(buf, "|VROOT");
+ if (vp->v_flag & VTEXT)
+ strcat(buf, "|VTEXT");
+ if (vp->v_flag & VSYSTEM)
+ strcat(buf, "|VSYSTEM");
+ if (vp->v_flag & VXLOCK)
+ strcat(buf, "|VXLOCK");
+ if (vp->v_flag & VXWANT)
+ strcat(buf, "|VXWANT");
+ if (vp->v_flag & VBWAIT)
+ strcat(buf, "|VBWAIT");
+ if (vp->v_flag & VALIASED)
+ strcat(buf, "|VALIASED");
+ if (vp->v_flag & VDOOMED)
+ strcat(buf, "|VDOOMED");
+ if (vp->v_flag & VFREE)
+ strcat(buf, "|VFREE");
+ if (vp->v_flag & VOBJBUF)
+ strcat(buf, "|VOBJBUF");
+ if (buf[0] != '\0')
+ printf(" flags (%s)", &buf[1]);
+ if (vp->v_data == NULL) {
+ printf("\n");
+ } else {
+ printf("\n\t");
+ VOP_PRINT(vp);
+ }
+}
+
+#ifdef DDB
+#include <ddb/ddb.h>
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *vp;
+
+ printf("Locked vnodes\n");
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = vp->v_mntvnodes.le_next) {
+ if (VOP_ISLOCKED(vp))
+ vprint((char *)0, vp);
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+#ifdef notyet
+ /* all sysctl names at this level are at least name and field */
+ if (namelen < 2)
+ return (ENOTDIR); /* overloaded */
+ if (name[0] != VFS_GENERIC) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[0])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
+ oldp, oldlenp, newp, newlen, p));
+ }
+#endif
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
+ "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return error;
+ }
+ return 0;
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+
+#if 0
+#define KINFO_VNODESLOP 10
+/*
+ * Dump vnode list (via sysctl).
+ * Copyout address of vnode followed by vnode.
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode SYSCTL_HANDLER_ARGS
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *nvp, *vp;
+ int error;
+
+#define VPTRSZ sizeof (struct vnode *)
+#define VNODESZ sizeof (struct vnode)
+
+ req->lock = 0;
+ if (!req->oldptr) /* Make an estimate */
+ return (SYSCTL_OUT(req, 0,
+ (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+again:
+ simple_lock(&mntvnode_slock);
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = nvp) {
+ /*
+ * Check that the vp is still associated with
+ * this filesystem. RACE: could have been
+ * recycled onto the same filesystem.
+ */
+ if (vp->v_mount != mp) {
+ simple_unlock(&mntvnode_slock);
+ goto again;
+ }
+ nvp = vp->v_mntvnodes.le_next;
+ simple_unlock(&mntvnode_slock);
+ if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
+ (error = SYSCTL_OUT(req, vp, VNODESZ)))
+ return (error);
+ simple_lock(&mntvnode_slock);
+ }
+ simple_unlock(&mntvnode_slock);
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+
+ return (0);
+}
+#endif
+
+/*
+ * XXX
+ * Exporting the vnode list on large systems causes them to crash.
+ * Exporting the vnode list on medium systems causes sysctl to coredump.
+ */
+#if 0
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,vnode", "");
+#endif
+
+/*
+ * Check to see if a filesystem is mounted on a block device.
+ */
+int
+vfs_mountedon(vp)
+ struct vnode *vp;
+{
+ struct vnode *vq;
+ int error = 0;
+
+ if (vp->v_specmountpoint != NULL)
+ return (EBUSY);
+ if (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vq->v_specmountpoint != NULL) {
+ error = EBUSY;
+ break;
+ }
+ }
+ simple_unlock(&spechash_slock);
+ }
+ return (error);
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall()
+{
+ struct mount *mp, *nmp;
+ struct proc *p;
+ int error;
+
+ if (curproc != NULL)
+ p = curproc;
+ else
+ p = initproc; /* XXX XXX should this be proc0? */
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
+ nmp = mp->mnt_list.cqe_prev;
+ error = dounmount(mp, MNT_FORCE, p);
+ if (error) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+ }
+}
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by ufs_mount() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED)
+ return (EPERM);
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ return (0);
+ }
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
+ bzero((caddr_t) np, i);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ /*
+		 * Seems silly to initialize every AF when most are not used,
+		 * so do so on demand here.
+ */
+ for (dom = domains; dom; dom = dom->dom_next)
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ dom->dom_rtattach((void **) &nep->ne_rtable[i],
+ dom->dom_rtoffset);
+ break;
+ }
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
+ np->netc_rnodes);
+ if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
+ error = EPERM;
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
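+
+/*
+ * Layout sketch (illustrative): each entry is a single allocation
+ * with the address and mask appended, i.e.
+ *
+ *	[ struct netcred | sockaddr (addr) | sockaddr (mask) ]
+ *
+ * so the saddr/smask pointers above simply index past the structure.
+ */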
+
+/* ARGSUSED */
+static int
+vfs_free_netcred(rn, w)
+ struct radix_node *rn;
+ void *w;
+{
+ register struct radix_node_head *rnh = (struct radix_node_head *) w;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ free((caddr_t) rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(nep)
+ struct netexport *nep;
+{
+ register int i;
+ register struct radix_node_head *rnh;
+
+ for (i = 0; i <= AF_MAX; i++)
+ if ((rnh = nep->ne_rtable[i])) {
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
+ (caddr_t) rnh);
+ free((caddr_t) rnh, M_RTABLE);
+ nep->ne_rtable[i] = 0;
+ }
+}
+
+int
+vfs_export(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ return (error);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ return (error);
+ mp->mnt_flag |= MNT_EXPORTED;
+ }
+ return (0);
+}
+
+
+/*
+ * Set the publicly exported filesystem (WebNFS).  Currently the spec
+ * (RFC 2054 and RFC 2055) allows only one public filesystem.
+ */
+int
+vfs_setpublicfs(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, &rvp)))
+ return (error);
+
+ if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+ MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ FREE(nfs_pub.np_index, M_TEMP);
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+struct netcred *
+vfs_export_lookup(mp, nep, nam)
+ register struct mount *mp;
+ struct netexport *nep;
+ struct sockaddr *nam;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)((caddr_t)saddr,
+ rnh);
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.  The mount point
+ * must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *nvp;
+ struct vm_object *obj;
+ int anyio, tries;
+
+ tries = 5;
+loop:
+ anyio = 0;
+ for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
+
+ nvp = vp->v_mntvnodes.le_next;
+
+ if (vp->v_mount != mp) {
+ goto loop;
+ }
+
+ if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
+ continue;
+
+ if (flags != MNT_WAIT) {
+ obj = vp->v_object;
+ if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
+ continue;
+ if (VOP_ISLOCKED(vp))
+ continue;
+ }
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_object &&
+ (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
+ if (!vget(vp,
+ LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
+ if (vp->v_object) {
+ vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
+ anyio = 1;
+ }
+ vput(vp);
+ }
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+ }
+ if (anyio && (--tries > 0))
+ goto loop;
+}
+
+/*
+ * Create the VM object needed for VMIO and mmap support. This
+ * is done for all VREG files in the system. Some filesystems can
+ * take advantage of the additional metadata buffering capability of
+ * the VMIO code by putting the device node into VMIO mode as well.
+ *
+ * vp must be locked when vfs_object_create is called.
+ */
+int
+vfs_object_create(vp, p, cred)
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+{
+ struct vattr vat;
+ vm_object_t object;
+ int error = 0;
+
+ if ((vp->v_type != VREG) && (vp->v_type != VBLK))
+ return 0;
+
+retry:
+ if ((object = vp->v_object) == NULL) {
+ if (vp->v_type == VREG) {
+ if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
+ goto retn;
+ object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
+ } else if (major(vp->v_rdev) < nblkdev &&
+ bdevsw[major(vp->v_rdev)] != NULL) {
+ /*
+ * This simply allocates the biggest object possible
+ * for a VBLK vnode. This should be fixed, but doesn't
+ * cause any problems (yet).
+ */
+ object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
+ }
+		if (object == NULL)
+			goto retn;	/* no object could be allocated */
+		object->ref_count--;
+		vp->v_usecount--;
+ } else {
+ if (object->flags & OBJ_DEAD) {
+ VOP_UNLOCK(vp, 0, p);
+ tsleep(object, PVM, "vodead", 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ goto retry;
+ }
+ }
+
+ if (vp->v_object)
+ vp->v_flag |= VOBJBUF;
+
+retn:
+ return error;
+}
+
+static void
+vfree(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ }
+ if (vp->v_flag & VAGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ freevnodes++;
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_flag &= ~VAGE;
+ vp->v_flag |= VFREE;
+ splx(s);
+}
+
+void
+vbusy(vp)
+ struct vnode *vp;
+{
+ int s;
+
+ s = splbio();
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VTBFREE) {
+ TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
+ vp->v_flag &= ~VTBFREE;
+ } else {
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ freevnodes--;
+ }
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_flag &= ~(VFREE|VAGE);
+ splx(s);
+}
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+ struct vnode *vp;
+ struct proc *p;
+ short events;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+		 * in available for the other process which
+		 * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo.vpi_revents;
+ vp->v_pollinfo.vpi_revents &= ~events;
+
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+ return events;
+ }
+ vp->v_pollinfo.vpi_events |= events;
+ selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+ return 0;
+}
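+
+/*
+ * Usage sketch (illustrative): a filesystem's poll routine can simply
+ * forward to this, e.g.
+ *
+ *	return (vn_pollrecord(vp, ap->a_p, ap->a_events));
+ *
+ * getting back any events already pending, or 0 after the process's
+ * interest has been recorded for a later vn_pollevent() wakeup.
+ */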
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+ struct vnode *vp;
+ short events;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_events & events) {
+ /*
+ * We clear vpi_events so that we don't
+ * call selwakeup() twice if two events are
+ * posted before the polling process(es) is
+ * awakened. This also ensures that we take at
+ * most one selwakeup() if the polling process
+ * is no longer interested. However, it does
+ * mean that only one event can be noticed at
+ * a time. (Perhaps we should only clear those
+ * event bits which we note?) XXX
+ */
+ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
+ vp->v_pollinfo.vpi_revents |= events;
+ selwakeup(&vp->v_pollinfo.vpi_selinfo);
+ }
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+ struct vnode *vp;
+{
+ simple_lock(&vp->v_pollinfo.vpi_lock);
+ if (vp->v_pollinfo.vpi_events) {
+ vp->v_pollinfo.vpi_events = 0;
+ selwakeup(&vp->v_pollinfo.vpi_selinfo);
+ }
+ simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
+static int sync_fsync __P((struct vop_fsync_args *));
+static int sync_inactive __P((struct vop_inactive_args *));
+static int sync_reclaim __P((struct vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
+static int sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+static vop_t **sync_vnodeop_p;
+static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+ { &vop_default_desc, (vop_t *) vop_eopnotsupp },
+ { &vop_close_desc, (vop_t *) sync_close }, /* close */
+ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
+ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
+ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
+ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
+ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
+ { &vop_print_desc, (vop_t *) sync_print }, /* print */
+ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
+ { NULL, NULL }
+};
+static struct vnodeopv_desc sync_vnodeop_opv_desc =
+ { &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+ struct mount *mp;
+{
+ struct vnode *vp;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+ mp->mnt_syncer = NULL;
+ return (error);
+ }
+ vp->v_type = VNON;
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+ mp->mnt_syncer = vp;
+ return (0);
+}
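+
+/*
+ * Worked example (illustrative): with syncer_maxdelay = 32, successive
+ * calls hand out next = 16, 8, 24, 4, 12, 20, 28, 2, 6, ... -- each
+ * pass subdividing the wheel more finely, so syncer vnodes fire at
+ * well-spread offsets even when many filesystems are mounted at once.
+ */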
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ struct ucred *a_cred;
+ int a_waitfor;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ struct proc *p = ap->a_p;
+ int asyncflag;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ simple_lock(&mountlist_slock);
+ if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
+ simple_unlock(&mountlist_slock);
+ return (0);
+ }
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+ if (asyncflag)
+ mp->mnt_flag |= MNT_ASYNC;
+ vfs_unbusy(mp, p);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct proc *a_p;
+ } */ *ap;
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ */
+static int
+sync_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ vp->v_mount->mnt_syncer = NULL;
+ if (vp->v_flag & VONWORKLST) {
+ LIST_REMOVE(vp, v_synclist);
+ vp->v_flag &= ~VONWORKLST;
+ }
+
+ return (0);
+}
+
+/*
+ * Print out a syncer vnode.
+ */
+static int
+sync_print(ap)
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ printf("syncer vnode");
+ if (vp->v_vnlock != NULL)
+ lockmgr_printinfo(vp->v_vnlock);
+ printf("\n");
+ return (0);
+}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
new file mode 100644
index 0000000..18e39d6
--- /dev/null
+++ b/sys/kern/vfs_syscalls.c
@@ -0,0 +1,3034 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $
+ */
+
+/* For 4.3 integer FS ID compatibility */
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/linker.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+
+#include <miscfs/union/union.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
+#include <sys/sysctl.h>
+
+static int change_dir __P((struct nameidata *ndp, struct proc *p));
+static void checkdirs __P((struct vnode *olddp));
+static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t));
+static int setfmode __P((struct proc *, struct vnode *, int));
+static int setfflags __P((struct proc *, struct vnode *, int));
+static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int));
+static int usermount = 0; /* if 1, non-root can mount fs. */
+
+int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *));
+
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
+
+/*
+ * Virtual File System System Calls
+ */
+
+/*
+ * Mount a file system.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+mount(p, uap)
+ struct proc *p;
+ register struct mount_args /* {
+ syscallarg(char *) type;
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(caddr_t) data;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ int error, flag = 0, flag2 = 0;
+ struct vattr va;
+ u_long fstypenum;
+ struct nameidata nd;
+ char fstypename[MFSNAMELEN];
+
+ if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (SCARG(uap, flags) & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ flag2 = mp->mnt_kern_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((SCARG(uap, flags) & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ mp->mnt_flag |=
+ SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, p)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) ||
+ (va.va_uid != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+ if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0))
+ return (error);
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ return (ENOTDIR);
+ }
+#ifdef COMPAT_43
+ /*
+ * Historically filesystem types were identified by number. If we
+ * get an integer for the filesystem type instead of a string, we
+ * check to see if it matches one of the historic filesystem types.
+ */
+ fstypenum = (uintptr_t)SCARG(uap, type);
+ if (fstypenum < maxvfsconf) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == fstypenum)
+ break;
+ if (vfsp == NULL) {
+ vput(vp);
+ return (ENODEV);
+ }
+ strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN);
+ } else
+#endif /* COMPAT_43 */
+ if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) {
+ vput(vp);
+ return (error);
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL) {
+ linker_file_t lf;
+
+ /* Refuse to load modules if securelevel raised */
+ if (securelevel > 0) {
+ vput(vp);
+ return EPERM;
+ }
+ /* Only load modules for root (very important!) */
+ if (error = suser(p->p_ucred, &p->p_acflag)) {
+ vput(vp);
+ return error;
+ }
+ error = linker_load_file(fstypename, &lf);
+ if (error || lf == NULL) {
+ vput(vp);
+ if (lf == NULL)
+ error = ENODEV;
+ return error;
+ }
+ lf->userrefs++;
+ /* lookup again, see if the VFS was loaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL) {
+ lf->userrefs--;
+ linker_file_unload(lf);
+ vput(vp);
+ return (ENODEV);
+ }
+ }
+ simple_lock(&vp->v_interlock);
+ if ((vp->v_flag & VMOUNT) != 0 ||
+ vp->v_mountedhere != NULL) {
+ simple_unlock(&vp->v_interlock);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_flag |= VMOUNT;
+ simple_unlock(&vp->v_interlock);
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = (struct mount *)malloc((u_long)sizeof(struct mount),
+ M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = p->p_ucred->cr_uid;
+ VOP_UNLOCK(vp, 0, p);
+update:
+ /*
+ * Set the mount level flags.
+ */
+ if (SCARG(uap, flags) & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_kern_flag |= MNTK_WANTRDWR;
+ mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
+ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
+ MNT_NOSYMFOLLOW |
+ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
+ mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC |
+ MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
+ MNT_NOSYMFOLLOW |
+ MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
+ /*
+ * Mount the filesystem.
+ */
+ error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ vrele(vp);
+ if (mp->mnt_kern_flag & MNTK_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
+ mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
+ if (error) {
+ mp->mnt_flag = flag;
+ mp->mnt_kern_flag = flag2;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ if (mp->mnt_syncer == NULL)
+ error = vfs_allocate_syncvnode(mp);
+ } else {
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ mp->mnt_syncer = NULL;
+ }
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ vp->v_mountedhere = mp;
+ simple_unlock(&vp->v_interlock);
+ simple_lock(&mountlist_slock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ simple_unlock(&mountlist_slock);
+ checkdirs(vp);
+ VOP_UNLOCK(vp, 0, p);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ error = vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp, p);
+ if (error = VFS_START(mp, 0, p))
+ vrele(vp);
+ } else {
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VMOUNT;
+ simple_unlock(&vp->v_interlock);
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, p);
+ free((caddr_t)mp, M_MOUNT);
+ vput(vp);
+ }
+ return (error);
+}
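+
+/*
+ * Example (hypothetical userland sketch, not part of the kernel
+ * interface defined here):
+ *
+ *	mount("cd9660", "/cdrom", MNT_RDONLY, data);
+ *
+ * takes the fresh-mount path above, while the same call with
+ * MNT_UPDATE set in its flags re-enters at the "update:" label
+ * against the already existing struct mount.
+ */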
+
+/*
+ * Scan all active processes to see if any of them have a current
+ * or root directory onto which the new filesystem has just been
+ * mounted. If so, replace them with the new mount point.
+ */
+static void
+checkdirs(olddp)
+ struct vnode *olddp;
+{
+ struct filedesc *fdp;
+ struct vnode *newdp;
+ struct proc *p;
+
+ if (olddp->v_usecount == 1)
+ return;
+ if (VFS_ROOT(olddp->v_mountedhere, &newdp))
+ panic("mount: lost mount");
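+	/*
+	 * VFS_ROOT returns newdp locked and referenced; each process
+	 * switched over below takes its own reference with VREF, and
+	 * the final vput drops the lock and the initial reference.
+	 */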
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ fdp = p->p_fd;
+ if (fdp->fd_cdir == olddp) {
+ vrele(fdp->fd_cdir);
+ VREF(newdp);
+ fdp->fd_cdir = newdp;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrele(fdp->fd_rdir);
+ VREF(newdp);
+ fdp->fd_rdir = newdp;
+ }
+ }
+ if (rootvnode == olddp) {
+ vrele(rootvnode);
+ VREF(newdp);
+ rootvnode = newdp;
+ }
+ vput(newdp);
+}
+
+/*
+ * Unmount a file system.
+ *
+ * Note: unmount takes a path to the vnode mounted on as its argument,
+ * not the special device file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+unmount(p, uap)
+ struct proc *p;
+ register struct unmount_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ mp = vp->v_mount;
+
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to unmount this filesystem.
+ */
+ if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+
+ /*
+ * Don't allow unmounting the root file system.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vput(vp);
+ return (EINVAL);
+ }
+
+ /*
+ * Must be the root of the filesystem
+ */
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ vput(vp);
+ return (dounmount(mp, SCARG(uap, flags), p));
+}
+
+/*
+ * Do the actual file system unmount.
+ */
+int
+dounmount(mp, flags, p)
+ register struct mount *mp;
+ int flags;
+ struct proc *p;
+{
+ struct vnode *coveredvp;
+ int error;
+ int async_flag;
+
+ simple_lock(&mountlist_slock);
+ mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &=~ MNT_ASYNC;
+ cache_purgevfs(mp); /* remove cache entries for this file sys */
+ if (mp->mnt_syncer != NULL)
+ vrele(mp->mnt_syncer);
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
+ (flags & MNT_FORCE))
+ error = VFS_UNMOUNT(mp, flags, p);
+ simple_lock(&mountlist_slock);
+ if (error) {
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+ (void) vfs_allocate_syncvnode(mp);
+ mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
+ mp->mnt_flag |= async_flag;
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
+ &mountlist_slock, p);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup((caddr_t)mp);
+ return (error);
+ }
+ CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
+ coveredvp->v_mountedhere = (struct mount *)0;
+ vrele(coveredvp);
+ }
+ mp->mnt_vfc->vfc_refcount--;
+ if (mp->mnt_vnodelist.lh_first != NULL)
+ panic("unmount: dangling vnode");
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p);
+ if (mp->mnt_kern_flag & MNTK_MWAIT)
+ wakeup((caddr_t)mp);
+ free((caddr_t)mp, M_MOUNT);
+ return (0);
+}
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/* ARGSUSED */
+int
+sync(p, uap)
+ struct proc *p;
+ struct sync_args *uap;
+{
+ register struct mount *mp, *nmp;
+ int asyncflag;
+
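+	/*
+	 * The mount list may change while a filesystem is synced, so
+	 * the next pointer is always re-read under mountlist_slock
+	 * before the current mount point is released.
+	 */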
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT,
+ ((p != NULL) ? p->p_ucred : NOCRED), p);
+ mp->mnt_flag |= asyncflag;
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
+#ifdef DIAGNOSTIC
+ if (syncprt)
+ vfs_bufstats();
+#endif /* DIAGNOSTIC */
+#endif
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+/* ARGSUSED */
+int
+quotactl(p, uap)
+ struct proc *p;
+ register struct quotactl_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) cmd;
+ syscallarg(int) uid;
+ syscallarg(caddr_t) arg;
+ } */ *uap;
+{
+ register struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vrele(nd.ni_vp);
+ return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
+ SCARG(uap, arg), p));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+statfs(p, uap)
+ struct proc *p;
+ register struct statfs_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ register struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct nameidata nd;
+ struct statfs sb;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ sp = &mp->mnt_stat;
+ vrele(nd.ni_vp);
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+fstatfs(p, uap)
+ struct proc *p;
+ register struct fstatfs_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+{
+ struct file *fp;
+ struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct statfs sb;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ mp = ((struct vnode *)fp->f_data)->v_mount;
+ sp = &mp->mnt_stat;
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+getfsstat(p, uap)
+ struct proc *p;
+ register struct getfsstat_args /* {
+ syscallarg(struct statfs *) buf;
+ syscallarg(long) bufsize;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct mount *mp, *nmp;
+ register struct statfs *sp;
+ caddr_t sfsp;
+ long count, maxcount, error;
+
+ maxcount = SCARG(uap, bufsize) / sizeof(struct statfs);
+ sfsp = (caddr_t)SCARG(uap, buf);
+ count = 0;
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+			/*
+			 * If MNT_NOWAIT or MNT_LAZY is specified and
+			 * MNT_WAIT is not, do not refresh the fsstat
+			 * cache; MNT_WAIT overrides MNT_NOWAIT and
+			 * MNT_LAZY.
+			 */
+ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (SCARG(uap, flags) & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp, p))) {
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ continue;
+ }
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = copyout((caddr_t)sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ sfsp += sizeof(*sp);
+ }
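+		/*
+		 * Keep counting after the user buffer is exhausted so
+		 * that a NULL buf reports the total number of mounted
+		 * filesystems.
+		 */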
+ count++;
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+ if (sfsp && count > maxcount)
+ p->p_retval[0] = maxcount;
+ else
+ p->p_retval[0] = count;
+ return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fchdir(p, uap)
+ struct proc *p;
+ struct fchdir_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct vnode *vp, *tdp;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(fdp, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VREF(vp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
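+	/*
+	 * If a filesystem is mounted on the directory, chase down to
+	 * the root of the topmost mount so the new working directory
+	 * ends up inside it.
+	 */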
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0, 0, p))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, p);
+ if (error)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = vp;
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chdir(p, uap)
+ struct proc *p;
+ struct chdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = nd.ni_vp;
+ return (0);
+}
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chroot(p, uap)
+ struct proc *p;
+ struct chroot_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ vrele(fdp->fd_rdir);
+ fdp->fd_rdir = nd.ni_vp;
+ return (0);
+}
+
+/*
+ * Common routine for chroot and chdir.
+ */
+static int
+change_dir(ndp, p)
+ register struct nameidata *ndp;
+ struct proc *p;
+{
+ struct vnode *vp;
+ int error;
+
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ vput(vp);
+ else
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+open(p, uap)
+ struct proc *p;
+ register struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ register struct vnode *vp;
+ int cmode, flags, oflags;
+ struct file *nfp;
+ int type, indx, error;
+ struct flock lf;
+ struct nameidata nd;
+
+ oflags = SCARG(uap, flags);
+ if ((oflags & O_ACCMODE) == O_ACCMODE)
+ return (EINVAL);
+ flags = FFLAGS(oflags);
+ error = falloc(p, &nfp, &indx);
+ if (error)
+ return (error);
+ fp = nfp;
+ cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ p->p_dupfd = -indx - 1; /* XXX check for fdopen */
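+	/*
+	 * The fdopen() protocol: if vn_open fails with ENODEV or ENXIO
+	 * and the device open routine stored a descriptor number in
+	 * p_dupfd, dupfdopen() below completes the open by duplicating
+	 * that descriptor into the new slot.
+	 */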
+ error = vn_open(&nd, flags, cmode);
+ if (error) {
+ ffree(fp);
+ if ((error == ENODEV || error == ENXIO) &&
+ p->p_dupfd >= 0 && /* XXX from fdopen */
+ (error =
+ dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) {
+ p->p_retval[0] = indx;
+ return (0);
+ }
+ if (error == ERESTART)
+ error = EINTR;
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ p->p_dupfd = 0;
+ vp = nd.ni_vp;
+
+ fp->f_flag = flags & FMASK;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ fp->f_ops = &vnops;
+ fp->f_data = (caddr_t)vp;
+ if (flags & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (flags & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((flags & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ VOP_UNLOCK(vp, 0, p);
+ if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) {
+ (void) vn_close(vp, fp->f_flag, fp->f_cred, p);
+ ffree(fp);
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ fp->f_flag |= FHASLOCK;
+ }
+ if ((vp->v_type == VREG) && (vp->v_object == NULL))
+ vfs_object_create(vp, p, p->p_ucred);
+ VOP_UNLOCK(vp, 0, p);
+ p->p_retval[0] = indx;
+ return (0);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(p, uap)
+ struct proc *p;
+ register struct ocreat_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, mode) = SCARG(uap, mode);
+ SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC;
+ return (open(p, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+/* ARGSUSED */
+int
+mknod(p, uap)
+ struct proc *p;
+ register struct mknod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ syscallarg(int) dev;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ int whiteout = 0;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL)
+ error = EEXIST;
+ else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ vattr.va_rdev = SCARG(uap, dev);
+ whiteout = 0;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (whiteout) {
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ if (error)
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ } else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ }
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp)
+ vrele(vp);
+ }
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod");
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkfifo(p, uap)
+ struct proc *p;
+ register struct mkfifo_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+link(p, uap)
+ struct proc *p;
+ register struct link_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p);
+ error = namei(&nd);
+ if (!error) {
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred,
+ LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ }
+ vrele(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "link");
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+symlink(p, uap)
+ struct proc *p;
+ register struct symlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+{
+ struct vattr vattr;
+ char *path;
+ int error;
+ struct nameidata nd;
+
+ path = zalloc(namei_zone);
+ if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL))
+ goto out;
+ NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p);
+ if (error = namei(&nd))
+ goto out;
+ if (nd.ni_vp) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+ vput(nd.ni_dvp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+out:
+ zfree(namei_zone, path);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+/* ARGSUSED */
+int
+undelete(p, uap)
+ struct proc *p;
+ register struct undelete_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ error = namei(&nd);
+ if (error)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE))
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete");
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+unlink(p, uap)
+ struct proc *p;
+ struct unlink_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+ }
+
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULLVP)
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink");
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+lseek(p, uap)
+ struct proc *p;
+ register struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct ucred *cred = p->p_ucred;
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct vattr vattr;
+ int error;
+
+ if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE)
+ return (ESPIPE);
+ switch (SCARG(uap, whence)) {
+ case L_INCR:
+ fp->f_offset += SCARG(uap, offset);
+ break;
+ case L_XTND:
+ error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p);
+ if (error)
+ return (error);
+ fp->f_offset = SCARG(uap, offset) + vattr.va_size;
+ break;
+ case L_SET:
+ fp->f_offset = SCARG(uap, offset);
+ break;
+ default:
+ return (EINVAL);
+ }
+ *(off_t *)(p->p_retval) = fp->f_offset;
+ return (0);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(p, uap)
+ struct proc *p;
+ register struct olseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ nuap;
+ int error;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, offset) = SCARG(uap, offset);
+ SCARG(&nuap, whence) = SCARG(uap, whence);
+ error = lseek(p, &nuap);
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+access(p, uap)
+ struct proc *p;
+ register struct access_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ register struct ucred *cred = p->p_ucred;
+ register struct vnode *vp;
+ int error, flags, t_gid, t_uid;
+ struct nameidata nd;
+
+ t_uid = cred->cr_uid;
+ t_gid = cred->cr_groups[0];
+ cred->cr_uid = p->p_cred->p_ruid;
+ cred->cr_groups[0] = p->p_cred->p_rgid;
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ goto out1;
+ vp = nd.ni_vp;
+
+ /* Flags == 0 means only check for existence. */
+ if (SCARG(uap, flags)) {
+ flags = 0;
+ if (SCARG(uap, flags) & R_OK)
+ flags |= VREAD;
+ if (SCARG(uap, flags) & W_OK)
+ flags |= VWRITE;
+ if (SCARG(uap, flags) & X_OK)
+ flags |= VEXEC;
+ if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, flags, cred, p);
+ }
+ vput(vp);
+out1:
+ cred->cr_uid = t_uid;
+ cred->cr_groups[0] = t_gid;
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+ostat(p, uap)
+ struct proc *p;
+ register struct ostat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+olstat(p, uap)
+ struct proc *p;
+ register struct olstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
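+	/*
+	 * Sizes that do not fit in the old 32 bit st_size field are
+	 * reported with the sentinel value -2.
+	 */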
+ ost->st_atime = st->st_atime;
+ ost->st_mtime = st->st_mtime;
+ ost->st_ctime = st->st_ctime;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(p, uap)
+ struct proc *p;
+ register struct stat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(p, uap)
+ struct proc *p;
+ register struct lstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atimespec = sb->st_atimespec;
+ nsb->st_mtimespec = sb->st_mtimespec;
+ nsb->st_ctimespec = sb->st_ctimespec;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(p, uap)
+ struct proc *p;
+ register struct nstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get file status using the new stat structure; this version does
+ * not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char *path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(p, uap)
+ struct proc *p;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(p, uap)
+ struct proc *p;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(p, uap)
+ struct proc *p;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, p->p_ucred);
+ }
+ vput(vp);
+	if (error == 0)
+		p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+
+static int
+setfflags(p, vp, flags)
+ struct proc *p;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+chflags(p, uap)
+ struct proc *p;
+ register struct chflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfflags(p, nd.ni_vp, SCARG(uap, flags));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+fchflags(p, uap)
+ struct proc *p;
+ register struct fchflags_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) flags;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags));
+}
+
+static int
+setfmode(p, vp, mode)
+ struct proc *p;
+ struct vnode *vp;
+ int mode;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+chmod(p, uap)
+ struct proc *p;
+ register struct chmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfmode(p, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change mode of a file given a path name (don't follow links).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+lchmod(p, uap)
+ struct proc *p;
+ register struct lchmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfmode(p, nd.ni_vp, SCARG(uap, mode));
+ vrele(nd.ni_vp);
+ return error;
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+fchmod(p, uap)
+ struct proc *p;
+ register struct fchmod_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode));
+}
+
+static int
+setfown(p, vp, uid, gid)
+ struct proc *p;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+chown(p, uap)
+ struct proc *p;
+ register struct chown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+
+ return (error);
+}
+
+/*
+ * Set ownership given a path name; do not follow symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+lchown(p, uap)
+ struct proc *p;
+ register struct lchown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid));
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(p, uap)
+ struct proc *p;
+ register struct fchown_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+{
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setfown(p, (struct vnode *)fp->f_data,
+ SCARG(uap, uid), SCARG(uap, gid));
+}
+
+static int
+setutimes(p, vp, tv, nullflag)
+ struct proc *p;
+ struct vnode *vp;
+ struct timeval *tv;
+ int nullflag;
+{
+ int error;
+ struct vattr vattr;
+
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_atime.tv_sec = tv[0].tv_sec;
+ vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+ vattr.va_mtime.tv_sec = tv[1].tv_sec;
+ vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
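+	/*
+	 * VA_UTIMES_NULL lets the filesystem apply the looser utimes()
+	 * permission check (owner or write access) that applies when
+	 * the caller passed a NULL timestamp pointer.
+	 */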
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return error;
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(p, uap)
+ struct proc *p;
+ register struct utimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ int error;
+ struct nameidata nd;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = setutimes(p, nd.ni_vp, tv, nullflag);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(p, uap)
+ struct proc *p;
+ register struct lutimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ int error;
+ struct nameidata nd;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+
+ error = setutimes(p, nd.ni_vp, tv, nullflag);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(p, uap)
+ struct proc *p;
+ register struct futimes_args /* {
+ syscallarg(int ) fd;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+{
+ struct timeval tv[2];
+ struct file *fp;
+ int error;
+ int nullflag;
+
+ nullflag = 0;
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ nullflag = 1;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+truncate(p, uap)
+ struct proc *p;
+ register struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ }
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+ftruncate(p, uap)
+ struct proc *p;
+ register struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (uap->length < 0)
+ return(EINVAL);
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FWRITE) == 0)
+ return (EINVAL);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred, p);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+otruncate(p, uap)
+ struct proc *p;
+ register struct otruncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (truncate(p, &nuap));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+oftruncate(p, uap)
+ struct proc *p;
+ register struct oftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) length;
+ } */ *uap;
+{
+ struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (ftruncate(p, &nuap));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fsync(p, uap)
+ struct proc *p;
+ struct fsync_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_object)
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
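+	/*
+	 * With soft updates enabled, flushing the vnode itself is not
+	 * enough; the registered bioops hook below also flushes any
+	 * dependency-tracked buffers.
+	 */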
+ if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 &&
+ vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
+ bioops.io_fsync)
+ error = (*bioops.io_fsync)(vp);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories,
+ * or both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+/* ARGSUSED */
+int
+rename(p, uap)
+ struct proc *p;
+ register struct rename_args /* {
+ syscallarg(char *) from;
+ syscallarg(char *) to;
+ } */ *uap;
+{
+ register struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+ SCARG(uap, from), p);
+ if (error = namei(&fromnd))
+ return (error);
+ fvp = fromnd.ni_vp;
+ NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ,
+ UIO_USERSPACE, SCARG(uap, to), p);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&tond)) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ if (fvp == tdvp)
+ error = EINVAL;
+ /*
+ * If source is the same as the destination (that is the
+ * same inode number with the same name in the same directory),
+ * then there is nothing to do.
+ */
+ if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+ fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+ !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+ fromnd.ni_cnd.cn_namelen))
+ error = -1;
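+	/*
+	 * A value of -1 is a private sentinel: the rename is a no-op
+	 * and is converted to success at out1 below.
+	 */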
+out:
+ if (!error) {
+ VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE);
+ if (fromnd.ni_dvp != tdvp) {
+ VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ }
+ if (tvp) {
+ VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE);
+ }
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ } else {
+ VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename");
+ ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename");
+ zfree(namei_zone, tond.ni_cnd.cn_pnbuf);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf);
+ if (error == -1)
+ return (0);
+ return (error);
+}
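
As a concrete reading of the checks above, a minimal userland sketch (file names hypothetical) of the three interesting rename(2) outcomes: a directory onto a non-directory fails with ENOTDIR, the reverse fails with EISDIR, and renaming a file onto itself succeeds via the internal error = -1 shortcut.

	#include <stdio.h>
	#include <errno.h>
	#include <string.h>

	int
	main(void)
	{
		/* Directory source, non-directory target: ENOTDIR. */
		if (rename("somedir", "somefile") == -1)
			printf("somedir -> somefile: %s\n", strerror(errno));
		/* Non-directory source, directory target: EISDIR. */
		if (rename("somefile", "somedir") == -1)
			printf("somefile -> somedir: %s\n", strerror(errno));
		/* Same name in the same directory: nothing to do, returns 0. */
		if (rename("somefile", "somefile") == 0)
			printf("somefile -> somefile: ok\n");
		return (0);
	}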
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkdir(p, uap)
+ struct proc *p;
+ register struct mkdir_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ vput(nd.ni_dvp);
+ if (!error)
+ vput(nd.ni_vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir");
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+rmdir(p, uap)
+ struct proc *p;
+ struct rmdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+out:
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ }
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULLVP)
+ vput(vp);
+ ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir");
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir");
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(p, uap)
+ struct proc *p;
+ register struct ogetdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = SCARG(uap, count);
+ MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = SCARG(uap, count) - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ FREE(dirbuf, M_TEMP);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+ if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) {
+ error = union_dircheckp(p, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error)
+ return (error);
+ }
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+getdirentries(p, uap)
+ struct proc *p;
+ register struct getdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ long loff;
+ int error, eofflag;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+ if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) {
+ error = union_dircheckp(p, &vp, fp);
+ if (error == -1)
+ goto unionread;
+ if (error)
+ return (error);
+ }
+ if (SCARG(uap, basep) != NULL) {
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ }
+ p->p_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+getdents(p, uap)
+ struct proc *p;
+ register struct getdents_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return getdirentries(p, &ap);
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+umask(p, uap)
+ struct proc *p;
+ struct umask_args /* {
+ syscallarg(int) newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ fdp = p->p_fd;
+ p->p_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS;
+ return (0);
+}
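
umask() above installs fd_cmask, which the mkdir() path earlier applies as vattr.va_mode = (mode & ACCESSPERMS) &~ fd_cmask. A minimal standalone sketch of that arithmetic (values hypothetical; ACCESSPERMS is 0777):

	#include <stdio.h>
	#include <sys/stat.h>

	#ifndef ACCESSPERMS
	#define ACCESSPERMS	(S_IRWXU|S_IRWXG|S_IRWXO)	/* 0777 */
	#endif

	int
	main(void)
	{
		unsigned requested = 0777;	/* mode argument to mkdir(2) */
		unsigned cmask = 022;		/* as installed by umask(2) */

		/* Same masking the kernel performs; prints 0755. */
		printf("effective mode 0%o\n", (requested & ACCESSPERMS) & ~cmask);
		return (0);
	}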
+
+/*
+ * Void all references to file by ripping underlying filesystem
+ * away from vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+revoke(p, uap)
+ struct proc *p;
+ register struct revoke_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))
+ goto out;
+ if (p->p_ucred->cr_uid != vattr.va_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))
+ goto out;
+ if (vp->v_usecount > 1 || (vp->v_flag & VALIASED))
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ */
+int
+getvnode(fdp, fd, fpp)
+ struct filedesc *fdp;
+ int fd;
+ struct file **fpp;
+{
+ struct file *fp;
+
+ if ((u_int)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
+ return (EINVAL);
+ *fpp = fp;
+ return (0);
+}
+#ifndef _SYS_SYSPROTO_H_
+struct __getcwd_args {
+ u_char *buf;
+ u_int buflen;
+};
+#endif
+#define STATNODE(mode, name, var) \
+ SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
+
+static int disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
+
+static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
+static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
+static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
+static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
+static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
+static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
+int
+__getcwd(p, uap)
+ struct proc *p;
+ struct __getcwd_args *uap;
+{
+ char *bp, *buf;
+ int error, i, slash_prefixed;
+ struct filedesc *fdp;
+ struct namecache *ncp;
+ struct vnode *vp;
+
+ numcwdcalls++;
+ if (disablecwd)
+ return (ENODEV);
+ if (uap->buflen < 2)
+ return (EINVAL);
+ if (uap->buflen > MAXPATHLEN)
+ uap->buflen = MAXPATHLEN;
+ buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
+ bp += uap->buflen - 1;
+ *bp = '\0';
+ fdp = p->p_fd;
+ slash_prefixed = 0;
+ for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
+ if (vp->v_flag & VROOT) {
+ vp = vp->v_mount->mnt_vnodecovered;
+ continue;
+ }
+ if (vp->v_dd->v_id != vp->v_ddid) {
+ numcwdfail1++;
+ free(buf, M_TEMP);
+ return (ENOTDIR);
+ }
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (!ncp) {
+ numcwdfail2++;
+ free(buf, M_TEMP);
+ return (ENOENT);
+ }
+ if (ncp->nc_dvp != vp->v_dd) {
+ numcwdfail3++;
+ free(buf, M_TEMP);
+ return (EBADF);
+ }
+ for (i = ncp->nc_nlen - 1; i >= 0; i--) {
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = ncp->nc_name[i];
+ }
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ slash_prefixed = 1;
+ vp = vp->v_dd;
+ }
+ if (!slash_prefixed) {
+ if (bp == buf) {
+ numcwdfail4++;
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+ *--bp = '/';
+ }
+ numcwdfound++;
+ error = copyout(bp, uap->buf, strlen(bp) + 1);
+ free(buf, M_TEMP);
+ return (error);
+}
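
__getcwd() assembles its result right-to-left: bp starts at the end of a scratch buffer, and each name-cache component plus a '/' is prepended while walking v_dd toward the root. The same buffer technique in a standalone sketch (component names hypothetical):

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		static const char *comp[] = { "sys", "src", "usr" };	/* leaf first */
		char buf[64], *bp;
		int c, i;

		bp = buf + sizeof(buf) - 1;
		*bp = '\0';
		for (c = 0; c < 3; c++) {
			for (i = (int)strlen(comp[c]) - 1; i >= 0; i--)
				*--bp = comp[c][i];
			*--bp = '/';
		}
		printf("%s\n", bp);	/* /usr/src/sys */
		return (0);
	}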
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
new file mode 100644
index 0000000..0b32a7d
--- /dev/null
+++ b/sys/kern/vfs_vnops.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ * $Id: vfs_vnops.c,v 1.61 1999/01/05 18:49:56 eivind Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+
+static int vn_closefile __P((struct file *fp, struct proc *p));
+static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
+ struct proc *p));
+static int vn_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
+ struct proc *p));
+static int vn_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+
+struct fileops vnops =
+ { vn_read, vn_write, vn_ioctl, vn_poll, vn_closefile };
+
+/*
+ * Common code for vnode open operations.
+ * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
+ */
+int
+vn_open(ndp, fmode, cmode)
+ register struct nameidata *ndp;
+ int fmode, cmode;
+{
+ register struct vnode *vp;
+ register struct proc *p = ndp->ni_cnd.cn_proc;
+ register struct ucred *cred = p->p_ucred;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ int mode, error;
+
+ if (fmode & O_CREAT) {
+ ndp->ni_cnd.cn_nameiop = CREATE;
+ ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
+ if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
+ ndp->ni_cnd.cn_flags |= FOLLOW;
+ error = namei(ndp);
+ if (error)
+ return (error);
+ if (ndp->ni_vp == NULL) {
+ VATTR_NULL(vap);
+ vap->va_type = VREG;
+ vap->va_mode = cmode;
+ if (fmode & O_EXCL)
+ vap->va_vaflags |= VA_EXCLUSIVE;
+ VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE);
+ error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+ &ndp->ni_cnd, vap);
+ vput(ndp->ni_dvp);
+ if (error)
+ return (error);
+ ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
+ ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
+ fmode &= ~O_TRUNC;
+ vp = ndp->ni_vp;
+ } else {
+ VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
+ if (ndp->ni_dvp == ndp->ni_vp)
+ vrele(ndp->ni_dvp);
+ else
+ vput(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ vp = ndp->ni_vp;
+ if (fmode & O_EXCL) {
+ error = EEXIST;
+ goto bad;
+ }
+ fmode &= ~O_CREAT;
+ }
+ } else {
+ ndp->ni_cnd.cn_nameiop = LOOKUP;
+ ndp->ni_cnd.cn_flags =
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ }
+ if (vp->v_type == VLNK) {
+ error = EMLINK;
+ goto bad;
+ }
+ if (vp->v_type == VSOCK) {
+ error = EOPNOTSUPP;
+ goto bad;
+ }
+ if ((fmode & O_CREAT) == 0) {
+ mode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto bad;
+ }
+ error = vn_writechk(vp);
+ if (error)
+ goto bad;
+ mode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ mode |= VREAD;
+ if (mode) {
+ error = VOP_ACCESS(vp, mode, cred, p);
+ if (error)
+ goto bad;
+ }
+ }
+ if (fmode & O_TRUNC) {
+ VOP_UNLOCK(vp, 0, p); /* XXX */
+ VOP_LEASE(vp, p, cred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */
+ VATTR_NULL(vap);
+ vap->va_size = 0;
+ error = VOP_SETATTR(vp, vap, cred, p);
+ if (error)
+ goto bad;
+ }
+ error = VOP_OPEN(vp, fmode, cred, p);
+ if (error)
+ goto bad;
+ /*
+ * Make sure that a VM object is created for VMIO support.
+ */
+ if (vp->v_type == VREG) {
+ if ((error = vfs_object_create(vp, p, cred)) != 0)
+ goto bad;
+ }
+
+ if (fmode & FWRITE)
+ vp->v_writecount++;
+ return (0);
+bad:
+ vput(vp);
+ return (error);
+}
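
For context, the usual in-kernel calling convention for vn_open(), as a hedged illustrative fragment (not a quote from this tree): the caller seeds a nameidata with NDINIT() and, on success, gets ni_vp back locked and opened.

	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
	if ((error = vn_open(&nd, FREAD, 0)) != 0)
		return (error);
	/* ... operate on the locked nd.ni_vp ... */
	VOP_UNLOCK(nd.ni_vp, 0, p);
	error = vn_close(nd.ni_vp, FREAD, p->p_ucred, p);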
+
+/*
+ * Check for write permissions on the specified vnode.
+ * Prototype text segments cannot be written.
+ */
+int
+vn_writechk(vp)
+ register struct vnode *vp;
+{
+
+ /*
+ * If there's shared text associated with
+ * the vnode, try to free it up once. If
+ * we fail, we can't allow writing.
+ */
+ if (vp->v_flag & VTEXT)
+ return (ETXTBSY);
+ return (0);
+}
+
+/*
+ * Vnode close call
+ */
+int
+vn_close(vp, flags, cred, p)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct proc *p;
+{
+ int error;
+
+ if (flags & FWRITE)
+ vp->v_writecount--;
+ error = VOP_CLOSE(vp, flags, cred, p);
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it.
+ */
+int
+vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
+ enum uio_rw rw;
+ struct vnode *vp;
+ caddr_t base;
+ int len;
+ off_t offset;
+ enum uio_seg segflg;
+ int ioflg;
+ struct ucred *cred;
+ int *aresid;
+ struct proc *p;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_procp = p;
+ if (rw == UIO_READ) {
+ error = VOP_READ(vp, &auio, ioflg, cred);
+ } else {
+ error = VOP_WRITE(vp, &auio, ioflg, cred);
+ }
+ if (aresid)
+ *aresid = auio.uio_resid;
+ else
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ if ((ioflg & IO_NODELOCKED) == 0)
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
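
A typical caller of vn_rdwr() (the image activators, for instance) pulls a header from an already-locked vnode in one call; a hedged fragment of that pattern, with hdr and its size hypothetical:

	char hdr[512];
	int resid, error;

	/* IO_NODELOCKED: vp is already locked by the caller. */
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof(hdr), (off_t)0,
	    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
	if (error == 0 && resid != 0)
		error = ENOEXEC;	/* short read */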
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ struct vnode *vp = (struct vnode *)fp->f_data;
+ struct proc *p = uio->uio_procp;
+ int count, error;
+ int flag;
+
+ VOP_LEASE(vp, p, cred, LEASE_READ);
+ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
+ if (uio->uio_offset == -1)
+ uio->uio_offset = fp->f_offset;
+ count = uio->uio_resid;
+ flag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ flag |= IO_NDELAY;
+
+ /*
+ * Sequential read heuristic.
+ * If we have been doing sequential input,
+ * a rewind operation doesn't turn off
+ * sequential input mode.
+ */
+ if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) ||
+ (fp->f_offset == fp->f_nextread)) {
+ int tmpseq = fp->f_seqcount;
+ /*
+ * XXX we assume that the filesystem block size is
+ * the default. Not true, but still gives us a pretty
+ * good indicator of how sequential the read operations
+ * are.
+ */
+ tmpseq += ((count + BKVASIZE - 1) / BKVASIZE);
+ if (tmpseq >= 127)
+ tmpseq = 127;
+ fp->f_seqcount = tmpseq;
+ flag |= (fp->f_seqcount << 16);
+ } else {
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ }
+
+ error = VOP_READ(vp, uio, flag, cred);
+ fp->f_offset += count - uio->uio_resid;
+ fp->f_nextread = fp->f_offset;
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
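
The heuristic above credits f_seqcount with one point per BKVASIZE worth of sequentially read data, clamps it at 127, and hands the count to VOP_READ in the upper bits of ioflag. The arithmetic in isolation (BKVASIZE assumed to be 8192 here purely for illustration; the real value is a kernel compile-time constant):

	#include <stdio.h>

	#define BKVASIZE	8192	/* assumed for illustration */

	int
	main(void)
	{
		int seqcount = 0, flag, i;

		for (i = 0; i < 3; i++) {	/* three sequential 64k reads */
			seqcount += (65536 + BKVASIZE - 1) / BKVASIZE;
			if (seqcount >= 127)
				seqcount = 127;
		}
		flag = seqcount << 16;	/* how vn_read encodes the hint */
		printf("seqcount %d, ioflag hint 0x%x\n", seqcount, flag);
		return (0);
	}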
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ struct vnode *vp = (struct vnode *)fp->f_data;
+ struct proc *p = uio->uio_procp;
+ int count, error, ioflag = IO_UNIT;
+
+ if (uio->uio_offset == -1 && vp->v_type == VREG && (fp->f_flag & O_APPEND))
+ ioflag |= IO_APPEND;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if ((fp->f_flag & O_FSYNC) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+ ioflag |= IO_SYNC;
+ VOP_LEASE(vp, p, cred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ uio->uio_offset = fp->f_offset;
+ count = uio->uio_resid;
+ error = VOP_WRITE(vp, uio, ioflag, cred);
+ if (ioflag & IO_APPEND)
+ fp->f_offset = uio->uio_offset;
+ else
+ fp->f_offset += count - uio->uio_resid;
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * File table vnode stat routine.
+ */
+int
+vn_stat(vp, sb, p)
+ struct vnode *vp;
+ register struct stat *sb;
+ struct proc *p;
+{
+ struct vattr vattr;
+ register struct vattr *vap;
+ int error;
+ u_short mode;
+
+ vap = &vattr;
+ error = VOP_GETATTR(vp, vap, p->p_ucred, p);
+ if (error)
+ return (error);
+ /*
+ * Copy from vattr table
+ */
+ sb->st_dev = vap->va_fsid;
+ sb->st_ino = vap->va_fileid;
+ mode = vap->va_mode;
+ switch (vap->va_type) {
+ case VREG:
+ mode |= S_IFREG;
+ break;
+ case VDIR:
+ mode |= S_IFDIR;
+ break;
+ case VBLK:
+ mode |= S_IFBLK;
+ break;
+ case VCHR:
+ mode |= S_IFCHR;
+ break;
+ case VLNK:
+ mode |= S_IFLNK;
+		/* This is a cosmetic change; symlinks do not have a mode. */
+		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
+			mode &= ~ACCESSPERMS;	/* 0000 */
+		else
+			mode |= ACCESSPERMS;	/* 0777 */
+ break;
+ case VSOCK:
+ mode |= S_IFSOCK;
+ break;
+ case VFIFO:
+ mode |= S_IFIFO;
+ break;
+ default:
+ return (EBADF);
+	}
+ sb->st_mode = mode;
+ sb->st_nlink = vap->va_nlink;
+ sb->st_uid = vap->va_uid;
+ sb->st_gid = vap->va_gid;
+ sb->st_rdev = vap->va_rdev;
+ sb->st_size = vap->va_size;
+ sb->st_atimespec = vap->va_atime;
+ sb->st_mtimespec = vap->va_mtime;
+ sb->st_ctimespec = vap->va_ctime;
+ sb->st_blksize = vap->va_blocksize;
+ sb->st_flags = vap->va_flags;
+ if (p->p_ucred->cr_uid != 0)
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+#if (S_BLKSIZE == 512)
+ /* Optimize this case */
+ sb->st_blocks = vap->va_bytes >> 9;
+#else
+ sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+#endif
+ return (0);
+}
+
+/*
+ * File table vnode ioctl routine.
+ */
+static int
+vn_ioctl(fp, com, data, p)
+ struct file *fp;
+ u_long com;
+ caddr_t data;
+ struct proc *p;
+{
+ register struct vnode *vp = ((struct vnode *)fp->f_data);
+ struct vattr vattr;
+ int error;
+
+ switch (vp->v_type) {
+
+ case VREG:
+ case VDIR:
+ if (com == FIONREAD) {
+ error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
+ if (error)
+ return (error);
+ *(int *)data = vattr.va_size - fp->f_offset;
+ return (0);
+ }
+ if (com == FIONBIO || com == FIOASYNC) /* XXX */
+ return (0); /* XXX */
+ /* fall into ... */
+
+ default:
+#if 0
+ return (ENOTTY);
+#endif
+ case VFIFO:
+ case VCHR:
+ case VBLK:
+ error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
+ if (error == 0 && com == TIOCSCTTY) {
+
+ /* Do nothing if reassigning same control tty */
+ if (p->p_session->s_ttyvp == vp)
+ return (0);
+
+ /* Get rid of reference to old control tty */
+ if (p->p_session->s_ttyvp)
+ vrele(p->p_session->s_ttyvp);
+
+ p->p_session->s_ttyvp = vp;
+ VREF(vp);
+ }
+ return (error);
+ }
+}
+
+/*
+ * File table vnode poll routine.
+ */
+static int
+vn_poll(fp, events, cred, p)
+ struct file *fp;
+ int events;
+ struct ucred *cred;
+ struct proc *p;
+{
+
+ return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p));
+}
+
+/*
+ * Check that the vnode is still valid, and if so
+ * acquire requested lock.
+ */
+int
+#ifndef DEBUG_LOCKS
+vn_lock(vp, flags, p)
+#else
+debug_vn_lock(vp, flags, p, filename, line)
+#endif
+ struct vnode *vp;
+ int flags;
+ struct proc *p;
+#ifdef DEBUG_LOCKS
+ const char *filename;
+ int line;
+#endif
+{
+ int error;
+
+ do {
+ if ((flags & LK_INTERLOCK) == 0)
+ simple_lock(&vp->v_interlock);
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
+ error = ENOENT;
+ } else {
+#ifdef DEBUG_LOCKS
+ vp->filename = filename;
+ vp->line = line;
+#endif
+ error = VOP_LOCK(vp,
+ flags | LK_NOPAUSE | LK_INTERLOCK, p);
+ if (error == 0)
+ return (error);
+ }
+ flags &= ~LK_INTERLOCK;
+ } while (flags & LK_RETRY);
+ return (error);
+}
+
+/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(fp, p)
+ struct file *fp;
+ struct proc *p;
+{
+
+ return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
+ fp->f_cred, p));
+}
diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl
new file mode 100644
index 0000000..8193edb
--- /dev/null
+++ b/sys/kern/vnode_if.pl
@@ -0,0 +1,402 @@
+#!/bin/sh -
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $
+#
+
+# Script to produce VFS front-end sugar.
+#
+# usage: vnode_if.sh srcfile
+# (where srcfile is currently /sys/kern/vnode_if.src)
+#
+# These awk scripts are not particularly well written; specifically, they
+# don't use arrays well and recompute the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk.  Note
+# that they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 1 ] ; then
+ echo 'usage: vnode_if.sh srcfile'
+ exit 1
+fi
+
+# Name of the source file.
+SRC=$1
+
+# Names of the created files.
+CFILE=vnode_if.c
+HEADER=vnode_if.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+extern struct vnodeop_desc vop_default_desc;
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+$AWK '
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # Get the function name.
+ name = $1;
+ uname = toupper(name);
+
+ # Get the function arguments.
+ for (c1 = 0;; ++c1) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ a[c1] = $0;
+ }
+
+ # Print out the vop_F_args structure.
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+ name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%sa_%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("};\n");
+
+ # Print out extern declaration.
+ printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+ # Print out prototype.
+ printf("static int %s __P((\n", uname);
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = "));\n";
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep);
+ }
+
+ # Print out inline struct.
+ printf("static __inline int %s(", uname);
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ")\n";
+ c3 = split(a[c2], t);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s", substr(t[c3], beg, end - beg), sep);
+ }
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%s%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("{\n\tstruct %s_args a;\n\n", name);
+ printf("\ta.a_desc = VDESC(%s);\n", name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("a.a_%s = %s\n",
+ substr(t[c3], beg, end - beg), substr(t[c3], beg));
+ }
+ c1 = split(a[0], t);
+ beg = match(t[c1], "[^*]");
+ end = match(t[c1], ";");
+ printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+ substr(t[c1], beg, end - beg), name);
+ }' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+struct vnodeop_desc vop_default_desc = {
+ 1, /* special case, vop_default => 1 */
+ "default",
+ 0,
+ NULL,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+ sub (/^[ \t]*/, "", s);
+ sub (/[ \t]*$/, "", s);
+ return s;
+ }
+
+ function read_args() {
+ numargs = 0;
+ while (getline ln) {
+ if (ln ~ /}/) {
+ break;
+ };
+
+ # Delete comments, if any.
+ gsub (/\/\*.*\*\//, "", ln);
+
+ # Delete leading/trailing space.
+ ln = kill_surrounding_ws(ln);
+
+ # Pick off direction.
+ if (1 == sub(/^INOUT[ \t]+/, "", ln))
+ dir = "INOUT";
+ else if (1 == sub(/^IN[ \t]+/, "", ln))
+ dir = "IN";
+ else if (1 == sub(/^OUT[ \t]+/, "", ln))
+ dir = "OUT";
+ else
+ bail("No IN/OUT direction for \"" ln "\".");
+
+ # check for "WILLRELE"
+ if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+ rele = "WILLRELE";
+ } else {
+ rele = "WONTRELE";
+ };
+
+ # kill trailing ;
+ if (1 != sub (/;$/, "", ln)) {
+ bail("Missing end-of-line ; in \"" ln "\".");
+ };
+
+ # pick off variable name
+ if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+ bail("Missing var name \"a_foo\" in \"" ln "\".");
+ };
+ arg = substr (ln, i);
+ # Want to <<substr(ln, i) = "";>>, but nawk cannot.
+ # Hack around this.
+ ln = substr(ln, 1, i-1);
+
+ # what is left must be type
+			# (but clean it up some)
+ type = ln;
+ gsub (/[ \t]+/, " ", type); # condense whitespace
+ type = kill_surrounding_ws(type);
+
+ # (boy this was easier in Perl)
+
+ numargs++;
+ dirs[numargs] = dir;
+ reles[numargs] = rele;
+ types[numargs] = type;
+ args[numargs] = arg;
+ };
+ }
+
+ function generate_operation_vp_offsets() {
+ printf ("static int %s_vp_offsets[] = {\n", name);
+ # as a side effect, figure out the releflags
+ releflags = "";
+ vpnum = 0;
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode *") {
+ printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+ name, args[i]);
+ if (reles[i] == "WILLRELE") {
+ releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+ };
+ vpnum++;
+ };
+ };
+ sub (/^\|/, "", releflags);
+ print "\tVDESC_NO_OFFSET";
+ print "};";
+ }
+
+ function find_arg_with_type (type) {
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == type) {
+ return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+ };
+ };
+ return "VDESC_NO_OFFSET";
+ }
+
+ function generate_operation_desc() {
+ printf ("struct vnodeop_desc %s_desc = {\n", name);
+ # offset
+ printf ("\t0,\n");
+ # printable name
+ printf ("\t\"%s\",\n", name);
+ # flags
+ vppwillrele = "";
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode **" &&
+ (reles[i] == "WILLRELE")) {
+ vppwillrele = "|VDESC_VPP_WILLRELE";
+ };
+ };
+ if (releflags == "") {
+ printf ("\t0%s,\n", vppwillrele);
+ } else {
+ printf ("\t%s%s,\n", releflags, vppwillrele);
+ };
+ # vp offsets
+ printf ("\t%s_vp_offsets,\n", name);
+ # vpp (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+ # cred (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+ # proc (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+ # componentname
+ printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+ # transport layer information
+ printf ("\tNULL,\n};\n");
+ }
+
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # get the function name
+ name = $1;
+
+ # get the function arguments
+ read_args();
+
+ # Print out the vop_F_vp_offsets structure. This all depends
+ # on naming conventions and nothing else.
+ generate_operation_vp_offsets();
+
+ # Print out the vnodeop_desc structure.
+ generate_operation_desc();
+
+ printf "\n";
+
+ }' < $SRC >> $CFILE
+# THINGS THAT DON'T WORK RIGHT YET.
+#
+# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as
+# arguments. This means that these operations can't function successfully
+# through a bypass routine.
+#
+# Bwrite and strategy will be replaced when the VM page/buffer cache
+# integration happens.
+#
+# To get around this problem for now we handle these ops as special cases.
+
+cat << END_OF_SPECIAL_CASES >> $HEADER
+#include <sys/buf.h>
+
+struct vop_bwrite_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_bwrite_desc;
+static int VOP_BWRITE __P((
+ struct buf *bp));
+static __inline int VOP_BWRITE(bp)
+ struct buf *bp;
+{
+ struct vop_bwrite_args a;
+
+ a.a_desc = VDESC(vop_bwrite);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a));
+}
+
+extern int vfs_opv_numops;
+END_OF_SPECIAL_CASES
+
+cat << END_OF_SPECIAL_CASES >> $CFILE
+static int vop_bwrite_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_bwrite_desc = {
+ 0,
+ "vop_bwrite",
+ 0,
+ vop_bwrite_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+END_OF_SPECIAL_CASES
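
To make the generator's output concrete: fed the vop_abortop entry from vnode_if.src (later in this diff), the header script's printf statements emit roughly the following into vnode_if.h, with VDESC, VOFFSET and VCALL supplied by <sys/vnode.h>:

	struct vop_abortop_args {
		struct vnodeop_desc *a_desc;
		struct vnode *a_dvp;
		struct componentname *a_cnp;
	};
	extern struct vnodeop_desc vop_abortop_desc;
	static int VOP_ABORTOP __P((
		struct vnode *dvp,
		struct componentname *cnp));
	static __inline int VOP_ABORTOP(dvp, cnp)
		struct vnode *dvp;
		struct componentname *cnp;
	{
		struct vop_abortop_args a;

		a.a_desc = VDESC(vop_abortop);
		a.a_dvp = dvp;
		a.a_cnp = cnp;
		return (VCALL(dvp, VOFFSET(vop_abortop), &a));
	}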
diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh
new file mode 100644
index 0000000..8193edb
--- /dev/null
+++ b/sys/kern/vnode_if.sh
@@ -0,0 +1,402 @@
+#!/bin/sh -
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $
+#
+
+# Script to produce VFS front-end sugar.
+#
+# usage: vnode_if.sh srcfile
+# (where srcfile is currently /sys/kern/vnode_if.src)
+#
+# These awk scripts are not particularly well written; specifically, they
+# don't use arrays well and recompute the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk.  Note
+# that they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 1 ] ; then
+ echo 'usage: vnode_if.sh srcfile'
+ exit 1
+fi
+
+# Name of the source file.
+SRC=$1
+
+# Names of the created files.
+CFILE=vnode_if.c
+HEADER=vnode_if.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+extern struct vnodeop_desc vop_default_desc;
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+$AWK '
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # Get the function name.
+ name = $1;
+ uname = toupper(name);
+
+ # Get the function arguments.
+ for (c1 = 0;; ++c1) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ a[c1] = $0;
+ }
+
+ # Print out the vop_F_args structure.
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+ name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%sa_%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("};\n");
+
+ # Print out extern declaration.
+ printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+ # Print out prototype.
+ printf("static int %s __P((\n", uname);
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = "));\n";
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep);
+ }
+
+ # Print out inline struct.
+ printf("static __inline int %s(", uname);
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ")\n";
+ c3 = split(a[c2], t);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s", substr(t[c3], beg, end - beg), sep);
+ }
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%s%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("{\n\tstruct %s_args a;\n\n", name);
+ printf("\ta.a_desc = VDESC(%s);\n", name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("a.a_%s = %s\n",
+ substr(t[c3], beg, end - beg), substr(t[c3], beg));
+ }
+ c1 = split(a[0], t);
+ beg = match(t[c1], "[^*]");
+ end = match(t[c1], ";");
+ printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+ substr(t[c1], beg, end - beg), name);
+ }' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+struct vnodeop_desc vop_default_desc = {
+ 1, /* special case, vop_default => 1 */
+ "default",
+ 0,
+ NULL,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+ sub (/^[ \t]*/, "", s);
+ sub (/[ \t]*$/, "", s);
+ return s;
+ }
+
+ function read_args() {
+ numargs = 0;
+ while (getline ln) {
+ if (ln ~ /}/) {
+ break;
+ };
+
+ # Delete comments, if any.
+ gsub (/\/\*.*\*\//, "", ln);
+
+ # Delete leading/trailing space.
+ ln = kill_surrounding_ws(ln);
+
+ # Pick off direction.
+ if (1 == sub(/^INOUT[ \t]+/, "", ln))
+ dir = "INOUT";
+ else if (1 == sub(/^IN[ \t]+/, "", ln))
+ dir = "IN";
+ else if (1 == sub(/^OUT[ \t]+/, "", ln))
+ dir = "OUT";
+ else
+ bail("No IN/OUT direction for \"" ln "\".");
+
+ # check for "WILLRELE"
+ if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+ rele = "WILLRELE";
+ } else {
+ rele = "WONTRELE";
+ };
+
+ # kill trailing ;
+ if (1 != sub (/;$/, "", ln)) {
+ bail("Missing end-of-line ; in \"" ln "\".");
+ };
+
+ # pick off variable name
+ if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+ bail("Missing var name \"a_foo\" in \"" ln "\".");
+ };
+ arg = substr (ln, i);
+ # Want to <<substr(ln, i) = "";>>, but nawk cannot.
+ # Hack around this.
+ ln = substr(ln, 1, i-1);
+
+ # what is left must be type
+			# (but clean it up some)
+ type = ln;
+ gsub (/[ \t]+/, " ", type); # condense whitespace
+ type = kill_surrounding_ws(type);
+
+ # (boy this was easier in Perl)
+
+ numargs++;
+ dirs[numargs] = dir;
+ reles[numargs] = rele;
+ types[numargs] = type;
+ args[numargs] = arg;
+ };
+ }
+
+ function generate_operation_vp_offsets() {
+ printf ("static int %s_vp_offsets[] = {\n", name);
+ # as a side effect, figure out the releflags
+ releflags = "";
+ vpnum = 0;
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode *") {
+ printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+ name, args[i]);
+ if (reles[i] == "WILLRELE") {
+ releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+ };
+ vpnum++;
+ };
+ };
+ sub (/^\|/, "", releflags);
+ print "\tVDESC_NO_OFFSET";
+ print "};";
+ }
+
+ function find_arg_with_type (type) {
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == type) {
+ return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+ };
+ };
+ return "VDESC_NO_OFFSET";
+ }
+
+ function generate_operation_desc() {
+ printf ("struct vnodeop_desc %s_desc = {\n", name);
+ # offset
+ printf ("\t0,\n");
+ # printable name
+ printf ("\t\"%s\",\n", name);
+ # flags
+ vppwillrele = "";
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode **" &&
+ (reles[i] == "WILLRELE")) {
+ vppwillrele = "|VDESC_VPP_WILLRELE";
+ };
+ };
+ if (releflags == "") {
+ printf ("\t0%s,\n", vppwillrele);
+ } else {
+ printf ("\t%s%s,\n", releflags, vppwillrele);
+ };
+ # vp offsets
+ printf ("\t%s_vp_offsets,\n", name);
+ # vpp (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+ # cred (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+ # proc (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+ # componentname
+ printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+ # transport layer information
+ printf ("\tNULL,\n};\n");
+ }
+
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # get the function name
+ name = $1;
+
+ # get the function arguments
+ read_args();
+
+ # Print out the vop_F_vp_offsets structure. This all depends
+ # on naming conventions and nothing else.
+ generate_operation_vp_offsets();
+
+ # Print out the vnodeop_desc structure.
+ generate_operation_desc();
+
+ printf "\n";
+
+ }' < $SRC >> $CFILE
+# THINGS THAT DON'T WORK RIGHT YET.
+#
+# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as
+# arguments. This means that these operations can't function successfully
+# through a bypass routine.
+#
+# Bwrite and strategy will be replaced when the VM page/buffer cache
+# integration happens.
+#
+# To get around this problem for now we handle these ops as special cases.
+
+cat << END_OF_SPECIAL_CASES >> $HEADER
+#include <sys/buf.h>
+
+struct vop_bwrite_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_bwrite_desc;
+static int VOP_BWRITE __P((
+ struct buf *bp));
+static __inline int VOP_BWRITE(bp)
+ struct buf *bp;
+{
+ struct vop_bwrite_args a;
+
+ a.a_desc = VDESC(vop_bwrite);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a));
+}
+
+extern int vfs_opv_numops;
+END_OF_SPECIAL_CASES
+
+cat << END_OF_SPECIAL_CASES >> $CFILE
+static int vop_bwrite_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_bwrite_desc = {
+ 0,
+ "vop_bwrite",
+ 0,
+ vop_bwrite_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+END_OF_SPECIAL_CASES
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
new file mode 100644
index 0000000..48c9fef
--- /dev/null
+++ b/sys/kern/vnode_if.src
@@ -0,0 +1,488 @@
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+# $Id: vnode_if.src,v 1.18 1998/07/04 20:45:32 julian Exp $
+#
+
+#
+# Above each of the vop descriptors is a specification of the locking
+# protocol used by each vop call. The first column is the name of
+# the variable, the remaining three columns are in, out and error
+# respectively. The "in" column defines the lock state on input,
+# the "out" column defines the state on succesful return, and the
+# "error" column defines the locking state on error exit.
+#
+# The locking value can take the following values:
+# L: locked.
+# U: unlocked/
+# -: not applicable. vnode does not yet (or no longer) exists.
+# =: the same on input and output, may be either L or U.
+# X: locked if not nil.
+#
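
A worked reading of the notation, using entries that appear below: "#% lock vp U L U" says VOP_LOCK expects vp unlocked on entry, returns it locked on success, and leaves it unlocked on error; "#% create vpp - L -" says the vnode named by vpp does not exist on entry, comes back locked on success, and does not exist after an error return.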
+
+#
+#% lookup dvp L ? ?
+#% lookup vpp - L -
+#
+# XXX - the lookup locking protocol defies simple description and depends
+# on the flags and operation fields in the (cnp) structure. Note
+# especially that *vpp may equal dvp and both may be locked.
+#
+vop_lookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+#
+#% cachedlookup dvp L ? ?
+#% cachedlookup vpp - L -
+#
+# This must be an exact copy of lookup. See kern/vfs_cache.c for details.
+#
+vop_cachedlookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+#
+#% create dvp L L L
+#% create vpp - L -
+#
+vop_create {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% whiteout dvp L L L
+#% whiteout cnp - - -
+#% whiteout flag - - -
+#
+vop_whiteout {
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int flags;
+};
+
+#
+#% mknod dvp L L L
+#% mknod vpp - X -
+#
+vop_mknod {
+ IN struct vnode *dvp;
+ OUT WILLRELE struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% open vp L L L
+#
+vop_open {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% close vp U U U
+#
+vop_close {
+ IN struct vnode *vp;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% access vp L L L
+#
+vop_access {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% getattr vp = = =
+#
+vop_getattr {
+ IN struct vnode *vp;
+ IN struct vattr *vap;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% setattr vp L L L
+#
+vop_setattr {
+ IN struct vnode *vp;
+ IN struct vattr *vap;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% read vp L L L
+#
+vop_read {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+#
+#% write vp L L L
+#
+vop_write {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+#
+#% lease vp = = =
+#
+vop_lease {
+ IN struct vnode *vp;
+ IN struct proc *p;
+ IN struct ucred *cred;
+ IN int flag;
+};
+
+#
+#% ioctl vp U U U
+#
+vop_ioctl {
+ IN struct vnode *vp;
+ IN u_long command;
+ IN caddr_t data;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% poll vp U U U
+#
+vop_poll {
+ IN struct vnode *vp;
+ IN int events;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% revoke vp U U U
+#
+vop_revoke {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+#
+# XXX - not used
+#
+vop_mmap {
+ IN struct vnode *vp;
+ IN int fflags;
+ IN struct ucred *cred;
+ IN struct proc *p;
+};
+
+#
+#% fsync vp L L L
+#
+vop_fsync {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN int waitfor;
+ IN struct proc *p;
+};
+
+#
+#% remove dvp L L L
+#% remove vp L L L
+#
+vop_remove {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% link tdvp L L L
+#% link vp U U U
+#
+vop_link {
+ IN struct vnode *tdvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% rename fdvp U U U
+#% rename fvp U U U
+#% rename tdvp L U U
+#% rename tvp X U U
+#
+vop_rename {
+ IN WILLRELE struct vnode *fdvp;
+ IN WILLRELE struct vnode *fvp;
+ IN struct componentname *fcnp;
+ IN WILLRELE struct vnode *tdvp;
+ IN WILLRELE struct vnode *tvp;
+ IN struct componentname *tcnp;
+};
+
+#
+#% mkdir dvp L L L
+#% mkdir vpp - L -
+#
+vop_mkdir {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+#
+#% rmdir dvp L L L
+#% rmdir vp L L L
+#
+vop_rmdir {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+#
+#% symlink dvp L L L
+#% symlink vpp - U -
+#
+# XXX - note that the return vnode has already been VRELE'ed
+# by the filesystem layer. To use it you must use vget,
+# possibly with a further namei.
+#
+vop_symlink {
+ IN struct vnode *dvp;
+ OUT WILLRELE struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+ IN char *target;
+};
+
+#
+#% readdir vp L L L
+#
+vop_readdir {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ INOUT int *eofflag;
+ OUT int *ncookies;
+ INOUT u_long **cookies;
+};
+
+#
+#% readlink vp L L L
+#
+vop_readlink {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+};
+
+#
+#% abortop dvp = = =
+#
+vop_abortop {
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+};
+
+#
+#% inactive vp L U U
+#
+vop_inactive {
+ IN struct vnode *vp;
+ IN struct proc *p;
+};
+
+#
+#% reclaim vp U U U
+#
+vop_reclaim {
+ IN struct vnode *vp;
+ IN struct proc *p;
+};
+
+#
+#% lock vp U L U
+#
+vop_lock {
+ IN struct vnode *vp;
+ IN int flags;
+ IN struct proc *p;
+};
+
+#
+#% unlock vp L U L
+#
+vop_unlock {
+ IN struct vnode *vp;
+ IN int flags;
+ IN struct proc *p;
+};
+
+#
+#% bmap vp L L L
+#% bmap vpp - U -
+#
+vop_bmap {
+ IN struct vnode *vp;
+ IN daddr_t bn;
+ OUT struct vnode **vpp;
+ IN daddr_t *bnp;
+ OUT int *runp;
+ OUT int *runb;
+};
+
+#
+# Needs work: no vp?
+#
+vop_strategy {
+ IN struct vnode *vp;
+ IN struct buf *bp;
+};
+
+#
+#% print vp = = =
+#
+vop_print {
+ IN struct vnode *vp;
+};
+
+#
+#% islocked vp = = =
+#
+vop_islocked {
+ IN struct vnode *vp;
+};
+
+#
+#% pathconf vp L L L
+#
+vop_pathconf {
+ IN struct vnode *vp;
+ IN int name;
+ OUT register_t *retval;
+};
+
+#
+#% advlock vp U U U
+#
+vop_advlock {
+ IN struct vnode *vp;
+ IN caddr_t id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+};
+
+#
+#% balloc vp L L L
+#
+vop_balloc {
+ IN struct vnode *vp;
+ IN off_t startoffset;
+ IN int size;
+ IN struct ucred *cred;
+ IN int flags;
+ OUT struct buf **bpp;
+};
+
+#
+#% reallocblks vp L L L
+#
+vop_reallocblks {
+ IN struct vnode *vp;
+ IN struct cluster_save *buflist;
+};
+
+vop_getpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int reqpage;
+ IN vm_ooffset_t offset;
+};
+
+vop_putpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int sync;
+ IN int *rtvals;
+ IN vm_ooffset_t offset;
+};
+
+#
+#% freeblks vp - - -
+#
+# This call is used by the filesystem to release blocks back to the
+# device driver.  This is useful if the driver has lengthy erase
+# handling or similar.
+#
+
+vop_freeblks {
+ IN struct vnode *vp;
+ IN daddr_t addr;
+ IN daddr_t length;
+};
+
+#
+# Needs work: no vp?
+#
+#vop_bwrite {
+# IN struct buf *bp;
+#};