Diffstat (limited to 'sys/kern')
146 files changed, 108911 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 0000000..b958ba7 --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,103 @@ +# $FreeBSD$ +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 + +SYS?= ${.CURDIR}/.. + +# Common files for "make tags", included by the Makefile for each +# architecture. + +# Put the /sys/sys include files at the end so that subroutine definitions +# win when there is a struct tag with the same name (e.g., vmmeter). The +# better solution would be for ctags to generate "struct vmmeter" tags. + +COMM= ${SYS}/dev/advansys/*.[ch] \ + ${SYS}/dev/aha/*.[ch] \ + ${SYS}/dev/aic7xxx/*.[ch] \ + ${SYS}/dev/buslogic/*.[ch] \ + ${SYS}/dev/ccd/*.[ch] \ + ${SYS}/dev/dec/*.[ch] \ + ${SYS}/dev/dpt/*.[ch] \ + ${SYS}/dev/en/*.[ch] \ + ${SYS}/dev/hea/*.[ch] \ + ${SYS}/dev/hfa/*.[ch] \ + ${SYS}/dev/iicbus/*.[ch] \ + ${SYS}/dev/isp/*.[ch] \ + ${SYS}/dev/pdq/*.[ch] \ + ${SYS}/dev/ppbus/*.[ch] \ + ${SYS}/dev/smbus/*.[ch] \ + ${SYS}/dev/vx/*.[ch] \ + ${SYS}/fs/deadfs/*.[ch] \ + ${SYS}/fs/fdescfs/*.[ch] \ + ${SYS}/fs/fifofs/*.[ch] \ + ${SYS}/fs/msdosfs/*.[ch] \ + ${SYS}/fs/nullfs/*.[ch] \ + ${SYS}/fs/portalfs/*.[ch] \ + ${SYS}/fs/procfs/*.[ch] \ + ${SYS}/fs/specfs/*.[ch] \ + ${SYS}/fs/umapfs/*.[ch] \ + ${SYS}/fs/unionfs/*.[ch] \ + ${SYS}/isofs/cd9660/*.[ch] \ + ${SYS}/kern/*.[ch] \ + ${SYS}/net/*.[ch] \ + ${SYS}/netatalk/*.[ch] \ + ${SYS}/netatm/*.[ch] \ + ${SYS}/netinet/*.[ch] \ + ${SYS}/netipx/*.[ch] \ + ${SYS}/netkey/*.[ch] \ + ${SYS}/netnatm/*.[ch] \ + ${SYS}/netns/*.[ch] \ + ${SYS}/nfs/*.[ch] \ + ${SYS}/pci/*.[ch] \ + ${SYS}/posix4/*.[ch] \ + ${SYS}/ufs/ffs/*.[ch] \ + ${SYS}/ufs/ufs/*.[ch] \ + ${SYS}/vm/*.[ch] \ + ${SYS}/sys/*.[ch] + +COMMDIR1= ${SYS}/conf \ + ${SYS}/kern \ + ${SYS}/net \ + ${SYS}/netatalk \ + ${SYS}/netatm \ + ${SYS}/netinet \ + ${SYS}/netipx \ + ${SYS}/netkey \ + ${SYS}/netnatm \ + ${SYS}/netns \ + ${SYS}/nfs \ + ${SYS}/pci \ + ${SYS}/posix4 \ + ${SYS}/vm \ + ${SYS}/sys + +COMMDIR2= ${SYS}/dev/advansys \ + ${SYS}/dev/aha \ + ${SYS}/dev/aic7xxx \ + ${SYS}/dev/buslogic \ + ${SYS}/dev/ccd \ + ${SYS}/dev/dec \ + ${SYS}/dev/dpt \ + ${SYS}/dev/en \ + ${SYS}/dev/hea \ + ${SYS}/dev/hfa \ + ${SYS}/dev/iicbus \ + ${SYS}/dev/isp \ + ${SYS}/dev/pdq \ + ${SYS}/dev/ppbus \ + ${SYS}/dev/smbus \ + ${SYS}/dev/vn \ + ${SYS}/dev/vx \ + ${SYS}/fs/deadfs \ + ${SYS}/fs/devfs \ + ${SYS}/fs/fdescfs \ + ${SYS}/fs/fifofs \ + ${SYS}/fs/msdosfs \ + ${SYS}/fs/nullfs \ + ${SYS}/fs/portalfs \ + ${SYS}/fs/procfs \ + ${SYS}/fs/specfs \ + ${SYS}/fs/umapfs \ + ${SYS}/fs/unionfs \ + ${SYS}/isofs/cd9660 \ + ${SYS}/ufs/ffs \ + ${SYS}/ufs/ufs diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 0000000..cdfcc2a --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,54 @@ +# @(#)Makefile 8.2 (Berkeley) 3/21/94 +# $FreeBSD$ + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h \ +../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + -mv -f ../sys/syscall.mk ../sys/syscall.mk.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. 
The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE_ARCH}/tags. + +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + fs fs/deadfs fs/fdescfs fs/fifofs \ + fs/lofs fs/nullfs fs/portalfs fs/procfs \ + fs/specfs fs/umapfs fs/unionfs \ + hp hp/dev hp/hpux \ + kern libkern \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE_ARCH}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m new file mode 100644 index 0000000..bf8d4ac --- /dev/null +++ b/sys/kern/bus_if.m @@ -0,0 +1,246 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/bus.h> + +INTERFACE bus; + +# +# Default implementations of some methods. +# +CODE { + static struct resource * + null_alloc_resource(device_t dev, device_t child, + int type, int *rid, + u_long start, u_long end, + u_long count, u_int flags) + { + return 0; + } +}; + +# +# This is called from system code which prints out a description of a +# device. It should describe the attachment that the child has with +# the parent. For instance the TurboLaser bus prints which node the +# device is attached to. See bus_generic_print_child.9 for more +# information. +# This method returns the number of characters output. +# +METHOD int print_child { + device_t dev; + device_t child; +}; + +# +# Called for each child device that +# did not succeed in probing for a +# driver. +# +METHOD void probe_nomatch { + device_t dev; + device_t child; +}; + +# +# These two methods manage a bus specific set of instance variables of +# a child device. 
The intention is that each different type of bus +# defines a set of appropriate instance variables (such as ports and +# irqs for ISA bus etc.) +# +# This information could be given to the child device as a struct but +# that makes it hard for a bus to add or remove variables without +# forcing an edit and recompile for all drivers which may not be +# possible for vendor supplied binary drivers. + +# +# Read an instance variable. Return 0 on success. +# +METHOD int read_ivar { + device_t _dev; + device_t _child; + int _indx; + uintptr_t *_result; +}; + +# +# Write an instance variable. Return 0 on success. +# +METHOD int write_ivar { + device_t _dev; + device_t _child; + int _indx; + uintptr_t _value; +}; + +# +# Called after the child's DEVICE_DETACH method to allow the parent +# to reclaim any resources allocated on behalf of the child. +# +METHOD void child_detached { + device_t _dev; + device_t _child; +}; + +# +# Called when a new driver is added to the devclass which owns this +# bus. The generic implementation of this method attempts to probe and +# attach any un-matched children of the bus. +# +METHOD void driver_added { + device_t _dev; + driver_t *_driver; +} DEFAULT bus_generic_driver_added; + +# +# For busses which use use drivers supporting DEVICE_IDENTIFY to +# enumerate their devices, these methods are used to create new +# device instances. If place is non-NULL, the new device will be +# added after the last existing child with the same order. +# +METHOD device_t add_child { + device_t _dev; + int _order; + const char *_name; + int _unit; +}; + +# +# Allocate a system resource attached to `dev' on behalf of `child'. +# The types are defined in <machine/resource.h>; the meaning of the +# resource-ID field varies from bus to bus (but *rid == 0 is always +# valid if the resource type is). start and end reflect the allowable +# range, and should be passed as `0UL' and `~0UL', respectively, if +# the client has no range restriction. count is the number of consecutive +# indices in the resource required. flags is a set of sharing flags +# as defined in <sys/rman.h>. +# +# Returns a resource or a null pointer on failure. The caller is +# responsible for calling rman_activate_resource() when it actually +# uses the resource. +# +METHOD struct resource * alloc_resource { + device_t _dev; + device_t _child; + int _type; + int *_rid; + u_long _start; + u_long _end; + u_long _count; + u_int _flags; +} DEFAULT null_alloc_resource; + +METHOD int activate_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_r; +}; + +METHOD int deactivate_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_r; +}; + +# +# Free a resource allocated by the preceding method. The `rid' value +# must be the same as the one returned by BUS_ALLOC_RESOURCE (which +# is not necessarily the same as the one the client passed). +# +METHOD int release_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_res; +}; + +METHOD int setup_intr { + device_t _dev; + device_t _child; + struct resource *_irq; + int _flags; + driver_intr_t *_intr; + void *_arg; + void **_cookiep; +}; + +METHOD int teardown_intr { + device_t _dev; + device_t _child; + struct resource *_irq; + void *_cookie; +}; + +# +# Set the range used for a particular resource. Return EINVAL if +# the type or rid are out of range. 
+# +METHOD int set_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + u_long _start; + u_long _count; +}; + +# +# Get the range for a resource. Return ENOENT if the type or rid are +# out of range or have not been set. +# +METHOD int get_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + u_long *_startp; + u_long *_countp; +}; + +# +# Delete a resource. +# +METHOD void delete_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; +}; + +# +# Return a struct resource_list. +# +METHOD struct resource_list * get_resource_list { + device_t _dev; + device_t _child; +} DEFAULT bus_generic_get_resource_list; diff --git a/sys/kern/clock_if.m b/sys/kern/clock_if.m new file mode 100644 index 0000000..3ddb25e --- /dev/null +++ b/sys/kern/clock_if.m @@ -0,0 +1,44 @@ +# Copyright (c) 2001 by Thomas Moestl <tmm@FreeBSD.org>. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# $FreeBSD$ + +#include <sys/bus.h> +#include <sys/time.h> + +INTERFACE clock; + +# Interface for clock drivers. This is inspired by the NetBSD device-independent +# clock code (by Gordon W. Ross). + +# An EINVAL error return from this call signifies that the clock has an illegal +# setting. +METHOD int gettime { + device_t dev; + struct timespec *ts; +}; + +METHOD int settime { + device_t dev; + struct timespec *ts; +}; diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m new file mode 100644 index 0000000..005eb38 --- /dev/null +++ b/sys/kern/device_if.m @@ -0,0 +1,127 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/bus.h> + +INTERFACE device; + +# +# Default implementations of some methods. +# +CODE { + static int null_shutdown(device_t dev) + { + return 0; + } + + static int null_suspend(device_t dev) + { + return 0; + } + + static int null_resume(device_t dev) + { + return 0; + } +}; + +# +# Probe to see if the device is present. Return 0 if the device exists, +# ENXIO if it cannot be found. If some other error happens during the +# probe (such as a memory allocation failure), an appropriate error code +# should be returned. For cases where more than one driver matches a +# device, a priority value can be returned. In this case, success codes +# are values less than or equal to zero with the highest value representing +# the best match. Failure codes are represented by positive values and +# the regular unix error codes should be used for the purpose. + +# If a driver returns a success code which is less than zero, it must +# not assume that it will be the same driver which is attached to the +# device. In particular, it must not assume that any values stored in +# the softc structure will be available for its attach method and any +# resources allocated during probe must be released and re-allocated +# if the attach method is called. If a success code of zero is +# returned, the driver can assume that it will be the one attached. +# +# Devices which implement busses should use this method to probe for +# the existence of devices attached to the bus and add them as +# children. If this is combined with the use of bus_generic_attach, +# the child devices will be automatically probed and attached. +# +METHOD int probe { + device_t dev; +}; + +# +# Called by a parent bus to add new devices to the bus. +# +STATICMETHOD void identify { + driver_t *driver; + device_t parent; +}; + +# +# Attach a device to the system. The probe method will have been +# called and will have indicated that the device exists. This routine +# should initialise the hardware and allocate other system resources +# (such as devfs entries). Returns 0 on success. +# +METHOD int attach { + device_t dev; +}; + +# +# Detach a device. This can be called if the user is replacing the +# driver software or if a device is about to be physically removed +# from the system (e.g. for pccard devices). Returns 0 on success. +# +METHOD int detach { + device_t dev; +}; + +# +# This is called during system shutdown to allow the driver to put the +# hardware into a consistent state for rebooting the computer. +# +METHOD int shutdown { + device_t dev; +} DEFAULT null_shutdown; + +# +# This is called by the power-management subsystem when a suspend has been +# requested by the user or by some automatic mechanism. 
This gives +# drivers a chance to veto the suspend or save their configuration before +# power is removed. +# +METHOD int suspend { + device_t dev; +} DEFAULT null_suspend; + +METHOD int resume { + device_t dev; +} DEFAULT null_resume; diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh new file mode 100644 index 0000000..70ad69e --- /dev/null +++ b/sys/kern/genassym.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# $FreeBSD$ + +# Grrr, this should use stdin and stdout, but is encrufted for compatibility. + +usage() +{ + echo "usage: genassym [-o outfile] objfile" + exit 1 +} + +outfile=/dev/stdout +while getopts "o:" option +do + case "$option" in + o) outfile="$OPTARG";; + *) usage;; + esac +done +shift $(($OPTIND - 1)) +case $# in +1) ;; +*) usage;; +esac + +${NM:='nm'} "$1" | ${AWK:='awk'} ' +/ C .*sign$/ { + sign = substr($1, length($1) - 3, 4) + sub("^0*", "", sign) + if (sign != "") + sign = "-" +} +/ C .*w0$/ { + w0 = substr($1, length($1) - 3, 4) +} +/ C .*w1$/ { + w1 = substr($1, length($1) - 3, 4) +} +/ C .*w2$/ { + w2 = substr($1, length($1) - 3, 4) +} +/ C .*w3$/ { + w3 = substr($1, length($1) - 3, 4) + w = w3 w2 w1 w0 + sub("^0*", "", w) + if (w == "") + w = "0" + sub("w3$", "", $3) + # This still has minor problems representing INT_MIN, etc. E.g., + # with 32-bit 2''s complement ints, this prints -0x80000000, which + # has the wrong type (unsigned int). + printf("#define\t%s\t%s0x%s\n", $3, sign, w) +} +' 3>"$outfile" >&3 3>&- diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..41ae8cf --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/user.h> + +#include <machine/md_var.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> + +static int exec_aout_imgact(struct image_params *imgp); + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out", + aout_coredump, + NULL, + MINSIGSTKSZ +}; + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace; + struct vnode *vp; + vm_map_t map; + vm_object_t object; + vm_offset_t text_end, data_end; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + GIANT_REQUIRED; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. + * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + /* Pass PS_STRINGS for BSD/OS binaries only. */ + if (N_GETMID(*a_out) == MID_ZERO) + imgp->ps_strings = PS_STRINGS; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. 
+ */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + mtx_assert(&Giant, MA_OWNED); + if (/* text can't exceed maximum text size */ + a_out->a_text > maxtsiz || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * The vm space can be changed by exec_new_vmspace + */ + vmspace = imgp->proc->p_vmspace; + + vp = imgp->vp; + map = &vmspace->vm_map; + vm_map_lock(map); + VOP_GETVOBJECT(vp, &object); + vm_object_reference(object); + + text_end = virtual_offset + a_out->a_text; + error = vm_map_insert(map, object, + file_offset, + virtual_offset, text_end, + VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + if (error) { + vm_map_unlock(map); + return (error); + } + data_end = text_end + a_out->a_data; + if (a_out->a_data) { + vm_object_reference(object); + error = vm_map_insert(map, object, + file_offset + a_out->a_text, + text_end, data_end, + VM_PROT_ALL, VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + if (error) { + vm_map_unlock(map); + return (error); + } + } + + if (bss_size) { + error = vm_map_insert(map, NULL, 0, + data_end, data_end + bss_size, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_map_unlock(map); + return (error); + } + } + vm_map_unlock(map); + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (virtual_offset + a_out->a_text); + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Dump core, into a file named as described in the comments for + * expand_name(), unless the process was setuid/setgid. 
+ */ +int +aout_coredump(td, vp, limit) + register struct thread *td; + register struct vnode *vp; + off_t limit; +{ + struct proc *p = td->td_proc; + register struct ucred *cred = td->td_ucred; + register struct vmspace *vm = p->p_vmspace; + int error; + + if (ctob((UAREA_PAGES + KSTACK_PAGES) + + vm->vm_dsize + vm->vm_ssize) >= limit) + return (EFAULT); + PROC_LOCK(p); + fill_kinfo_proc(p, &p->p_uarea->u_kproc); + PROC_UNLOCK(p); + error = cpu_coredump(td, vp, cred); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + (int)ctob(vm->vm_dsize), + (off_t)ctob(UAREA_PAGES + KSTACK_PAGES), UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *) NULL, td); + if (error == 0) + error = vn_rdwr_inchunks(UIO_WRITE, vp, + (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), + round_page(ctob(vm->vm_ssize)), + (off_t)ctob(UAREA_PAGES + KSTACK_PAGES) + + ctob(vm->vm_dsize), UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *) NULL, td); + return (error); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +EXEC_SET(aout, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..9044527 --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,1075 @@ +/*- + * Copyright (c) 2000 David O'Brien + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/mman.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/syscall.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> + +#include <machine/elf.h> +#include <machine/md_var.h> + +#define OLD_EI_BRAND 8 + +__ElfType(Brandinfo); +__ElfType(Auxargs); + +static int elf_check_header(const Elf_Ehdr *hdr); +static int elf_freebsd_fixup(register_t **stack_base, + struct image_params *imgp); +static int elf_load_file(struct proc *p, const char *file, u_long *addr, + u_long *entry); +static int elf_load_section(struct proc *p, + struct vmspace *vmspace, struct vnode *vp, + vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, + vm_prot_t prot); +static int exec_elf_imgact(struct image_params *imgp); + +static int elf_trace = 0; +SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); + +struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF", + elf_coredump, + NULL, + MINSIGSTKSZ +}; + +static Elf_Brandinfo freebsd_brand_info = { + ELFOSABI_FREEBSD, + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_brand_inuse(Elf_Brandinfo *entry) +{ + struct proc *p; + int rval = FALSE; + + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + if (p->p_sysent == entry->sysvec) { + rval = TRUE; + break; + } + } + sx_sunlock(&allproc_lock); + + return (rval); +} + +static int +elf_check_header(const Elf_Ehdr *hdr) +{ + if (!IS_ELF(*hdr) || + hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || + hdr->e_ident[EI_DATA] != ELF_TARG_DATA || + hdr->e_ident[EI_VERSION] != EV_CURRENT) + return ENOEXEC; + + if (!ELF_MACHINE_OK(hdr->e_machine)) + return ENOEXEC; + + if (hdr->e_version != ELF_TARG_VER) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t map_len; + vm_offset_t map_addr; + int error, rv; + size_t copy_len; + vm_object_t object; + vm_offset_t file_addr; + vm_offset_t data_buf = 0; + + GIANT_REQUIRED; + + VOP_GETVOBJECT(vp, &object); + error = 0; + + /* + * It's necessary to fail if the filsz + offset taken from the + * header is greater than the actual 
file pager object's size. + * If we were to allow this, then the vm_map_find() below would + * walk right off the end of the file object and into the ether. + * + * While I'm here, might as well check for something else that + * is invalid: filsz cannot be greater than memsz. + */ + if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || + filsz > memsz) { + uprintf("elf_load_section: truncated ELF file\n"); + return (ENOEXEC); + } + + map_addr = trunc_page((vm_offset_t)vmaddr); + file_addr = trunc_page(offset); + + /* + * We have two choices. We can either clear the data in the last page + * of an oversized mapping, or we can start the anon mapping a page + * early and copy the initialized data into that first page. We + * choose the second.. + */ + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - file_addr; + else + map_len = round_page(offset+filsz) - file_addr; + + if (map_len != 0) { + vm_object_reference(object); + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, + object, + file_addr, /* file offset */ + map_addr, /* virtual start */ + map_addr + map_len,/* virtual end */ + prot, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + + /* we can stop now if we've covered it all */ + if (memsz == filsz) { + return 0; + } + } + + + /* + * We have to get the remaining bit of the file into the first part + * of the oversized map segment. This is normally because the .data + * segment in the file is extended to provide bss. It's a neat idea + * to try and save a page, but it's a pain in the behind to implement. + */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page((vm_offset_t)vmaddr + filsz); + map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; + + /* This had damn well better be true! */ + if (map_len != 0) { + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, NULL, 0, + map_addr, map_addr + map_len, + VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + } + + if (copy_len != 0) { + vm_object_reference(object); + rv = vm_map_find(exec_map, + object, + trunc_page(offset + filsz), + &data_buf, + PAGE_SIZE, + TRUE, + VM_PROT_READ, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + + /* send the page fragment to user space */ + error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len); + vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + if (error) { + return (error); + } + } + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + return error; +} + +/* + * Load the file "file" into memory. It may be either a shared object + * or an executable. + * + * The "addr" reference parameter is in/out. On entry, it specifies + * the address where a shared object should be loaded. If the file is + * an executable, this value is ignored. On exit, "addr" specifies + * where the file was actually loaded. + * + * The "entry" reference parameter is out only. On exit, it specifies + * the entry point for the loaded file. 
+ */ +static int +elf_load_file(struct proc *p, const char *file, u_long *addr, u_long *entry) +{ + struct { + struct nameidata nd; + struct vattr attr; + struct image_params image_params; + } *tempdata; + const Elf_Ehdr *hdr = NULL; + const Elf_Phdr *phdr = NULL; + struct nameidata *nd; + struct vmspace *vmspace = p->p_vmspace; + struct vattr *attr; + struct image_params *imgp; + vm_prot_t prot; + u_long rbase; + u_long base_addr = 0; + int error, i, numsegs; + + if (curthread->td_proc != p) + panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */ + + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); + nd = &tempdata->nd; + attr = &tempdata->attr; + imgp = &tempdata->image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = NULL; + imgp->attr = attr; + imgp->firstpage = NULL; + imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE); + + if (imgp->image_header == NULL) { + nd->ni_vp = NULL; + error = ENOMEM; + goto fail; + } + + /* XXXKSE */ + NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); + + if ((error = namei(nd)) != 0) { + nd->ni_vp = NULL; + goto fail; + } + NDFREE(nd, NDF_ONLY_PNBUF); + imgp->vp = nd->ni_vp; + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ + goto fail; + } + + error = exec_map_first_page(imgp); + /* + * Also make certain that the interpreter stays the same, so set + * its VTEXT flag, too. + */ + if (error == 0) + nd->ni_vp->v_flag |= VTEXT; + VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ + if (error) + goto fail; + + hdr = (const Elf_Ehdr *)imgp->image_header; + if ((error = elf_check_header(hdr)) != 0) + goto fail; + if (hdr->e_type == ET_DYN) + rbase = *addr; + else if (hdr->e_type == ET_EXEC) + rbase = 0; + else { + error = ENOEXEC; + goto fail; + } + + /* Only support headers that fit within first page for now */ + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + + phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); + + for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { + if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if ((error = elf_load_section(p, vmspace, nd->ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + rbase, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) != 0) + goto fail; + /* + * Establish the base address if this is the + * first segment. 
+ */ + if (numsegs == 0) + base_addr = trunc_page(phdr[i].p_vaddr + rbase); + numsegs++; + } + } + *addr = base_addr; + *entry=(unsigned long)hdr->e_entry + rbase; + +fail: + if (imgp->firstpage) + exec_unmap_first_page(imgp); + if (imgp->image_header) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header, + PAGE_SIZE); + if (nd->ni_vp) + vrele(nd->ni_vp); + + free(tempdata, M_TEMP); + + return error; +} + +/* + * non static, as it can be overridden by start_init() + */ +#ifdef __ia64__ +int fallback_elf_brand = ELFOSABI_FREEBSD; +#else +int fallback_elf_brand = -1; +#endif +SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW, + &fallback_elf_brand, -1, + "ELF brand of last resort"); + +static int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header; + const Elf_Phdr *phdr; + Elf_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace; + vm_prot_t prot; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i; + const char *interp = NULL; + Elf_Brandinfo *brand_info; + char *path; + + GIANT_REQUIRED; + + /* + * Do we have a valid ELF header ? + */ + if (elf_check_header(hdr) != 0 || hdr->e_type != ET_EXEC) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + /* Only support headers in first page for now */ + return ENOEXEC; + } + phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff); + + /* + * From this point on, we may have resources that need to be freed. + */ + + /* + * Yeah, I'm paranoid. There is every reason in the world to get + * VTEXT now since from here on out, there are places we can have + * a context switch. Better safe than sorry; I really don't want + * the file to change while it's being loaded. + */ + mtx_lock(&imgp->vp->v_interlock); + imgp->vp->v_flag |= VTEXT; + mtx_unlock(&imgp->vp->v_interlock); + + if ((error = exec_extract_strings(imgp)) != 0) + goto fail; + + exec_new_vmspace(imgp); + + vmspace = imgp->proc->p_vmspace; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_LOAD: /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if ((error = elf_load_section(imgp->proc, + vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) != 0) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + } + break; + case PT_INTERP: /* Path to interpreter */ + if (phdr[i].p_filesz > MAXPATHLEN || + phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + interp = imgp->image_header + phdr[i].p_offset; + break; + case PT_PHDR: /* Program header table info */ + proghdr = phdr[i].p_vaddr; + break; + default: + break; + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; + + addr = ELF_RTLD_ADDR(vmspace); + + imgp->entry_addr = entry; + + brand_info = NULL; + + /* We support three types of branding -- (1) the ELF EI_OSABI field + * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string + * branding w/in the ELF header, and (3) path of the `interp_path' + * field. We should also look for an ".note.ABI-tag" ELF section now + * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones. + */ + + /* If the executable has a brand, search for it in the brand list. */ + if (brand_info == NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + (hdr->e_ident[EI_OSABI] == bi->brand + || 0 == + strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], + bi->compat_3_brand, strlen(bi->compat_3_brand)))) { + brand_info = bi; + break; + } + } + } + + /* Lacking a known brand, search for a recognized interpreter. 
*/ + if (brand_info == NULL && interp != NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + strcmp(interp, bi->interp_path) == 0) { + brand_info = bi; + break; + } + } + } + + /* Lacking a recognized interpreter, try the default brand */ + if (brand_info == NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && fallback_elf_brand == bi->brand) { + brand_info = bi; + break; + } + } + } + + if (brand_info == NULL) { + uprintf("ELF binary type \"%u\" not known.\n", + hdr->e_ident[EI_OSABI]); + error = ENOEXEC; + goto fail; + } + + imgp->proc->p_sysent = brand_info->sysvec; + if (interp != NULL) { + path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + snprintf(path, MAXPATHLEN, "%s%s", + brand_info->emul_path, interp); + if ((error = elf_load_file(imgp->proc, path, &addr, + &imgp->entry_addr)) != 0) { + if ((error = elf_load_file(imgp->proc, interp, &addr, + &imgp->entry_addr)) != 0) { + uprintf("ELF interpreter %s not found\n", path); + free(path, M_TEMP); + goto fail; + } + } + free(path, M_TEMP); + } + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + +fail: + return error; +} + +static int +elf_freebsd_fixup(register_t **stack_base, struct image_params *imgp) +{ + Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; + register_t *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + suword(*stack_base, (long) imgp->argc); + return 0; +} + +/* + * Code for generating ELF core dumps. + */ + +typedef void (*segment_callback)(vm_map_entry_t, void *); + +/* Closure for cb_put_phdr(). */ +struct phdr_closure { + Elf_Phdr *phdr; /* Program header to fill in */ + Elf_Off offset; /* Offset of segment in core file */ +}; + +/* Closure for cb_size_segment(). */ +struct sseg_closure { + int count; /* Count of writable segments. */ + size_t size; /* Total size of all writable segments. 
*/ +}; + +static void cb_put_phdr(vm_map_entry_t, void *); +static void cb_size_segment(vm_map_entry_t, void *); +static void each_writable_segment(struct proc *, segment_callback, void *); +static int elf_corehdr(struct thread *, struct vnode *, struct ucred *, + int, void *, size_t); +static void elf_puthdr(struct proc *, void *, size_t *, + const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int); +static void elf_putnote(void *, size_t *, const char *, int, + const void *, size_t); + +extern int osreldate; + +int +elf_coredump(td, vp, limit) + struct thread *td; + register struct vnode *vp; + off_t limit; +{ + register struct proc *p = td->td_proc; + register struct ucred *cred = td->td_ucred; + int error = 0; + struct sseg_closure seginfo; + void *hdr; + size_t hdrsize; + + /* Size the program segments. */ + seginfo.count = 0; + seginfo.size = 0; + each_writable_segment(p, cb_size_segment, &seginfo); + + /* + * Calculate the size of the core file header area by making + * a dry run of generating it. Nothing is written, but the + * size is calculated. + */ + hdrsize = 0; + elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize, + (const prstatus_t *)NULL, (const prfpregset_t *)NULL, + (const prpsinfo_t *)NULL, seginfo.count); + + if (hdrsize + seginfo.size >= limit) + return (EFAULT); + + /* + * Allocate memory for building the header, fill it up, + * and write it out. + */ + hdr = malloc(hdrsize, M_TEMP, M_WAITOK); + if (hdr == NULL) { + return EINVAL; + } + error = elf_corehdr(td, vp, cred, seginfo.count, hdr, hdrsize); + + /* Write the contents of all of the writable segments. */ + if (error == 0) { + Elf_Phdr *php; + off_t offset; + int i; + + php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; + offset = hdrsize; + for (i = 0; i < seginfo.count; i++) { + error = vn_rdwr_inchunks(UIO_WRITE, vp, + (caddr_t)php->p_vaddr, + php->p_filesz, offset, UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *)NULL, curthread); /* XXXKSE */ + if (error != 0) + break; + offset += php->p_filesz; + php++; + } + } + free(hdr, M_TEMP); + + return error; +} + +/* + * A callback for each_writable_segment() to write out the segment's + * program header entry. + */ +static void +cb_put_phdr(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct phdr_closure *phc = (struct phdr_closure *)closure; + Elf_Phdr *phdr = phc->phdr; + + phc->offset = round_page(phc->offset); + + phdr->p_type = PT_LOAD; + phdr->p_offset = phc->offset; + phdr->p_vaddr = entry->start; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; + phdr->p_align = PAGE_SIZE; + phdr->p_flags = 0; + if (entry->protection & VM_PROT_READ) + phdr->p_flags |= PF_R; + if (entry->protection & VM_PROT_WRITE) + phdr->p_flags |= PF_W; + if (entry->protection & VM_PROT_EXECUTE) + phdr->p_flags |= PF_X; + + phc->offset += phdr->p_filesz; + phc->phdr++; +} + +/* + * A callback for each_writable_segment() to gather information about + * the number of segments and their total size. + */ +static void +cb_size_segment(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct sseg_closure *ssc = (struct sseg_closure *)closure; + + ssc->count++; + ssc->size += entry->end - entry->start; +} + +/* + * For each writable segment in the process's memory map, call the given + * function with a pointer to the map entry and some arbitrary + * caller-supplied data. 
+ */ +static void +each_writable_segment(p, func, closure) + struct proc *p; + segment_callback func; + void *closure; +{ + vm_map_t map = &p->p_vmspace->vm_map; + vm_map_entry_t entry; + + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + vm_object_t obj; + + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || + (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) != + (VM_PROT_READ|VM_PROT_WRITE)) + continue; + + /* + ** Dont include memory segment in the coredump if + ** MAP_NOCORE is set in mmap(2) or MADV_NOCORE in + ** madvise(2). + */ + if (entry->eflags & MAP_ENTRY_NOCOREDUMP) + continue; + + if ((obj = entry->object.vm_object) == NULL) + continue; + + /* Find the deepest backing object. */ + while (obj->backing_object != NULL) + obj = obj->backing_object; + + /* Ignore memory-mapped devices and such things. */ + if (obj->type != OBJT_DEFAULT && + obj->type != OBJT_SWAP && + obj->type != OBJT_VNODE) + continue; + + (*func)(entry, closure); + } +} + +/* + * Write the core file header to the file, including padding up to + * the page boundary. + */ +static int +elf_corehdr(td, vp, cred, numsegs, hdr, hdrsize) + struct thread *td; + struct vnode *vp; + struct ucred *cred; + int numsegs; + size_t hdrsize; + void *hdr; +{ + struct { + prstatus_t status; + prfpregset_t fpregset; + prpsinfo_t psinfo; + } *tempdata; + struct proc *p = td->td_proc; + size_t off; + prstatus_t *status; + prfpregset_t *fpregset; + prpsinfo_t *psinfo; + + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK); + status = &tempdata->status; + fpregset = &tempdata->fpregset; + psinfo = &tempdata->psinfo; + + /* Gather the information for the header. */ + status->pr_version = PRSTATUS_VERSION; + status->pr_statussz = sizeof(prstatus_t); + status->pr_gregsetsz = sizeof(gregset_t); + status->pr_fpregsetsz = sizeof(fpregset_t); + status->pr_osreldate = osreldate; + status->pr_cursig = p->p_sig; + status->pr_pid = p->p_pid; + fill_regs(td, &status->pr_reg); + + fill_fpregs(td, fpregset); + + psinfo->pr_version = PRPSINFO_VERSION; + psinfo->pr_psinfosz = sizeof(prpsinfo_t); + strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1); + + /* XXX - We don't fill in the command line arguments properly yet. */ + strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ); + + /* Fill in the header. */ + bzero(hdr, hdrsize); + off = 0; + elf_puthdr(p, hdr, &off, status, fpregset, psinfo, numsegs); + + free(tempdata, M_TEMP); + + /* Write it to the core file. */ + return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, + UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NULL, td); /* XXXKSE */ +} + +static void +elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status, + const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs) +{ + size_t ehoff; + size_t phoff; + size_t noteoff; + size_t notesz; + + ehoff = *off; + *off += sizeof(Elf_Ehdr); + + phoff = *off; + *off += (numsegs + 1) * sizeof(Elf_Phdr); + + noteoff = *off; + elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status, + sizeof *status); + elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset, + sizeof *fpregset); + elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, + sizeof *psinfo); + notesz = *off - noteoff; + + /* Align up to a page boundary for the program segments. */ + *off = round_page(*off); + + if (dst != NULL) { + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + struct phdr_closure phc; + + /* + * Fill in the ELF header. 
+ */ + ehdr = (Elf_Ehdr *)((char *)dst + ehoff); + ehdr->e_ident[EI_MAG0] = ELFMAG0; + ehdr->e_ident[EI_MAG1] = ELFMAG1; + ehdr->e_ident[EI_MAG2] = ELFMAG2; + ehdr->e_ident[EI_MAG3] = ELFMAG3; + ehdr->e_ident[EI_CLASS] = ELF_CLASS; + ehdr->e_ident[EI_DATA] = ELF_DATA; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; + ehdr->e_ident[EI_ABIVERSION] = 0; + ehdr->e_ident[EI_PAD] = 0; + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = phoff; + ehdr->e_flags = 0; + ehdr->e_ehsize = sizeof(Elf_Ehdr); + ehdr->e_phentsize = sizeof(Elf_Phdr); + ehdr->e_phnum = numsegs + 1; + ehdr->e_shentsize = sizeof(Elf_Shdr); + ehdr->e_shnum = 0; + ehdr->e_shstrndx = SHN_UNDEF; + + /* + * Fill in the program header entries. + */ + phdr = (Elf_Phdr *)((char *)dst + phoff); + + /* The note segement. */ + phdr->p_type = PT_NOTE; + phdr->p_offset = noteoff; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = notesz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + phdr++; + + /* All the writable segments from the program. */ + phc.phdr = phdr; + phc.offset = *off; + each_writable_segment(p, cb_put_phdr, &phc); + } +} + +static void +elf_putnote(void *dst, size_t *off, const char *name, int type, + const void *desc, size_t descsz) +{ + Elf_Note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = descsz; + note.n_type = type; + if (dst != NULL) + bcopy(¬e, (char *)dst + *off, sizeof note); + *off += sizeof note; + if (dst != NULL) + bcopy(name, (char *)dst + *off, note.n_namesz); + *off += roundup2(note.n_namesz, sizeof(Elf_Size)); + if (dst != NULL) + bcopy(desc, (char *)dst + *off, note.n_descsz); + *off += roundup2(note.n_descsz, sizeof(Elf_Size)); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +EXEC_SET(elf, elf_execsw); diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..57a5c1d --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,385 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + * This module handles execution of a.out files which have been run through + * "gzip". This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... 
+ * tidy up unnecesary includes + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int gotheader; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact(struct image_params *imgp); +static int NextByte(void *vp); +static int do_aout_hdr(struct imgact_gzip *); +static int Flush(void *vp, u_char *, u_long siz); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + /* + * The unzipped file may not even have been long enough to contain + * a header giving Flush() a chance to return error. Check for this. + */ + if ( !igz.gotheader ) + return ENOEXEC; + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + mtx_assert(&Giant, MA_OWNED); + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > maxtsiz || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmspace = gz->ip->proc->p_vmspace; + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, + VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. 
+ */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (gz->virtual_offset + gz->a_out.a_text); + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + gz->gotheader = 1; + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset == 0) { + q = (u_char *) (uintptr_t) gz->virtual_offset; + copyout(&gz->a_out, q, sizeof gz->a_out); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && + gz->output + siz > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) (uintptr_t) + (gz->virtual_offset + gz->output - gz->file_offset); + copyout(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +EXEC_SET(execgzip, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..8480fcc --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? */ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n' && *ihp != '#'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENAMETOOLONG); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. + */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + imgp->argv0 = imgp->uap->fname; + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. 
+ */ +static struct execsw shell_execsw = { exec_shell_imgact, "#!" }; +EXEC_SET(shell, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..2a16ba2 --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1078 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef _KERNEL +#include <sys/systm.h> +#include <sys/kernel.h> +#endif +#include <sys/malloc.h> + +#ifdef _KERNEL +static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees"); +#endif + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef _KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef _KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef _KERNEL /* want to use this file in kzip also */ +extern unsigned char *kzipmalloc (int); +extern void kzipfree (void*); +#define malloc(x, y, z) kzipmalloc((x)) +#define free(x, y) kzipfree((x)) +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. + + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. 
Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. The decoding process is basically: get a literal/length + code; if EOB then done; if a literal, emit the decoded byte; if a + length then get the distance and emit the referred-to bytes from the + sliding window of previously emitted data. + + There are (currently) three kinds of inflate blocks: stored, fixed, and + dynamic. The compressor outputs a chunk of data at a time and decides + which method to use on a chunk-by-chunk basis. A chunk might typically + be 32K to 64K, uncompressed. 
If the chunk is uncompressible, then the + "stored" method is used. In this case, the bytes are simply stored as + is, eight bits per byte, with none of the above coding. The bytes are + preceded by a count, since there is no longer an EOB code. + + If the data is compressible, then either the fixed or dynamic methods + are used. In the dynamic method, the compressed data is preceded by + an encoding of the literal/length and distance Huffman codes that are + to be used to decode this block. The representation is itself Huffman + coded, and so is preceded by a description of that code. These code + descriptions take up a little space, and so for small blocks, there is + a predefined set of codes, called the fixed codes. The fixed method is + used if the block ends up smaller that way (usually for quite small + chunks); otherwise the dynamic method is used. In the latter case, the + codes are customized to the probabilities in the current block and so + can code it much better than the pre-determined fixed codes can. + + The Huffman codes themselves are decoded using a mutli-level table + lookup, in order to maximize the speed of decoding plus the speed of + building the decoding tables. See the comments below that precede the + lbits and dbits tuning parameters. + */ + + +/* + Notes beyond the 1.93a appnote.txt: + + 1. Distance pointers never point before the beginning of the output + stream. + 2. Distance pointers can point back across blocks, up to 32k away. + 3. There is an implied maximum of 7 bits for the bit length table and + 15 bits for the actual data. + 4. If only one code exists, then it is encoded using one bit. (Zero + would be more efficient, but perhaps a little confusing.) If two + codes exist, they are coded using one bit each (0 and 1). + 5. There is no way of sending zero distance codes--a dummy must be + sent if there are none. (History: a pre 2.0 version of PKZIP would + store blocks with no distance codes, but this was discovered to be + too harsh a criterion.) Valid only for 1.93a. 2.04c does allow + zero distance codes, which is sent as one code of zero bits in + length. + 6. There are up to 286 literal/length codes. Code 256 represents the + end-of-block. Note however that the static length tree defines + 288 codes just to fill out the Huffman codes. Codes 286 and 287 + cannot be used though, since there is no length base or extra bits + defined for them. Similarily, there are up to 30 distance codes. + However, static trees define 32 codes (all 5 bits) to fill out the + Huffman codes, but the last two had better not show up in the data. + 7. Unzip can check dynamic Huffman blocks for complete code sets. + The exception is that a single code would not be complete (see #4). + 8. The five bits following the block type is really the number of + literal codes sent minus 257. + 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits + (1+6+6). Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. 
The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build(struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *); +static int huft_free(struct inflate *, struct huft *); +static int inflate_codes(struct inflate *, struct huft *, struct huft *, int, int); +static int inflate_stored(struct inflate *); +static int xinflate(struct inflate *); +static int inflate_fixed(struct inflate *); +static int inflate_dynamic(struct inflate *); +static int inflate_block(struct inflate *, int *); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. */ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. 
*/ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. 
The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. */ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef _KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g - w) > (unsigned) *m ? 
*m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. 
*/ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. */ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + 
return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? */ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef _KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef _KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef _KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 0000000..d5c5656 --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,669 @@ +/* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)init_main.c 8.9 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_init_path.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/vnode.h> +#include <sys/sysent.h> +#include <sys/reboot.h> +#include <sys/sx.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> +#include <sys/unistd.h> +#include <sys/malloc.h> +#include <sys/conf.h> + +#include <machine/cpu.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <sys/copyright.h> + +void mi_startup(void); /* Should be elsewhere */ + +/* Components of the first process -- never freed. */ +static struct session session0; +static struct pgrp pgrp0; +struct proc proc0; +struct thread thread0; +static struct procsig procsig0; +static struct filedesc0 filedesc0; +static struct plimit limit0; +static struct vmspace vmspace0; +struct proc *initproc; + +int cmask = CMASK; +extern int fallback_elf_brand; + +struct vnode *rootvp; +int boothowto = 0; /* initialized so that it can be patched */ +SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); +int bootverbose; +SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); + +/* + * This ensures that there is at least one entry so that the sysinit_set + * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never + * executed. + */ +SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) + +/* + * The sysinit table itself. Items are checked off as the are run. + * If we want to register new sysinit types, add them to newsysinit. + */ +SET_DECLARE(sysinit_set, struct sysinit); +struct sysinit **sysinit, **sysinit_end; +struct sysinit **newsysinit, **newsysinit_end; + +/* + * Merge a new sysinit set into the current set, reallocating it if + * necessary. This can only be called after malloc is running. 
+ */ +void +sysinit_add(struct sysinit **set, struct sysinit **set_end) +{ + struct sysinit **newset; + struct sysinit **sipp; + struct sysinit **xipp; + int count; + + count = set_end - set; + if (newsysinit) + count += newsysinit_end - newsysinit; + else + count += sysinit_end - sysinit; + newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); + if (newset == NULL) + panic("cannot malloc for sysinit"); + xipp = newset; + if (newsysinit) + for (sipp = newsysinit; sipp < newsysinit_end; sipp++) + *xipp++ = *sipp; + else + for (sipp = sysinit; sipp < sysinit_end; sipp++) + *xipp++ = *sipp; + for (sipp = set; sipp < set_end; sipp++) + *xipp++ = *sipp; + if (newsysinit) + free(newsysinit, M_TEMP); + newsysinit = newset; + newsysinit_end = newset + count; +} + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads". + */ +void +mi_startup(void) +{ + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + + if (sysinit == NULL) { + sysinit = SET_BEGIN(sysinit_set); + sysinit_end = SET_LIMIT(sysinit_set); + } + +restart: + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + */ + for (sipp = sysinit; sipp < sysinit_end; sipp++) { + for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. + */ + for (sipp = sysinit; sipp < sysinit_end; sipp++) { + + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + if ((*sipp)->subsystem == SI_SUB_DONE) + continue; + + /* Call function */ + (*((*sipp)->func))((*sipp)->udata); + + /* Check off the one we're just done */ + (*sipp)->subsystem = SI_SUB_DONE; + + /* Check if we've installed more sysinit items via KLD */ + if (newsysinit != NULL) { + if (sysinit != SET_BEGIN(sysinit_set)) + free(sysinit, M_TEMP); + sysinit = newsysinit; + sysinit_end = newsysinit_end; + newsysinit = NULL; + newsysinit_end = NULL; + goto restart; + } + } + + panic("Shouldn't get here!"); + /* NOTREACHED*/ +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. 
+ **** + *************************************************************************** + */ +static void +print_caddr_t(void *data __unused) +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) +SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version) + +static void +set_boot_verbose(void *data __unused) +{ + + if (boothowto & RB_VERBOSE) + bootverbose++; +} +SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void +proc0_init(void *dummy __unused) +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + struct thread *td; + struct ksegrp *kg; + struct kse *ke; + + GIANT_REQUIRED; + p = &proc0; + td = &thread0; + + /* + * Initialize magic number. + */ + p->p_magic = P_MAGIC; + + /* + * Initialize thread, process and pgrp structures. + */ + procinit(); + + /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* + * additional VM structures + */ + vm_init2(); + + /* + * Create process 0 (the swapper). + */ + LIST_INSERT_HEAD(&allproc, p, p_list); + LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); + mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); + p->p_pgrp = &pgrp0; + LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); + LIST_INIT(&pgrp0.pg_members); + LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); + + pgrp0.pg_session = &session0; + mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); + session0.s_count = 1; + session0.s_leader = p; + +#ifdef __ELF__ + p->p_sysent = &elf_freebsd_sysvec; +#else + p->p_sysent = &aout_sysvec; +#endif + + ke = &proc0.p_kse; /* XXXKSE */ + kg = &proc0.p_ksegrp; /* XXXKSE */ + p->p_flag = P_SYSTEM; + p->p_sflag = PS_INMEM; + p->p_stat = SRUN; + p->p_ksegrp.kg_nice = NZERO; + kg->kg_pri_class = PRI_TIMESHARE; + kg->kg_user_pri = PUSER; + td->td_priority = PVM; + td->td_base_pri = PUSER; + + p->p_peers = 0; + p->p_leader = p; + + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + callout_init(&p->p_itcallout, 0); + callout_init(&td->td_slpcallout, 1); + + /* Create credentials. */ + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + p->p_ucred->cr_uidinfo = uifind(0); + p->p_ucred->cr_ruidinfo = uifind(0); + p->p_ucred->cr_prison = NULL; /* Don't jail it. */ + td->td_ucred = crhold(p->p_ucred); + + /* Create procsig. */ + p->p_procsig = &procsig0; + p->p_procsig->ps_refcnt = 1; + + /* Initialize signal state for process 0. */ + siginit(&proc0); + + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. 
*/ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_cpulimit = RLIM_INFINITY; + limit0.p_refcnt = 1; + + /* Allocate a prototype map so we have something to fork. */ + pmap_pinit0(vmspace_pmap(&vmspace0)); + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS)); + vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_uarea->u_stats; + p->p_sigacts = &p->p_uarea->u_sigacts; + + /* + * Charge root for one process. + */ + (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) + +/* ARGSUSED*/ +static void +proc0_post(void *dummy __unused) +{ + struct timespec ts; + struct proc *p; + + /* + * Now we can look at the time, having had a chance to verify the + * time from the filesystem. Pretend that proc0 started now. + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + microtime(&p->p_stats->p_start); + p->p_runtime.sec = 0; + p->p_runtime.frac = 0; + } + sx_sunlock(&allproc_lock); + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + /* + * Give the ``random'' number generator a thump. + */ + nanotime(&ts); + srandom(ts.tv_sec ^ ts.tv_nsec); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) + +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ + + +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. + **** + *************************************************************************** + */ + +/* + * List of paths to try when searching for "init". + */ +static char init_path[MAXPATHLEN] = +#ifdef INIT_PATH + __XSTRING(INIT_PATH); +#else + "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; +#endif +SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, + "Path used to search the init process"); + +/* + * Start the initial user process; try exec'ing each pathname in init_path. + * The program is invoked with one argument containing the boot flags. + */ +static void +start_init(void *dummy) +{ + vm_offset_t addr; + struct execve_args args; + int options, error; + char *var, *path, *next, *s; + char *ucp, **uap, *arg0, *arg1; + struct thread *td; + struct proc *p; + int init_does_devfs = 0; + + mtx_lock(&Giant); + + GIANT_REQUIRED; + + td = curthread; + p = td->td_proc; + + vfs_mountroot(NULL); + + /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ + if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) + panic("cannot find root vnode"); + FILEDESC_LOCK(p->p_fd); + p->p_fd->fd_cdir = rootvnode; + VREF(p->p_fd->fd_cdir); + p->p_fd->fd_rdir = rootvnode; + VREF(p->p_fd->fd_rdir); + FILEDESC_UNLOCK(p->p_fd); + VOP_UNLOCK(rootvnode, 0, td); + + if (devfs_present) { + /* + * For disk based systems, we probably cannot do this yet + * since the fs will be read-only. But a NFS root + * might be ok. It is worth a shot. + */ + error = vn_mkdir("/dev", 0700, UIO_SYSSPACE, td); + if (error == EEXIST) + error = 0; + if (error == 0) + error = kernel_vmount(0, "fstype", "devfs", + "fspath", "/dev", NULL); + if (error != 0) + init_does_devfs = 1; + } + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(USRSTACK - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; + + if ((var = getenv("init_path")) != NULL) { + strncpy(init_path, var, sizeof init_path); + init_path[sizeof init_path - 1] = 0; + freeenv(var); + } + if ((var = getenv("kern.fallback_elf_brand")) != NULL) { + fallback_elf_brand = strtol(var, NULL, 0); + freeenv(var); + } + + for (path = init_path; *path != '\0'; path = next) { + while (*path == ':') + path++; + if (*path == '\0') + break; + for (next = path; *next != '\0' && *next != ':'; next++) + /* nothing */ ; + if (bootverbose) + printf("start_init: trying %.*s\n", (int)(next - path), + path); + + /* + * Move out the boot flag argument. + */ + options = 0; + ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ + if (boothowto & RB_SINGLE) { + (void)subyte(--ucp, 's'); + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + (void)subyte(--ucp, 'f'); + options = 1; + } +#endif + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + if (init_does_devfs) { + (void)subyte(--ucp, 'd'); + options = 1; + } + + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; + + /* + * Move out the file name (also arg 0). + */ + (void)subyte(--ucp, 0); + for (s = next - 1; s >= path; s--) + (void)subyte(--ucp, *s); + arg0 = ucp; + + /* + * Move out the arg pointers. + */ + uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); + (void)suword((caddr_t)--uap, (long)0); /* terminator */ + (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); + (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); + + /* + * Point at the arguments. + */ + args.fname = arg0; + args.argv = uap; + args.envv = NULL; + + /* + * Now try to exec the program. If can't for any reason + * other than it doesn't exist, complain. + * + * Otherwise, return via fork_trampoline() all the way + * to user mode as init! + */ + if ((error = execve(td, &args)) == 0) { + mtx_unlock(&Giant); + return; + } + if (error != ENOENT) + printf("exec %.*s: error %d\n", (int)(next - path), + path, error); + } + printf("init: not found in path %s\n", init_path); + panic("no init"); +} + +/* + * Like kthread_create(), but runs in it's own address space. + * We do this early to reserve pid 1. + * + * Note special case - do not make it runnable yet. Other work + * in progress will change this more. 
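+ * fork1() is therefore called with RFSTOPPED; the new process stays
+ * parked until kick_init() below marks it runnable at
+ * SI_SUB_KTHREAD_INIT.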
+ */ +static void +create_init(const void *udata __unused) +{ + struct ucred *newcred, *oldcred; + int error; + + error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, &initproc); + if (error) + panic("cannot fork init: %d\n", error); + /* divorce init's credentials from the kernel's */ + newcred = crget(); + PROC_LOCK(initproc); + initproc->p_flag |= P_SYSTEM; + oldcred = initproc->p_ucred; + crcopy(newcred, oldcred); + initproc->p_ucred = newcred; + PROC_UNLOCK(initproc); + crfree(oldcred); + mtx_lock_spin(&sched_lock); + initproc->p_sflag |= PS_INMEM; + mtx_unlock_spin(&sched_lock); + cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); +} +SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) + +/* + * Make it runnable now. + */ +static void +kick_init(const void *udata __unused) +{ + struct thread *td; + + td = FIRST_THREAD_IN_PROC(initproc); + mtx_lock_spin(&sched_lock); + initproc->p_stat = SRUN; + setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */ + mtx_unlock_spin(&sched_lock); +} +SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 0000000..425e3b7 --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,418 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +#define AS(name) (sizeof(struct name) / sizeof(register_t)) + +#ifdef COMPAT_43 +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) +#else +#define compat(n, name) 0, (sy_call_t *)nosys +#endif + +/* The casts are bogus but will do for now. 
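+ * Each entry pairs the argument size in register_t words (optionally
+ * OR'ed with SYF_MPSAFE) with the handler function; unimplemented and
+ * obsolete slots fall through to nosys().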
*/ +struct sysent sysent[] = { + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { SYF_MPSAFE | AS(sys_exit_args), (sy_call_t *)sys_exit }, /* 1 = exit */ + { SYF_MPSAFE | 0, (sy_call_t *)fork }, /* 2 = fork */ + { SYF_MPSAFE | AS(read_args), (sy_call_t *)read }, /* 3 = read */ + { SYF_MPSAFE | AS(write_args), (sy_call_t *)write }, /* 4 = write */ + { AS(open_args), (sy_call_t *)open }, /* 5 = open */ + { SYF_MPSAFE | AS(close_args), (sy_call_t *)close }, /* 6 = close */ + { SYF_MPSAFE | AS(wait_args), (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(AS(ocreat_args),creat) }, /* 8 = old creat */ + { AS(link_args), (sy_call_t *)link }, /* 9 = link */ + { AS(unlink_args), (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { AS(chdir_args), (sy_call_t *)chdir }, /* 12 = chdir */ + { AS(fchdir_args), (sy_call_t *)fchdir }, /* 13 = fchdir */ + { AS(mknod_args), (sy_call_t *)mknod }, /* 14 = mknod */ + { AS(chmod_args), (sy_call_t *)chmod }, /* 15 = chmod */ + { AS(chown_args), (sy_call_t *)chown }, /* 16 = chown */ + { SYF_MPSAFE | AS(obreak_args), (sy_call_t *)obreak }, /* 17 = break */ + { AS(getfsstat_args), (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(AS(olseek_args),lseek) }, /* 19 = old lseek */ + { SYF_MPSAFE | 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { AS(mount_args), (sy_call_t *)mount }, /* 21 = mount */ + { AS(unmount_args), (sy_call_t *)unmount }, /* 22 = unmount */ + { SYF_MPSAFE | AS(setuid_args), (sy_call_t *)setuid }, /* 23 = setuid */ + { SYF_MPSAFE | 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { SYF_MPSAFE | 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { AS(ptrace_args), (sy_call_t *)ptrace }, /* 26 = ptrace */ + { SYF_MPSAFE | AS(recvmsg_args), (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { SYF_MPSAFE | AS(sendmsg_args), (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { SYF_MPSAFE | AS(recvfrom_args), (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { SYF_MPSAFE | AS(accept_args), (sy_call_t *)accept }, /* 30 = accept */ + { SYF_MPSAFE | AS(getpeername_args), (sy_call_t *)getpeername }, /* 31 = getpeername */ + { SYF_MPSAFE | AS(getsockname_args), (sy_call_t *)getsockname }, /* 32 = getsockname */ + { AS(access_args), (sy_call_t *)access }, /* 33 = access */ + { AS(chflags_args), (sy_call_t *)chflags }, /* 34 = chflags */ + { AS(fchflags_args), (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { SYF_MPSAFE | AS(kill_args), (sy_call_t *)kill }, /* 37 = kill */ + { compat(AS(ostat_args),stat) }, /* 38 = old stat */ + { SYF_MPSAFE | 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(AS(olstat_args),lstat) }, /* 40 = old lstat */ + { AS(dup_args), (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { SYF_MPSAFE | 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { SYF_MPSAFE | AS(profil_args), (sy_call_t *)profil }, /* 44 = profil */ + { AS(ktrace_args), (sy_call_t *)ktrace }, /* 45 = ktrace */ + { compat(SYF_MPSAFE | AS(osigaction_args),sigaction) }, /* 46 = old sigaction */ + { SYF_MPSAFE | 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { compat(SYF_MPSAFE | AS(osigprocmask_args),sigprocmask) }, /* 48 = old sigprocmask */ + { SYF_MPSAFE | AS(getlogin_args), (sy_call_t *)getlogin }, /* 49 = getlogin */ + { SYF_MPSAFE | AS(setlogin_args), (sy_call_t *)setlogin }, /* 50 = setlogin */ + { SYF_MPSAFE | AS(acct_args), (sy_call_t *)acct }, /* 51 = acct */ + { compat(SYF_MPSAFE | 0,sigpending) }, /* 52 = old sigpending */ + { SYF_MPSAFE | 
AS(sigaltstack_args), (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { SYF_MPSAFE | AS(ioctl_args), (sy_call_t *)ioctl }, /* 54 = ioctl */ + { SYF_MPSAFE | AS(reboot_args), (sy_call_t *)reboot }, /* 55 = reboot */ + { AS(revoke_args), (sy_call_t *)revoke }, /* 56 = revoke */ + { AS(symlink_args), (sy_call_t *)symlink }, /* 57 = symlink */ + { AS(readlink_args), (sy_call_t *)readlink }, /* 58 = readlink */ + { SYF_MPSAFE | AS(execve_args), (sy_call_t *)execve }, /* 59 = execve */ + { SYF_MPSAFE | AS(umask_args), (sy_call_t *)umask }, /* 60 = umask */ + { AS(chroot_args), (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(SYF_MPSAFE | AS(ofstat_args),fstat) }, /* 62 = old fstat */ + { compat(SYF_MPSAFE | AS(getkerninfo_args),getkerninfo) }, /* 63 = old getkerninfo */ + { compat(SYF_MPSAFE | 0,getpagesize) }, /* 64 = old getpagesize */ + { AS(msync_args), (sy_call_t *)msync }, /* 65 = msync */ + { SYF_MPSAFE | 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { SYF_MPSAFE | AS(sbrk_args), (sy_call_t *)sbrk }, /* 69 = sbrk */ + { SYF_MPSAFE | AS(sstk_args), (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(SYF_MPSAFE | AS(ommap_args),mmap) }, /* 71 = old mmap */ + { SYF_MPSAFE | AS(ovadvise_args), (sy_call_t *)ovadvise }, /* 72 = vadvise */ + { SYF_MPSAFE | AS(munmap_args), (sy_call_t *)munmap }, /* 73 = munmap */ + { SYF_MPSAFE | AS(mprotect_args), (sy_call_t *)mprotect }, /* 74 = mprotect */ + { SYF_MPSAFE | AS(madvise_args), (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { SYF_MPSAFE | AS(mincore_args), (sy_call_t *)mincore }, /* 78 = mincore */ + { SYF_MPSAFE | AS(getgroups_args), (sy_call_t *)getgroups }, /* 79 = getgroups */ + { SYF_MPSAFE | AS(setgroups_args), (sy_call_t *)setgroups }, /* 80 = setgroups */ + { SYF_MPSAFE | 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { SYF_MPSAFE | AS(setpgid_args), (sy_call_t *)setpgid }, /* 82 = setpgid */ + { SYF_MPSAFE | AS(setitimer_args), (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(SYF_MPSAFE | 0,wait) }, /* 84 = old wait */ + { SYF_MPSAFE | AS(swapon_args), (sy_call_t *)swapon }, /* 85 = swapon */ + { SYF_MPSAFE | AS(getitimer_args), (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(SYF_MPSAFE | AS(gethostname_args),gethostname) }, /* 87 = old gethostname */ + { compat(SYF_MPSAFE | AS(sethostname_args),sethostname) }, /* 88 = old sethostname */ + { SYF_MPSAFE | 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { SYF_MPSAFE | AS(dup2_args), (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { SYF_MPSAFE | AS(fcntl_args), (sy_call_t *)fcntl }, /* 92 = fcntl */ + { SYF_MPSAFE | AS(select_args), (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { AS(fsync_args), (sy_call_t *)fsync }, /* 95 = fsync */ + { SYF_MPSAFE | AS(setpriority_args), (sy_call_t *)setpriority }, /* 96 = setpriority */ + { SYF_MPSAFE | AS(socket_args), (sy_call_t *)socket }, /* 97 = socket */ + { SYF_MPSAFE | AS(connect_args), (sy_call_t *)connect }, /* 98 = connect */ + { compat(SYF_MPSAFE | AS(accept_args),accept) }, /* 99 = old accept */ + { SYF_MPSAFE | AS(getpriority_args), (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(SYF_MPSAFE | AS(osend_args),send) }, /* 101 = old send */ + { compat(SYF_MPSAFE | AS(orecv_args),recv) }, /* 102 = 
old recv */ + { SYF_MPSAFE | AS(osigreturn_args), (sy_call_t *)osigreturn }, /* 103 = osigreturn */ + { SYF_MPSAFE | AS(bind_args), (sy_call_t *)bind }, /* 104 = bind */ + { SYF_MPSAFE | AS(setsockopt_args), (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { SYF_MPSAFE | AS(listen_args), (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(SYF_MPSAFE | AS(osigvec_args),sigvec) }, /* 108 = old sigvec */ + { compat(SYF_MPSAFE | AS(osigblock_args),sigblock) }, /* 109 = old sigblock */ + { compat(SYF_MPSAFE | AS(osigsetmask_args),sigsetmask) }, /* 110 = old sigsetmask */ + { compat(SYF_MPSAFE | AS(osigsuspend_args),sigsuspend) }, /* 111 = old sigsuspend */ + { compat(SYF_MPSAFE | AS(osigstack_args),sigstack) }, /* 112 = old sigstack */ + { compat(SYF_MPSAFE | AS(orecvmsg_args),recvmsg) }, /* 113 = old recvmsg */ + { compat(SYF_MPSAFE | AS(osendmsg_args),sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { SYF_MPSAFE | AS(gettimeofday_args), (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { SYF_MPSAFE | AS(getrusage_args), (sy_call_t *)getrusage }, /* 117 = getrusage */ + { SYF_MPSAFE | AS(getsockopt_args), (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { SYF_MPSAFE | AS(readv_args), (sy_call_t *)readv }, /* 120 = readv */ + { SYF_MPSAFE | AS(writev_args), (sy_call_t *)writev }, /* 121 = writev */ + { SYF_MPSAFE | AS(settimeofday_args), (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { AS(fchown_args), (sy_call_t *)fchown }, /* 123 = fchown */ + { AS(fchmod_args), (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(SYF_MPSAFE | AS(recvfrom_args),recvfrom) }, /* 125 = old recvfrom */ + { SYF_MPSAFE | AS(setreuid_args), (sy_call_t *)setreuid }, /* 126 = setreuid */ + { SYF_MPSAFE | AS(setregid_args), (sy_call_t *)setregid }, /* 127 = setregid */ + { AS(rename_args), (sy_call_t *)rename }, /* 128 = rename */ + { compat(AS(otruncate_args),truncate) }, /* 129 = old truncate */ + { compat(AS(oftruncate_args),ftruncate) }, /* 130 = old ftruncate */ + { SYF_MPSAFE | AS(flock_args), (sy_call_t *)flock }, /* 131 = flock */ + { AS(mkfifo_args), (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { SYF_MPSAFE | AS(sendto_args), (sy_call_t *)sendto }, /* 133 = sendto */ + { SYF_MPSAFE | AS(shutdown_args), (sy_call_t *)shutdown }, /* 134 = shutdown */ + { SYF_MPSAFE | AS(socketpair_args), (sy_call_t *)socketpair }, /* 135 = socketpair */ + { AS(mkdir_args), (sy_call_t *)mkdir }, /* 136 = mkdir */ + { AS(rmdir_args), (sy_call_t *)rmdir }, /* 137 = rmdir */ + { AS(utimes_args), (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { SYF_MPSAFE | AS(adjtime_args), (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(SYF_MPSAFE | AS(ogetpeername_args),getpeername) }, /* 141 = old getpeername */ + { compat(SYF_MPSAFE | 0,gethostid) }, /* 142 = old gethostid */ + { compat(SYF_MPSAFE | AS(osethostid_args),sethostid) }, /* 143 = old sethostid */ + { compat(SYF_MPSAFE | AS(ogetrlimit_args),getrlimit) }, /* 144 = old getrlimit */ + { compat(SYF_MPSAFE | AS(osetrlimit_args),setrlimit) }, /* 145 = old setrlimit */ + { compat(SYF_MPSAFE | AS(okillpg_args),killpg) }, /* 146 = old killpg */ + { SYF_MPSAFE | 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { AS(quotactl_args), (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(SYF_MPSAFE | 0,quota) }, /* 149 = old quota */ + { compat(SYF_MPSAFE | 
AS(getsockname_args),getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ + { SYF_MPSAFE | AS(nfssvc_args), (sy_call_t *)nosys }, /* 155 = nfssvc */ + { compat(AS(ogetdirentries_args),getdirentries) }, /* 156 = old getdirentries */ + { AS(statfs_args), (sy_call_t *)statfs }, /* 157 = statfs */ + { AS(fstatfs_args), (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ + { AS(getfh_args), (sy_call_t *)getfh }, /* 161 = getfh */ + { SYF_MPSAFE | AS(getdomainname_args), (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { SYF_MPSAFE | AS(setdomainname_args), (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { SYF_MPSAFE | AS(uname_args), (sy_call_t *)uname }, /* 164 = uname */ + { AS(sysarch_args), (sy_call_t *)sysarch }, /* 165 = sysarch */ + { SYF_MPSAFE | AS(rtprio_args), (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { SYF_MPSAFE | AS(semsys_args), (sy_call_t *)lkmressys }, /* 169 = semsys */ + { SYF_MPSAFE | AS(msgsys_args), (sy_call_t *)lkmressys }, /* 170 = msgsys */ + { SYF_MPSAFE | AS(shmsys_args), (sy_call_t *)lkmressys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { SYF_MPSAFE | AS(pread_args), (sy_call_t *)pread }, /* 173 = pread */ + { SYF_MPSAFE | AS(pwrite_args), (sy_call_t *)pwrite }, /* 174 = pwrite */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { SYF_MPSAFE | AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { SYF_MPSAFE | AS(setgid_args), (sy_call_t *)setgid }, /* 181 = setgid */ + { SYF_MPSAFE | AS(setegid_args), (sy_call_t *)setegid }, /* 182 = setegid */ + { SYF_MPSAFE | AS(seteuid_args), (sy_call_t *)seteuid }, /* 183 = seteuid */ + { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */ + { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */ + { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */ + { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */ + { AS(stat_args), (sy_call_t *)stat }, /* 188 = stat */ + { SYF_MPSAFE | AS(fstat_args), (sy_call_t *)fstat }, /* 189 = fstat */ + { AS(lstat_args), (sy_call_t *)lstat }, /* 190 = lstat */ + { AS(pathconf_args), (sy_call_t *)pathconf }, /* 191 = pathconf */ + { SYF_MPSAFE | AS(fpathconf_args), (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { SYF_MPSAFE | AS(__getrlimit_args), (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { SYF_MPSAFE | AS(__setrlimit_args), (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { AS(getdirentries_args), (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { SYF_MPSAFE | AS(mmap_args), (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { AS(lseek_args), (sy_call_t *)lseek }, /* 199 = lseek */ + { AS(truncate_args), (sy_call_t *)truncate }, /* 200 = truncate */ + { AS(ftruncate_args), (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { SYF_MPSAFE | AS(sysctl_args), (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { SYF_MPSAFE | AS(mlock_args), (sy_call_t *)mlock }, /* 203 = mlock */ + 
{ SYF_MPSAFE | AS(munlock_args), (sy_call_t *)munlock }, /* 204 = munlock */ + { AS(undelete_args), (sy_call_t *)undelete }, /* 205 = undelete */ + { AS(futimes_args), (sy_call_t *)futimes }, /* 206 = futimes */ + { SYF_MPSAFE | AS(getpgid_args), (sy_call_t *)getpgid }, /* 207 = getpgid */ + { 0, (sy_call_t *)nosys }, /* 208 = newreboot */ + { SYF_MPSAFE | AS(poll_args), (sy_call_t *)poll }, /* 209 = poll */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { SYF_MPSAFE | AS(__semctl_args), (sy_call_t *)lkmressys }, /* 220 = __semctl */ + { SYF_MPSAFE | AS(semget_args), (sy_call_t *)lkmressys }, /* 221 = semget */ + { SYF_MPSAFE | AS(semop_args), (sy_call_t *)lkmressys }, /* 222 = semop */ + { 0, (sy_call_t *)nosys }, /* 223 = semconfig */ + { SYF_MPSAFE | AS(msgctl_args), (sy_call_t *)lkmressys }, /* 224 = msgctl */ + { SYF_MPSAFE | AS(msgget_args), (sy_call_t *)lkmressys }, /* 225 = msgget */ + { SYF_MPSAFE | AS(msgsnd_args), (sy_call_t *)lkmressys }, /* 226 = msgsnd */ + { SYF_MPSAFE | AS(msgrcv_args), (sy_call_t *)lkmressys }, /* 227 = msgrcv */ + { SYF_MPSAFE | AS(shmat_args), (sy_call_t *)lkmressys }, /* 228 = shmat */ + { SYF_MPSAFE | AS(shmctl_args), (sy_call_t *)lkmressys }, /* 229 = shmctl */ + { SYF_MPSAFE | AS(shmdt_args), (sy_call_t *)lkmressys }, /* 230 = shmdt */ + { SYF_MPSAFE | AS(shmget_args), (sy_call_t *)lkmressys }, /* 231 = shmget */ + { SYF_MPSAFE | AS(clock_gettime_args), (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */ + { SYF_MPSAFE | AS(clock_settime_args), (sy_call_t *)clock_settime }, /* 233 = clock_settime */ + { SYF_MPSAFE | AS(clock_getres_args), (sy_call_t *)clock_getres }, /* 234 = clock_getres */ + { 0, (sy_call_t *)nosys }, /* 235 = timer_create */ + { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */ + { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */ + { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */ + { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */ + { SYF_MPSAFE | AS(nanosleep_args), (sy_call_t *)nanosleep }, /* 240 = nanosleep */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { SYF_MPSAFE | AS(minherit_args), (sy_call_t *)minherit }, /* 250 = minherit */ + { SYF_MPSAFE | AS(rfork_args), (sy_call_t *)rfork }, /* 251 = rfork */ + { SYF_MPSAFE | AS(openbsd_poll_args), (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */ + { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */ + { AS(lchown_args), (sy_call_t *)lchown }, /* 254 = lchown */ + { 0, (sy_call_t *)nosys }, /* 255 = nosys */ + { 0, (sy_call_t *)nosys }, /* 256 = nosys */ + { 0, (sy_call_t *)nosys }, /* 
257 = nosys */ + { 0, (sy_call_t *)nosys }, /* 258 = nosys */ + { 0, (sy_call_t *)nosys }, /* 259 = nosys */ + { 0, (sy_call_t *)nosys }, /* 260 = nosys */ + { 0, (sy_call_t *)nosys }, /* 261 = nosys */ + { 0, (sy_call_t *)nosys }, /* 262 = nosys */ + { 0, (sy_call_t *)nosys }, /* 263 = nosys */ + { 0, (sy_call_t *)nosys }, /* 264 = nosys */ + { 0, (sy_call_t *)nosys }, /* 265 = nosys */ + { 0, (sy_call_t *)nosys }, /* 266 = nosys */ + { 0, (sy_call_t *)nosys }, /* 267 = nosys */ + { 0, (sy_call_t *)nosys }, /* 268 = nosys */ + { 0, (sy_call_t *)nosys }, /* 269 = nosys */ + { 0, (sy_call_t *)nosys }, /* 270 = nosys */ + { 0, (sy_call_t *)nosys }, /* 271 = nosys */ + { AS(getdents_args), (sy_call_t *)getdents }, /* 272 = getdents */ + { 0, (sy_call_t *)nosys }, /* 273 = nosys */ + { AS(lchmod_args), (sy_call_t *)lchmod }, /* 274 = lchmod */ + { AS(lchown_args), (sy_call_t *)lchown }, /* 275 = netbsd_lchown */ + { AS(lutimes_args), (sy_call_t *)lutimes }, /* 276 = lutimes */ + { SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync }, /* 277 = netbsd_msync */ + { AS(nstat_args), (sy_call_t *)nstat }, /* 278 = nstat */ + { SYF_MPSAFE | AS(nfstat_args), (sy_call_t *)nfstat }, /* 279 = nfstat */ + { AS(nlstat_args), (sy_call_t *)nlstat }, /* 280 = nlstat */ + { 0, (sy_call_t *)nosys }, /* 281 = nosys */ + { 0, (sy_call_t *)nosys }, /* 282 = nosys */ + { 0, (sy_call_t *)nosys }, /* 283 = nosys */ + { 0, (sy_call_t *)nosys }, /* 284 = nosys */ + { 0, (sy_call_t *)nosys }, /* 285 = nosys */ + { 0, (sy_call_t *)nosys }, /* 286 = nosys */ + { 0, (sy_call_t *)nosys }, /* 287 = nosys */ + { 0, (sy_call_t *)nosys }, /* 288 = nosys */ + { 0, (sy_call_t *)nosys }, /* 289 = nosys */ + { 0, (sy_call_t *)nosys }, /* 290 = nosys */ + { 0, (sy_call_t *)nosys }, /* 291 = nosys */ + { 0, (sy_call_t *)nosys }, /* 292 = nosys */ + { 0, (sy_call_t *)nosys }, /* 293 = nosys */ + { 0, (sy_call_t *)nosys }, /* 294 = nosys */ + { 0, (sy_call_t *)nosys }, /* 295 = nosys */ + { 0, (sy_call_t *)nosys }, /* 296 = nosys */ + { AS(fhstatfs_args), (sy_call_t *)fhstatfs }, /* 297 = fhstatfs */ + { AS(fhopen_args), (sy_call_t *)fhopen }, /* 298 = fhopen */ + { AS(fhstat_args), (sy_call_t *)fhstat }, /* 299 = fhstat */ + { SYF_MPSAFE | AS(modnext_args), (sy_call_t *)modnext }, /* 300 = modnext */ + { SYF_MPSAFE | AS(modstat_args), (sy_call_t *)modstat }, /* 301 = modstat */ + { SYF_MPSAFE | AS(modfnext_args), (sy_call_t *)modfnext }, /* 302 = modfnext */ + { SYF_MPSAFE | AS(modfind_args), (sy_call_t *)modfind }, /* 303 = modfind */ + { SYF_MPSAFE | AS(kldload_args), (sy_call_t *)kldload }, /* 304 = kldload */ + { SYF_MPSAFE | AS(kldunload_args), (sy_call_t *)kldunload }, /* 305 = kldunload */ + { SYF_MPSAFE | AS(kldfind_args), (sy_call_t *)kldfind }, /* 306 = kldfind */ + { SYF_MPSAFE | AS(kldnext_args), (sy_call_t *)kldnext }, /* 307 = kldnext */ + { SYF_MPSAFE | AS(kldstat_args), (sy_call_t *)kldstat }, /* 308 = kldstat */ + { SYF_MPSAFE | AS(kldfirstmod_args), (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */ + { SYF_MPSAFE | AS(getsid_args), (sy_call_t *)getsid }, /* 310 = getsid */ + { SYF_MPSAFE | AS(setresuid_args), (sy_call_t *)setresuid }, /* 311 = setresuid */ + { SYF_MPSAFE | AS(setresgid_args), (sy_call_t *)setresgid }, /* 312 = setresgid */ + { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */ + { AS(aio_return_args), (sy_call_t *)lkmressys }, /* 314 = aio_return */ + { AS(aio_suspend_args), (sy_call_t *)lkmressys }, /* 315 = aio_suspend */ + { AS(aio_cancel_args), (sy_call_t *)lkmressys }, /* 316 = aio_cancel 
*/ + { AS(aio_error_args), (sy_call_t *)lkmressys }, /* 317 = aio_error */ + { AS(aio_read_args), (sy_call_t *)lkmressys }, /* 318 = aio_read */ + { AS(aio_write_args), (sy_call_t *)lkmressys }, /* 319 = aio_write */ + { AS(lio_listio_args), (sy_call_t *)lkmressys }, /* 320 = lio_listio */ + { SYF_MPSAFE | 0, (sy_call_t *)yield }, /* 321 = yield */ + { 0, (sy_call_t *)nosys }, /* 322 = obsolete thr_sleep */ + { 0, (sy_call_t *)nosys }, /* 323 = obsolete thr_wakeup */ + { SYF_MPSAFE | AS(mlockall_args), (sy_call_t *)mlockall }, /* 324 = mlockall */ + { SYF_MPSAFE | 0, (sy_call_t *)munlockall }, /* 325 = munlockall */ + { AS(__getcwd_args), (sy_call_t *)__getcwd }, /* 326 = __getcwd */ + { SYF_MPSAFE | AS(sched_setparam_args), (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */ + { SYF_MPSAFE | AS(sched_getparam_args), (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */ + { SYF_MPSAFE | AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */ + { SYF_MPSAFE | AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */ + { SYF_MPSAFE | 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */ + { SYF_MPSAFE | AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */ + { SYF_MPSAFE | AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */ + { SYF_MPSAFE | AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */ + { AS(utrace_args), (sy_call_t *)utrace }, /* 335 = utrace */ + { SYF_MPSAFE | AS(sendfile_args), (sy_call_t *)sendfile }, /* 336 = sendfile */ + { AS(kldsym_args), (sy_call_t *)kldsym }, /* 337 = kldsym */ + { SYF_MPSAFE | AS(jail_args), (sy_call_t *)jail }, /* 338 = jail */ + { 0, (sy_call_t *)nosys }, /* 339 = pioctl */ + { SYF_MPSAFE | AS(sigprocmask_args), (sy_call_t *)sigprocmask }, /* 340 = sigprocmask */ + { SYF_MPSAFE | AS(sigsuspend_args), (sy_call_t *)sigsuspend }, /* 341 = sigsuspend */ + { SYF_MPSAFE | AS(sigaction_args), (sy_call_t *)sigaction }, /* 342 = sigaction */ + { SYF_MPSAFE | AS(sigpending_args), (sy_call_t *)sigpending }, /* 343 = sigpending */ + { SYF_MPSAFE | AS(sigreturn_args), (sy_call_t *)sigreturn }, /* 344 = sigreturn */ + { 0, (sy_call_t *)nosys }, /* 345 = sigtimedwait */ + { 0, (sy_call_t *)nosys }, /* 346 = sigwaitinfo */ + { SYF_MPSAFE | AS(__acl_get_file_args), (sy_call_t *)__acl_get_file }, /* 347 = __acl_get_file */ + { SYF_MPSAFE | AS(__acl_set_file_args), (sy_call_t *)__acl_set_file }, /* 348 = __acl_set_file */ + { SYF_MPSAFE | AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd }, /* 349 = __acl_get_fd */ + { SYF_MPSAFE | AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd }, /* 350 = __acl_set_fd */ + { SYF_MPSAFE | AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file }, /* 351 = __acl_delete_file */ + { SYF_MPSAFE | AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd }, /* 352 = __acl_delete_fd */ + { SYF_MPSAFE | AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file }, /* 353 = __acl_aclcheck_file */ + { SYF_MPSAFE | AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd }, /* 354 = __acl_aclcheck_fd */ + { AS(extattrctl_args), (sy_call_t *)extattrctl }, /* 355 = extattrctl */ + { AS(extattr_set_file_args), (sy_call_t *)extattr_set_file }, /* 356 = extattr_set_file */ + { AS(extattr_get_file_args), (sy_call_t *)extattr_get_file }, /* 357 = extattr_get_file */ + { AS(extattr_delete_file_args), 
(sy_call_t *)extattr_delete_file }, /* 358 = extattr_delete_file */ + { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys }, /* 359 = aio_waitcomplete */ + { SYF_MPSAFE | AS(getresuid_args), (sy_call_t *)getresuid }, /* 360 = getresuid */ + { SYF_MPSAFE | AS(getresgid_args), (sy_call_t *)getresgid }, /* 361 = getresgid */ + { SYF_MPSAFE | 0, (sy_call_t *)kqueue }, /* 362 = kqueue */ + { SYF_MPSAFE | AS(kevent_args), (sy_call_t *)kevent }, /* 363 = kevent */ + { 0, (sy_call_t *)nosys }, /* 364 = __cap_get_proc */ + { 0, (sy_call_t *)nosys }, /* 365 = __cap_set_proc */ + { 0, (sy_call_t *)nosys }, /* 366 = __cap_get_fd */ + { 0, (sy_call_t *)nosys }, /* 367 = __cap_get_file */ + { 0, (sy_call_t *)nosys }, /* 368 = __cap_set_fd */ + { 0, (sy_call_t *)nosys }, /* 369 = __cap_set_file */ + { AS(nosys_args), (sy_call_t *)lkmressys }, /* 370 = lkmressys */ + { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd }, /* 371 = extattr_set_fd */ + { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd }, /* 372 = extattr_get_fd */ + { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd }, /* 373 = extattr_delete_fd */ + { SYF_MPSAFE | AS(__setugid_args), (sy_call_t *)__setugid }, /* 374 = __setugid */ + { AS(nfsclnt_args), (sy_call_t *)nosys }, /* 375 = nfsclnt */ + { AS(eaccess_args), (sy_call_t *)eaccess }, /* 376 = eaccess */ + { 0, (sy_call_t *)nosys }, /* 377 = afs_syscall */ + { AS(nmount_args), (sy_call_t *)nmount }, /* 378 = nmount */ + { 0, (sy_call_t *)kse_exit }, /* 379 = kse_exit */ + { 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */ + { AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */ + { AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */ + { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ + { 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */ + { 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */ + { 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */ + { 0, (sy_call_t *)nosys }, /* 387 = __mac_get_file */ + { 0, (sy_call_t *)nosys }, /* 388 = __mac_set_fd */ + { 0, (sy_call_t *)nosys }, /* 389 = __mac_set_file */ + { AS(kenv_args), (sy_call_t *)kenv }, /* 390 = kenv */ + { AS(lchflags_args), (sy_call_t *)lchflags }, /* 391 = lchflags */ + { AS(uuidgen_args), (sy_call_t *)uuidgen }, /* 392 = uuidgen */ +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 0000000..6626197 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,345 @@ +/*- + * Copyright (c) 1994 Christopher G. Demetriou + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/syslog.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> + +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. + * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t(u_long, u_long); +static void acctwatch(void *); + +/* + * Accounting callout used for periodic scheduling of acctwatch. + */ +static struct callout acctwatch_callout; + +/* + * Accounting vnode pointer, and saved vnode pointer. + */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, "percentage of free disk space below which accounting stops"); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, "percentage of free disk space above which accounting resumes"); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, "frequency for checking the free space"); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. 
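+ * A non-NULL path opens that file and starts the free-space watcher;
+ * a NULL path simply disables accounting.  Only the superuser may
+ * call this.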
+ * + * MPSAFE + */ +int +acct(td, uap) + struct thread *td; + struct acct_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct nameidata nd; + int error, flags; + + /* Make sure that the caller is root. */ + error = suser(td); + if (error) + return (error); + + mtx_lock(&Giant); + /* + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. + */ + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + td); + flags = FWRITE; + error = vn_open(&nd, &flags, 0); + if (error) + goto done2; + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, td->td_ucred, td); + error = EACCES; + goto done2; + } + } + + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). + */ + if (acctp != NULLVP || savacctp != NULLVP) { + callout_stop(&acctwatch_callout); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + td->td_ucred, td); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + goto done2; + + /* + * Save the new accounting file vnode, and schedule the new + * free space watcher. + */ + acctp = nd.ni_vp; + callout_init(&acctwatch_callout, 0); + acctwatch(NULL); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) + */ + +int +acct_process(td) + struct thread *td; +{ + struct proc *p = td->td_proc; + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. + */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + mtx_lock_spin(&sched_lock); + calcru(p, &ut, &st, NULL); + mtx_unlock_spin(&sched_lock); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_ucred->cr_ruid; + acct.ac_gid = p->p_ucred->cr_rgid; + + /* (7) The terminal from which the process was started */ + PROC_LOCK(p); + SESS_LOCK(p->p_session); + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = dev2udev(p->p_pgrp->pg_session->s_ttyp->t_dev); + else + acct.ac_tty = NOUDEV; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Eliminate any file size rlimit. 
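+	 * Otherwise the vn_rdwr() append below could fail once the process
+	 * has already hit RLIMIT_FSIZE; a shared limit structure is copied
+	 * first so that only this process is affected.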
+ */ + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } + p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + + /* + * Write the accounting information to the file. + */ + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, td->td_ucred, + (int *)0, td)); +} + +/* + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} + +/* + * Periodically check the filesystem to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. + */ +/* ARGSUSED */ +static void +acctwatch(a) + void *a; +{ + struct statfs sb; + + if (savacctp != NULLVP) { + if (savacctp->v_type == VBAD) { + (void) vn_close(savacctp, FWRITE, NOCRED, NULL); + savacctp = NULLVP; + return; + } + (void)VFS_STATFS(savacctp->v_mount, &sb, (struct thread *)0); + if (sb.f_bavail > acctresume * sb.f_blocks / 100) { + acctp = savacctp; + savacctp = NULLVP; + log(LOG_NOTICE, "Accounting resumed\n"); + } + } else { + if (acctp == NULLVP) + return; + if (acctp->v_type == VBAD) { + (void) vn_close(acctp, FWRITE, NOCRED, NULL); + acctp = NULLVP; + return; + } + (void)VFS_STATFS(acctp->v_mount, &sb, (struct thread *)0); + if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { + savacctp = acctp; + acctp = NULLVP; + log(LOG_NOTICE, "Accounting suspended\n"); + } + } + callout_reset(&acctwatch_callout, acctchkfreq * hz, acctwatch, NULL); +} diff --git a/sys/kern/kern_acl.c b/sys/kern/kern_acl.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/kern_acl.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. + */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. 
While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. 
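+	 * ACL_GROUP_OBJ is matched against the file's owning group and
+	 * ACL_GROUP entries against their own qualifier; both are subject
+	 * to any ACL_MASK entry found above.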
+ */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. + */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. 
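+ * For example, a mode of 0640 yields ACL_READ|ACL_WRITE for the
+ * ACL_USER_OBJ entry, ACL_READ for ACL_GROUP_OBJ and no permissions
+ * for ACL_OTHER.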
+ */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. + * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. 
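+		 * The _OBJ, OTHER and MASK entries may not carry a qualifier,
+		 * while USER and GROUP entries must; the per-tag counts are
+		 * verified once the loop completes.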
+ */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. + */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. 
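+ * Deletion is expressed as VOP_SETACL() with a NULL ACL pointer.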
+ */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
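From userland, these path- and descriptor-based wrappers sit underneath the documented POSIX.1e library calls. For orientation, a small sketch using acl_get_file(3) and acl_free(3) (illustrative path, minimal error handling):

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

int
main(void)
{
        acl_t a;

        a = acl_get_file("/tmp/example", ACL_TYPE_ACCESS);
        if (a == NULL) {
                perror("acl_get_file");
                return (1);
        }
        /* ... walk the entries here ... */
        acl_free(a);
        return (0);
}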
+ * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c new file mode 100644 index 0000000..2e7ca8b --- /dev/null +++ b/sys/kern/kern_clock.c @@ -0,0 +1,492 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/ktr.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/smp.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> +#include <sys/bus.h> +#include <sys/interrupt.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#ifdef DEVICE_POLLING +extern void init_device_poll(void); +extern void hardclock_device_poll(void); +#endif /* DEVICE_POLLING */ + +static void initclocks(void *dummy); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +long cp_time[CPUSTATES]; + +SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), + "LU", "CPU time statistics"); + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. 
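A worked instance of the divider computed just below (the figures are only illustrative; i386 kernels of this vintage typically run the statistics clock from the RTC at 128 Hz and the profile clock at 1024 Hz):

/*
 *      stathz  = 128, profhz = 1024
 *      i       = stathz          = 128
 *      psratio = profhz / i      = 8
 *
 * While profiling is active, startprofclock() sets psdiv/pscnt to 8,
 * so only every eighth profile tick is counted as a statistics tick.
 */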
+ */ + psdiv = pscnt = 1; + cpu_initclocks(); + +#ifdef DEVICE_POLLING + init_device_poll(); +#endif + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * Each time the real-time timer fires, this function is called on all CPUs + * with each CPU passing in its curthread as the first argument. If possible + * a nice optimization in the future would be to allow the CPU receiving the + * actual real-time timer interrupt to call this function on behalf of the + * other CPUs rather than sending an IPI to all other CPUs so that they + * can call this function. Note that hardclock() calls hardclock_process() + * for the CPU receiving the timer interrupt, so only the other CPUs in the + * system need to call this function (or have it called on their behalf. + */ +void +hardclock_process(td, user) + struct thread *td; + int user; +{ + struct pstats *pstats; + struct proc *p = td->td_proc; + + /* + * Run current process's virtual and profile time, as needed. + */ + mtx_assert(&sched_lock, MA_OWNED); + if (p->p_flag & P_KSES) { + /* XXXKSE What to do? */ + } else { + pstats = p->p_stats; + if (user && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + p->p_sflag |= PS_ALRMPEND; + td->td_kse->ke_flags |= KEF_ASTPENDING; + } + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + p->p_sflag |= PS_PROFPEND; + td->td_kse->ke_flags |= KEF_ASTPENDING; + } + } +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + int need_softclock = 0; + + CTR0(KTR_CLK, "hardclock fired"); + mtx_lock_spin_flags(&sched_lock, MTX_QUIET); + hardclock_process(curthread, CLKF_USERMODE(frame)); + mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + + /* + * If no separate statistics clock is available, run it from here. + * + * XXX: this only works for UP + */ + if (stathz == 0) + statclock(frame); + +#ifdef DEVICE_POLLING + hardclock_device_poll(); /* this is very short and quick */ +#endif /* DEVICE_POLLING */ + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + mtx_lock_spin_flags(&callout_lock, MTX_QUIET); + ticks++; + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + need_softclock = 1; + } else if (softticks + 1 == ticks) + ++softticks; + mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); + + /* + * swi_sched acquires sched_lock, so we don't want to call it with + * callout_lock held; incorrect locking order. + */ + if (need_softclock) + swi_sched(softclock_ih, 0); +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. 
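A worked instance of that first case, assuming hz = 100 so that tick = 10000 microseconds:

/*
 *      tv    = { .tv_sec = 1, .tv_usec = 500000 }        (1.5 seconds)
 *      ticks = (1 * 1000000 + 500000 + (10000 - 1)) / 10000 + 1
 *            = 1509999 / 10000 + 1
 *            = 150 + 1 = 151
 *
 * The extra tick accounts for the partially elapsed current tick, so
 * the timeout can never fire early.
 */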
+ * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. + */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + /* + * XXX; Right now sched_lock protects statclock(), but perhaps + * it should be protected later on by a time_lock, which would + * cover psdiv, etc. as well. + */ + mtx_lock_spin(&sched_lock); + if ((p->p_sflag & PS_PROFIL) == 0) { + p->p_sflag |= PS_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + mtx_lock_spin(&sched_lock); + if (p->p_sflag & PS_PROFIL) { + p->p_sflag &= ~PS_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Do process and kernel statistics. Most of the statistics are only + * used by user-level statistics programs. The main exceptions are + * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. This function + * should be called by all CPUs in the system for each statistics clock + * interrupt. See the description of hardclock_process for more detail on + * this function's relationship to statclock. + */ +void +statclock_process(ke, pc, user) + struct kse *ke; + register_t pc; + int user; +{ +#ifdef GPROF + struct gmonparam *g; + int i; +#endif + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + struct proc *p = ke->ke_proc; + struct thread *td = ke->ke_thread; /* current thread */ + + KASSERT(ke == curthread->td_kse, ("statclock_process: td != curthread")); + mtx_assert(&sched_lock, MA_OWNED); + if (user) { + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled, record the tick. + */ + if (p->p_sflag & PS_PROFIL) + addupc_intr(ke, pc, 1); + if (pscnt < psdiv) + return; + /* + * Charge the time as appropriate. + */ + ke->ke_uticks++; + if (ke->ke_ksegrp->kg_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. 
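The cp_time[] counters charged just above are exported through the kern.cp_time sysctl declared near the top of this file. A small userland sketch of reading them (assuming the CP_* indices are taken from <sys/dkstat.h>, as this file does):

#include <sys/types.h>
#include <sys/dkstat.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        long cp[CPUSTATES];
        size_t len = sizeof(cp);

        if (sysctlbyname("kern.cp_time", cp, &len, NULL, 0) == -1) {
                perror("sysctlbyname");
                return (1);
        }
        printf("user %ld nice %ld sys %ld intr %ld idle %ld\n",
            cp[CP_USER], cp[CP_NICE], cp[CP_SYS], cp[CP_INTR], cp[CP_IDLE]);
        return (0);
}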
+ */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = pc - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt < psdiv) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { + ke->ke_iticks++; + cp_time[CP_INTR]++; + } else { + ke->ke_sticks++; + if (p != PCPU_GET(idlethread)->td_proc) + cp_time[CP_SYS]++; + else + cp_time[CP_IDLE]++; + } + } + + schedclock(ke->ke_thread); + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += pgtok(vm->vm_tsize); + ru->ru_idrss += pgtok(vm->vm_dsize); + ru->ru_isrss += pgtok(vm->vm_ssize); + rss = pgtok(vmspace_resident_count(vm)); + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. Most of the statistics are only + * used by user-level statistics programs. The main exceptions are + * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. + */ +void +statclock(frame) + register struct clockframe *frame; +{ + + CTR0(KTR_CLK, "statclock fired"); + mtx_lock_spin_flags(&sched_lock, MTX_QUIET); + if (--pscnt == 0) + pscnt = psdiv; + statclock_process(curthread->td_kse, CLKF_PC(frame), CLKF_USERMODE(frame)); + mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + bzero(&clkinfo, sizeof(clkinfo)); + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo", + "Rate and period of various kernel clocks"); diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c new file mode 100644 index 0000000..9d30d25 --- /dev/null +++ b/sys/kern/kern_condvar.c @@ -0,0 +1,579 @@ +/*- + * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +/* + * Common sanity checks for cv_wait* functions. + */ +#define CV_ASSERT(cvp, mp, td) do { \ + KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \ + KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \ + KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ + KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \ + mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ +} while (0) + +#ifdef INVARIANTS +#define CV_WAIT_VALIDATE(cvp, mp) do { \ + if (TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ + /* Only waiter. */ \ + (cvp)->cv_mtx = (mp); \ + } else { \ + /* \ + * Other waiter; assert that we're using the \ + * same mutex. \ + */ \ + KASSERT((cvp)->cv_mtx == (mp), \ + ("%s: Multiple mutexes", __func__)); \ + } \ +} while (0) +#define CV_SIGNAL_VALIDATE(cvp) do { \ + if (!TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ + KASSERT(mtx_owned((cvp)->cv_mtx), \ + ("%s: Mutex not owned", __func__)); \ + } \ +} while (0) +#else +#define CV_WAIT_VALIDATE(cvp, mp) +#define CV_SIGNAL_VALIDATE(cvp) +#endif + +static void cv_timedwait_end(void *arg); + +/* + * Initialize a condition variable. Must be called before use. + */ +void +cv_init(struct cv *cvp, const char *desc) +{ + + TAILQ_INIT(&cvp->cv_waitq); + cvp->cv_mtx = NULL; + cvp->cv_description = desc; +} + +/* + * Destroy a condition variable. The condition variable must be re-initialized + * in order to be re-used. + */ +void +cv_destroy(struct cv *cvp) +{ + + KASSERT(cv_waitq_empty(cvp), ("%s: cv_waitq non-empty", __func__)); +} + +/* + * Common code for cv_wait* functions. All require sched_lock. + */ + +/* + * Switch context. + */ +static __inline void +cv_switch(struct thread *td) +{ + + td->td_proc->p_stat = SSLEEP; + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td, + td->td_proc->p_pid, td->td_proc->p_comm); +} + +/* + * Switch context, catching signals. + */ +static __inline int +cv_switch_catch(struct thread *td) +{ + struct proc *p; + int sig; + + /* + * We put ourselves on the sleep queue and start our timeout before + * calling cursig, as we could stop there, and a wakeup or a SIGCONT (or + * both) could occur while we were stopped. A SIGCONT would cause us to + * be marked as SSLEEP without resuming us, thus we must be ready for + * sleep when cursig is called. If the wakeup happens while we're + * stopped, td->td_wchan will be 0 upon return from cursig. 
+ */ + td->td_flags |= TDF_SINTR; + mtx_unlock_spin(&sched_lock); + p = td->td_proc; + PROC_LOCK(p); + sig = cursig(p); /* XXXKSE */ + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (sig != 0) { + if (td->td_wchan != NULL) + cv_waitq_remove(td); + td->td_proc->p_stat = SRUN; + } else if (td->td_wchan != NULL) { + cv_switch(td); + } + td->td_flags &= ~TDF_SINTR; + + return sig; +} + +/* + * Add a thread to the wait queue of a condition variable. + */ +static __inline void +cv_waitq_add(struct cv *cvp, struct thread *td) +{ + + /* + * Process may be sitting on a slpque if asleep() was called, remove it + * before re-adding. + */ + if (td->td_wchan != NULL) + unsleep(td); + + td->td_flags |= TDF_CVWAITQ; + td->td_wchan = cvp; + td->td_wmesg = cvp->cv_description; + td->td_kse->ke_slptime = 0; /* XXXKSE */ + td->td_ksegrp->kg_slptime = 0; /* XXXKSE */ + td->td_base_pri = td->td_priority; + CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td, + td->td_proc->p_pid, td->td_proc->p_comm); + TAILQ_INSERT_TAIL(&cvp->cv_waitq, td, td_slpq); +} + +/* + * Wait on a condition variable. The current thread is placed on the condition + * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same + * condition variable will resume the thread. The mutex is released before + * sleeping and will be held on return. It is recommended that the mutex be + * held when cv_signal or cv_broadcast are called. + */ +void +cv_wait(struct cv *cvp, struct mtx *mp) +{ + struct thread *td; + WITNESS_SAVE_DECL(mp); + + td = curthread; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + cv_switch(td); + + mtx_unlock_spin(&sched_lock); +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); +} + +/* + * Wait on a condition variable, allowing interruption by signals. Return 0 if + * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if + * a signal was caught. If ERESTART is returned the system call should be + * restarted if possible. + */ +int +cv_wait_sig(struct cv *cvp, struct mtx *mp) +{ + struct thread *td; + struct proc *p; + int rval; + int sig; + WITNESS_SAVE_DECL(mp); + + td = curthread; + p = td->td_proc; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * procs or panic below, in case this is the idle process and + * already asleep. 
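Taken together with cv_signal() further below, cv_wait() implements the usual monitor pattern. A minimal sketch of the intended usage (hypothetical names; the mutex and condition variable are assumed to have been initialized elsewhere with mtx_init() and cv_init()):

static struct mtx example_mtx;
static struct cv example_cv;
static int example_ready;

/* Consumer: sleep until a producer has posted work. */
static void
example_consumer(void)
{

        mtx_lock(&example_mtx);
        while (example_ready == 0)
                cv_wait(&example_cv, &example_mtx);
        example_ready = 0;
        mtx_unlock(&example_mtx);
}

/* Producer: post work and wake one waiter while holding the mutex. */
static void
example_producer(void)
{

        mtx_lock(&example_mtx);
        example_ready = 1;
        cv_signal(&example_cv);
        mtx_unlock(&example_mtx);
}

The while loop matters: a waiter must re-check its predicate after cv_wait() returns, since another thread may already have consumed the work.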
+ */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + sig = cv_switch_catch(td); + + mtx_unlock_spin(&sched_lock); + + PROC_LOCK(p); + if (sig == 0) + sig = cursig(p); /* XXXKSE */ + if (sig != 0) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the + * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout + * expires. + */ +int +cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) +{ + struct thread *td; + int rval; + WITNESS_SAVE_DECL(mp); + + td = curthread; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td); + cv_switch(td); + + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (callout_stop(&td->td_slpcallout) == 0) { + /* + * Work around race with cv_timedwait_end similar to that + * between msleep and endtsleep. + */ + td->td_flags |= TDF_TIMEOUT; + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + + mtx_unlock_spin(&sched_lock); +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds, allowing + * interruption by signals. Returns 0 if the thread was resumed by cv_signal + * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if + * a signal was caught. + */ +int +cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) +{ + struct thread *td; + struct proc *p; + int rval; + int sig; + WITNESS_SAVE_DECL(mp); + + td = curthread; + p = td->td_proc; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. 
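cv_timedwait() above bounds the sleep in clock ticks. Reusing the hypothetical example_cv/example_mtx from the previous sketch, a one-second bounded wait looks like this:

        int error;

        mtx_lock(&example_mtx);
        while (example_ready == 0) {
                error = cv_timedwait(&example_cv, &example_mtx, hz);
                if (error == EWOULDBLOCK)
                        break;  /* a full second passed without cv_signal() */
        }
        mtx_unlock(&example_mtx);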
+ */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td); + sig = cv_switch_catch(td); + + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (callout_stop(&td->td_slpcallout) == 0) { + /* + * Work around race with cv_timedwait_end similar to that + * between msleep and endtsleep. + */ + td->td_flags |= TDF_TIMEOUT; + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + + mtx_unlock_spin(&sched_lock); + + PROC_LOCK(p); + if (sig == 0) + sig = cursig(p); + if (sig != 0) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Common code for signal and broadcast. Assumes waitq is not empty. Must be + * called with sched_lock held. + */ +static __inline void +cv_wakeup(struct cv *cvp) +{ + struct thread *td; + + mtx_assert(&sched_lock, MA_OWNED); + td = TAILQ_FIRST(&cvp->cv_waitq); + KASSERT(td->td_wchan == cvp, ("%s: bogus wchan", __func__)); + KASSERT(td->td_flags & TDF_CVWAITQ, ("%s: not on waitq", __func__)); + TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); + td->td_flags &= ~TDF_CVWAITQ; + td->td_wchan = 0; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(td); */ + CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)", + td, td->td_proc->p_pid, td->td_proc->p_comm); + if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */ + updatepri(td); + td->td_kse->ke_slptime = 0; + td->td_ksegrp->kg_slptime = 0; + td->td_proc->p_stat = SRUN; + if (td->td_proc->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + } else { + td->td_proc->p_sflag |= PS_SWAPINREQ; + wakeup(&proc0); /* XXXKSE */ + } + /* END INLINE EXPANSION */ + } +} + +/* + * Signal a condition variable, wakes up one waiting thread. Will also wakeup + * the swapper if the process is not in memory, so that it can bring the + * sleeping process in. Note that this may also result in additional threads + * being made runnable. Should be called with the same mutex as was passed to + * cv_wait held. + */ +void +cv_signal(struct cv *cvp) +{ + + KASSERT(cvp != NULL, ("%s: cvp NULL", __func__)); + mtx_lock_spin(&sched_lock); + if (!TAILQ_EMPTY(&cvp->cv_waitq)) { + CV_SIGNAL_VALIDATE(cvp); + cv_wakeup(cvp); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Broadcast a signal to a condition variable. Wakes up all waiting threads. + * Should be called with the same mutex as was passed to cv_wait held. + */ +void +cv_broadcast(struct cv *cvp) +{ + + KASSERT(cvp != NULL, ("%s: cvp NULL", __func__)); + mtx_lock_spin(&sched_lock); + CV_SIGNAL_VALIDATE(cvp); + while (!TAILQ_EMPTY(&cvp->cv_waitq)) + cv_wakeup(cvp); + mtx_unlock_spin(&sched_lock); +} + +/* + * Remove a thread from the wait queue of its condition variable. This may be + * called externally. + */ +void +cv_waitq_remove(struct thread *td) +{ + struct cv *cvp; + + mtx_lock_spin(&sched_lock); + if ((cvp = td->td_wchan) != NULL && td->td_flags & TDF_CVWAITQ) { + TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); + td->td_flags &= ~TDF_CVWAITQ; + td->td_wchan = NULL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Timeout function for cv_timedwait. 
Put the thread on the runqueue and set + * its timeout flag. + */ +static void +cv_timedwait_end(void *arg) +{ + struct thread *td; + + td = arg; + CTR3(KTR_PROC, "cv_timedwait_end: thread %p (pid %d, %s)", td, td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + setrunqueue(td); + } else if (td->td_wchan != NULL) { + if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + setrunnable(td); + else + cv_waitq_remove(td); + td->td_flags |= TDF_TIMEOUT; + } else + td->td_flags |= TDF_TIMOFAIL; + mtx_unlock_spin(&sched_lock); +} diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..d1ce2fc --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,491 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/vnode.h> +#include <sys/queue.h> +#include <sys/ctype.h> +#include <machine/stdarg.h> + +#define cdevsw_ALLOCSTART (NUMCDEVSW/2) + +static struct cdevsw *cdevsw[NUMCDEVSW]; + +static MALLOC_DEFINE(M_DEVT, "dev_t", "dev_t storage"); + +/* + * This is the number of hash-buckets. Experiements with 'real-life' + * udev_t's show that a prime halfway between two powers of two works + * best. + */ +#define DEVT_HASH 83 + +/* The number of dev_t's we can create before malloc(9) kick in. 
*/ +#define DEVT_STASH 50 + +static struct specinfo devt_stash[DEVT_STASH]; + +static LIST_HEAD(, specinfo) dev_hash[DEVT_HASH]; + +static LIST_HEAD(, specinfo) dev_free; + +devfs_create_t *devfs_create_hook; +devfs_destroy_t *devfs_destroy_hook; +int devfs_present; + +static int ready_for_devs; + +static int free_devt; +SYSCTL_INT(_debug, OID_AUTO, free_devt, CTLFLAG_RW, &free_devt, 0, ""); + +/* XXX: This is a hack */ +void disk_dev_synth(dev_t dev); + +struct cdevsw * +devsw(dev_t dev) +{ + if (dev->si_devsw) + return (dev->si_devsw); + /* XXX: Hack around our backwards disk code */ + disk_dev_synth(dev); + if (dev->si_devsw) + return (dev->si_devsw); + if (devfs_present) + return (NULL); + return(cdevsw[major(dev)]); +} + +/* + * Add a cdevsw entry + */ + +int +cdevsw_add(struct cdevsw *newentry) +{ + + if (newentry->d_maj < 0 || newentry->d_maj >= NUMCDEVSW) { + printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", + newentry->d_name, newentry->d_maj); + return (EINVAL); + } + + if (cdevsw[newentry->d_maj]) { + printf("WARNING: \"%s\" is usurping \"%s\"'s cdevsw[]\n", + newentry->d_name, cdevsw[newentry->d_maj]->d_name); + } + + cdevsw[newentry->d_maj] = newentry; + + return (0); +} + +/* + * Remove a cdevsw entry + */ + +int +cdevsw_remove(struct cdevsw *oldentry) +{ + if (oldentry->d_maj < 0 || oldentry->d_maj >= NUMCDEVSW) { + printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", + oldentry->d_name, oldentry->d_maj); + return EINVAL; + } + + cdevsw[oldentry->d_maj] = NULL; + + return 0; +} + +/* + * dev_t and u_dev_t primitives + */ + +int +major(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return((x->si_udev >> 8) & 0xff); +} + +int +minor(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return(x->si_udev & 0xffff00ff); +} + +int +dev2unit(dev_t x) +{ + int i; + + if (x == NODEV) + return NOUDEV; + i = minor(x); + return ((i & 0xff) | (i >> 8)); +} + +int +unit2minor(int unit) +{ + + KASSERT(unit <= 0xffffff, ("Invalid unit (%d) in unit2minor", unit)); + return ((unit & 0xff) | ((unit << 8) & ~0xffff)); +} + +static dev_t +allocdev(void) +{ + static int stashed; + struct specinfo *si; + + if (stashed >= DEVT_STASH) { + MALLOC(si, struct specinfo *, sizeof(*si), M_DEVT, + M_USE_RESERVE | M_ZERO); + } else if (LIST_FIRST(&dev_free)) { + si = LIST_FIRST(&dev_free); + LIST_REMOVE(si, si_hash); + } else { + si = devt_stash + stashed++; + bzero(si, sizeof *si); + si->si_flags |= SI_STASHED; + } + LIST_INIT(&si->si_children); + TAILQ_INIT(&si->si_snapshots); + return (si); +} + +dev_t +makedev(int x, int y) +{ + struct specinfo *si; + udev_t udev; + int hash; + + if (x == umajor(NOUDEV) && y == uminor(NOUDEV)) + panic("makedev of NOUDEV"); + udev = (x << 8) | y; + hash = udev % DEVT_HASH; + LIST_FOREACH(si, &dev_hash[hash], si_hash) { + if (si->si_udev == udev) + return (si); + } + si = allocdev(); + si->si_udev = udev; + LIST_INSERT_HEAD(&dev_hash[hash], si, si_hash); + return (si); +} + +void +freedev(dev_t dev) +{ + + if (!free_devt) + return; + if (SLIST_FIRST(&dev->si_hlist)) + return; + if (dev->si_devsw || dev->si_drv1 || dev->si_drv2) + return; + LIST_REMOVE(dev, si_hash); + if (dev->si_flags & SI_STASHED) { + bzero(dev, sizeof(*dev)); + dev->si_flags |= SI_STASHED; + LIST_INSERT_HEAD(&dev_free, dev, si_hash); + } else { + FREE(dev, M_DEVT); + } +} + +udev_t +dev2udev(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return (x->si_udev); +} + +dev_t +udev2dev(udev_t x, int b) +{ + + if (x == NOUDEV) + return (NODEV); + switch (b) { + case 0: + return 
makedev(umajor(x), uminor(x)); + case 1: + return (NODEV); + default: + Debugger("udev2dev(...,X)"); + return NODEV; + } +} + +int +uminor(udev_t dev) +{ + return(dev & 0xffff00ff); +} + +int +umajor(udev_t dev) +{ + return((dev & 0xff00) >> 8); +} + +udev_t +makeudev(int x, int y) +{ + return ((x << 8) | y); +} + +dev_t +make_dev(struct cdevsw *devsw, int minor, uid_t uid, gid_t gid, int perms, const char *fmt, ...) +{ + dev_t dev; + va_list ap; + int i; + + KASSERT(umajor(makeudev(devsw->d_maj, minor)) == devsw->d_maj, + ("Invalid minor (%d) in make_dev", minor)); + + if (!ready_for_devs) { + printf("WARNING: Driver mistake: make_dev(%s) called before SI_SUB_DRIVERS\n", + fmt); + /* XXX panic here once drivers are cleaned up */ + } + + dev = makedev(devsw->d_maj, minor); + if (dev->si_flags & SI_NAMED) { + printf( "WARNING: Driver mistake: repeat make_dev(\"%s\")\n", + dev->si_name); + panic("don't do that"); + return (dev); + } + va_start(ap, fmt); + i = kvprintf(fmt, NULL, dev->si_name, 32, ap); + dev->si_name[i] = '\0'; + va_end(ap); + dev->si_devsw = devsw; + dev->si_uid = uid; + dev->si_gid = gid; + dev->si_mode = perms; + dev->si_flags |= SI_NAMED; + + if (devfs_create_hook) + devfs_create_hook(dev); + return (dev); +} + +int +dev_named(dev_t pdev, const char *name) +{ + dev_t cdev; + + if (strcmp(devtoname(pdev), name) == 0) + return (1); + LIST_FOREACH(cdev, &pdev->si_children, si_siblings) + if (strcmp(devtoname(cdev), name) == 0) + return (1); + return (0); +} + +void +dev_depends(dev_t pdev, dev_t cdev) +{ + + cdev->si_parent = pdev; + cdev->si_flags |= SI_CHILD; + LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings); +} + +dev_t +make_dev_alias(dev_t pdev, const char *fmt, ...) +{ + dev_t dev; + va_list ap; + int i; + + dev = allocdev(); + dev->si_flags |= SI_ALIAS; + dev->si_flags |= SI_NAMED; + dev_depends(pdev, dev); + va_start(ap, fmt); + i = kvprintf(fmt, NULL, dev->si_name, 32, ap); + dev->si_name[i] = '\0'; + va_end(ap); + + if (devfs_create_hook) + devfs_create_hook(dev); + return (dev); +} + +void +revoke_and_destroy_dev(dev_t dev) +{ + struct vnode *vp; + + GIANT_REQUIRED; + + vp = SLIST_FIRST(&dev->si_hlist); + if (vp != NULL) + VOP_REVOKE(vp, REVOKEALL); + destroy_dev(dev); +} + +void +destroy_dev(dev_t dev) +{ + + if (!(dev->si_flags & SI_NAMED)) { + printf( "WARNING: Driver mistake: destroy_dev on %d/%d\n", + major(dev), minor(dev)); + panic("don't do that"); + return; + } + + if (devfs_destroy_hook) + devfs_destroy_hook(dev); + if (dev->si_flags & SI_CHILD) { + LIST_REMOVE(dev, si_siblings); + dev->si_flags &= ~SI_CHILD; + } + while (!LIST_EMPTY(&dev->si_children)) + destroy_dev(LIST_FIRST(&dev->si_children)); + dev->si_drv1 = 0; + dev->si_drv2 = 0; + dev->si_devsw = 0; + bzero(&dev->__si_u, sizeof(dev->__si_u)); + dev->si_flags &= ~SI_NAMED; + dev->si_flags &= ~SI_ALIAS; + freedev(dev); +} + +const char * +devtoname(dev_t dev) +{ + char *p; + int mynor; + + if (dev->si_name[0] == '#' || dev->si_name[0] == '\0') { + p = dev->si_name; + if (devsw(dev)) + sprintf(p, "#%s/", devsw(dev)->d_name); + else + sprintf(p, "#%d/", major(dev)); + p += strlen(p); + mynor = minor(dev); + if (mynor < 0 || mynor > 255) + sprintf(p, "%#x", (u_int)mynor); + else + sprintf(p, "%d", mynor); + } + return (dev->si_name); +} + +int +dev_stdclone(char *name, char **namep, const char *stem, int *unit) +{ + int u, i; + + i = strlen(stem); + if (bcmp(stem, name, i) != 0) + return (0); + if (!isdigit(name[i])) + return (0); + u = 0; + if (name[i] == '0' && isdigit(name[i+1])) + return 
(0); + while (isdigit(name[i])) { + u *= 10; + u += name[i++] - '0'; + } + *unit = u; + if (namep) + *namep = &name[i]; + if (name[i]) + return (2); + return (1); +} + +/* + * Helper sysctl for devname(3). We're given a {u}dev_t and return + * the name, if any, registered by the device driver. + */ +static int +sysctl_devname(SYSCTL_HANDLER_ARGS) +{ + int error; + udev_t ud; + dev_t dev; + + error = SYSCTL_IN(req, &ud, sizeof (ud)); + if (error) + return (error); + if (ud == NOUDEV) + return(EINVAL); + dev = makedev(umajor(ud), uminor(ud)); + if (dev->si_name[0] == '\0') + error = ENOENT; + else + error = SYSCTL_OUT(req, dev->si_name, strlen(dev->si_name) + 1); + freedev(dev); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY, + NULL, 0, sysctl_devname, "", "devname(3) handler"); + +/* + * Set ready_for_devs; prior to this point, device creation is not allowed. + */ +static void +dev_set_ready(void *junk) +{ + ready_for_devs = 1; +} + +SYSINIT(dev_ready, SI_SUB_DEVFS, SI_ORDER_FIRST, dev_set_ready, NULL); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c new file mode 100644 index 0000000..15837d3 --- /dev/null +++ b/sys/kern/kern_descrip.c @@ -0,0 +1,2210 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
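Before kern_descrip.c begins: the dev_stdclone() helper that closes kern_conf.c above is easiest to follow with a worked case. A sketch (illustrative names) of how a cloning driver might parse a requested node name:

        char *rest;
        int unit;

        /* "da1s2" with stem "da": returns 2, unit == 1, rest -> "s2".   */
        /* "da1"   with stem "da": returns 1, unit == 1, rest -> "".     */
        /* "ad0", "da", "da01":    returns 0 (wrong stem, no digit, or a
         *                         leading zero).                        */
        switch (dev_stdclone(name, &rest, "da", &unit)) {
        case 1:
        case 2:
                /* ... create or look up the requested unit ... */
                break;
        default:
                break;
        }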
+ * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/conf.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/unistd.h> +#include <sys/resourcevar.h> +#include <sys/event.h> +#include <sys/sx.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + +uma_zone_t file_zone; + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = { + /* open */ fdopen, + /* close */ noclose, + /* read */ noread, + /* write */ nowrite, + /* ioctl */ noioctl, + /* poll */ nopoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "FD", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, +}; + +static int do_dup(struct filedesc *fdp, int old, int new, register_t *retval, struct thread *td); +static int badfo_readwrite(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int badfo_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int badfo_poll(struct file *fp, int events, + struct ucred *cred, struct thread *td); +static int badfo_kqfilter(struct file *fp, struct knote *kn); +static int badfo_stat(struct file *fp, struct stat *sb, struct thread *td); +static int badfo_close(struct file *fp, struct thread *td); + +/* + * Descriptor management. + */ +struct filelist filehead; /* head of list of open files */ +int nfiles; /* actual number of open files */ +extern int cmask; +struct sx filelist_lock; /* sx to protect filelist */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getdtablesize(td, uap) + struct thread *td; + struct getdtablesize_args *uap; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + td->td_retval[0] = + min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + mtx_unlock(&Giant); + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + * + * note: keep in mind that a potential race condition exists when closing + * descriptors from a shared descriptor table (via rfork). 
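For orientation, the classic userland use of the dup2() semantics implemented below is redirecting a standard descriptor (illustrative path):

#include <fcntl.h>
#include <unistd.h>

/* Point fd 1 (stdout) at a log file, then drop the temporary fd. */
static void
redirect_stdout(void)
{
        int fd;

        fd = open("/tmp/example.log", O_WRONLY | O_CREAT | O_APPEND, 0644);
        if (fd >= 0 && fd != STDOUT_FILENO) {
                dup2(fd, STDOUT_FILENO);
                close(fd);
        }
}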
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +dup2(td, uap) + struct thread *td; + struct dup2_args *uap; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register u_int old = uap->from, new = uap->to; + int i, error; + + FILEDESC_LOCK(fdp); +retry: + if (old >= fdp->fd_nfiles || + fdp->fd_ofiles[old] == NULL || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if (old == new) { + td->td_retval[0] = new; + FILEDESC_UNLOCK(fdp); + return (0); + } + if (new >= fdp->fd_nfiles) { + if ((error = fdalloc(td, new, &i))) { + FILEDESC_UNLOCK(fdp); + return (error); + } + /* + * fdalloc() may block, retest everything. + */ + goto retry; + } + error = do_dup(fdp, (int)old, (int)new, td->td_retval, td); + return(error); +} + +/* + * Duplicate a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +dup(td, uap) + struct thread *td; + struct dup_args *uap; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if ((error = fdalloc(td, 0, &new))) { + FILEDESC_UNLOCK(fdp); + return (error); + } + error = do_dup(fdp, (int)old, new, td->td_retval, td); + return (error); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fcntl(td, uap) + struct thread *td; + register struct fcntl_args *uap; +{ + register struct proc *p = td->td_proc; + register struct filedesc *fdp; + register struct file *fp; + register char *pop; + struct vnode *vp; + int i, tmp, error = 0, flg = F_POSIX; + struct flock fl; + u_int newmin; + struct proc *leaderp; + + mtx_lock(&Giant); + + fdp = p->p_fd; + FILEDESC_LOCK(fdp); + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + goto done2; + } + pop = &fdp->fd_ofileflags[uap->fd]; + + switch (uap->cmd) { + case F_DUPFD: + newmin = uap->arg; + if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + newmin >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); + error = EINVAL; + break; + } + if ((error = fdalloc(td, newmin, &i))) { + FILEDESC_UNLOCK(fdp); + break; + } + error = do_dup(fdp, uap->fd, i, td->td_retval, td); + break; + + case F_GETFD: + td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; + FILEDESC_UNLOCK(fdp); + break; + + case F_SETFD: + *pop = (*pop &~ UF_EXCLOSE) | + (uap->arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); + FILEDESC_UNLOCK(fdp); + break; + + case F_GETFL: + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = OFLAGS(fp->f_flag); + FILE_UNLOCK(fp); + break; + + case F_SETFL: + fhold(fp); + FILEDESC_UNLOCK(fdp); + fp->f_flag &= ~FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + if (error) { + fdrop(fp, td); + break; + } + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); + if (!error) { + fdrop(fp, td); + break; + } + fp->f_flag &= ~FNONBLOCK; + tmp = 0; + (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + fdrop(fp, td); + break; + + case F_GETOWN: + fhold(fp); + FILEDESC_UNLOCK(fdp); + error = fo_ioctl(fp, FIOGETOWN, (caddr_t)td->td_retval, td); + fdrop(fp, td); + break; + + case F_SETOWN: + fhold(fp); + FILEDESC_UNLOCK(fdp); + error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, td); + fdrop(fp, td); + break; + + case F_SETLKW: + flg |= F_WAIT; + /* Fall into F_SETLK */ + + case F_SETLK: + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + break; + } + vp = (struct vnode *)fp->f_data; + /* + * copyin/lockop may block + */ + fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) { + fdrop(fp, td); + break; + } + if (fl.l_whence == SEEK_CUR) { + if (fp->f_offset < 0 || + (fl.l_start > 0 && + fp->f_offset > OFF_MAX - fl.l_start)) { + fdrop(fp, td); + error = EOVERFLOW; + break; + } + fl.l_start += fp->f_offset; + } + + switch (fl.l_type) { + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p); + p->p_flag |= P_ADVLOCK; + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, + &fl, flg); + break; + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p); + p->p_flag |= P_ADVLOCK; + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, + &fl, flg); + break; + case F_UNLCK: + PROC_LOCK(p); + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_UNLCK, + &fl, F_POSIX); + break; + default: + error = EINVAL; + break; + } + fdrop(fp, td); + break; + + case F_GETLK: + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + break; + } + vp = (struct vnode *)fp->f_data; + /* + * copyin/lockop may block + */ + fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) { + fdrop(fp, td); + break; + } + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) { + fdrop(fp, td); + error = EINVAL; + break; + } + if (fl.l_whence == SEEK_CUR) { + if ((fl.l_start > 0 && + fp->f_offset > OFF_MAX - fl.l_start) || + (fl.l_start < 0 && + fp->f_offset < OFF_MIN - fl.l_start)) { + fdrop(fp, td); + error = EOVERFLOW; + break; + } + fl.l_start += fp->f_offset; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, + &fl, F_POSIX); + fdrop(fp, td); + if (error == 0) { + error = copyout((caddr_t)&fl, + (caddr_t)(intptr_t)uap->arg, sizeof(fl)); + } + break; + default: + FILEDESC_UNLOCK(fdp); + error = EINVAL; + break; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Common code for dup, dup2, and fcntl(F_DUPFD). 
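The F_SETLK/F_SETLKW cases above copy a struct flock in from userland and hand it to VOP_ADVLOCK(). The matching userland call is a sketch like the following (fd is assumed to be an open, writable descriptor):

#include <fcntl.h>
#include <unistd.h>

/* Take an exclusive advisory lock on the whole file, waiting if needed. */
static int
lock_whole_file(int fd)
{
        struct flock fl;

        fl.l_type = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start = 0;
        fl.l_len = 0;                   /* zero length == to end of file */
        return (fcntl(fd, F_SETLKW, &fl));
}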
+ * filedesc must be locked, but will be unlocked as a side effect. + */ +static int +do_dup(fdp, old, new, retval, td) + register struct filedesc *fdp; + register int old, new; + register_t *retval; + struct thread *td; +{ + struct file *fp; + struct file *delfp; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + /* + * Save info on the descriptor being overwritten. We have + * to do the unmap now, but we cannot close it without + * introducing an ownership race for the slot. + */ + delfp = fdp->fd_ofiles[new]; +#if 0 + if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED)) + (void) munmapfd(td, new); +#endif + + /* + * Duplicate the source descriptor, update lastfile + */ + fp = fdp->fd_ofiles[old]; + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + fhold(fp); + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + + FILEDESC_UNLOCK(fdp); + + /* + * If we dup'd over a valid file, we now own the reference to it + * and must dispose of it using closef() semantics (as if a + * close() were performed on it). + */ + if (delfp) { + mtx_lock(&Giant); + (void) closef(delfp, td); + mtx_unlock(&Giant); + } + return (0); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(sigiop) + struct sigio **sigiop; +{ + struct sigio *sigio; + + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + *(sigio->sio_myref) = NULL; + if ((sigio)->sio_pgid < 0) { + struct pgrp *pg = (sigio)->sio_pgrp; + PGRP_LOCK(pg); + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + PGRP_UNLOCK(pg); + } else { + struct proc *p = (sigio)->sio_proc; + PROC_LOCK(p); + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); +} + +/* + * Free a list of sigio structures. + * We only need to lock the SIGIO_LOCK because we have made ourselves + * inaccessable to callers of fsetown and therefore do not need to lock + * the proc or pgrp struct for the list manipulation. + */ +void +funsetownlst(sigiolst) + struct sigiolst *sigiolst; +{ + struct sigio *sigio; + struct proc *p; + struct pgrp *pg; + + sigio = SLIST_FIRST(sigiolst); + if (sigio == NULL) + return; + + p = NULL; + pg = NULL; + + /* + * Every entry of the list should belong + * to a single proc or pgrp. + */ + if (sigio->sio_pgid < 0) { + pg = sigio->sio_pgrp; + PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); + } else /* if (sigio->sio_pgid > 0) */ { + p = sigio->sio_proc; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + } + + SIGIO_LOCK(); + while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { + *(sigio->sio_myref) = NULL; + if (pg != NULL) { + KASSERT(sigio->sio_pgid < 0, + ("Proc sigio in pgrp sigio list")); + KASSERT(sigio->sio_pgrp == pg, + ("Bogus pgrp in sigio list")); + PGRP_LOCK(pg); + SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, + sio_pgsigio); + PGRP_UNLOCK(pg); + } else /* if (p != NULL) */ { + KASSERT(sigio->sio_pgid > 0, + ("Pgrp sigio in proc sigio list")); + KASSERT(sigio->sio_proc == p, + ("Bogus proc in sigio list")); + PROC_LOCK(p); + SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, + sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); + SIGIO_LOCK(); + } + SIGIO_UNLOCK(); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 
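fsetown() below is the kernel side of F_SETOWN/FIOSETOWN. From userland, the usual sequence to receive SIGIO for a descriptor is a sketch like this (a SIGIO handler must be installed separately with sigaction(2)):

#include <fcntl.h>
#include <unistd.h>

/* Deliver SIGIO for this descriptor to the calling process. */
static int
enable_sigio(int fd)
{
        int flags;

        if (fcntl(fd, F_SETOWN, getpid()) == -1)
                return (-1);
        if ((flags = fcntl(fd, F_GETFL)) == -1)
                return (-1);
        return (fcntl(fd, F_SETFL, flags | O_ASYNC));
}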
+ * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pgid, sigiop) + pid_t pgid; + struct sigio **sigiop; +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int ret; + + if (pgid == 0) { + funsetown(sigiop); + return (0); + } + + ret = 0; + + /* Allocate and fill in the new sigio out of locks. */ + MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); + sigio->sio_pgid = pgid; + sigio->sio_ucred = crhold(curthread->td_ucred); + sigio->sio_myref = sigiop; + + sx_slock(&proctree_lock); + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) { + ret = ESRCH; + goto fail; + } + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + PROC_UNLOCK(proc); + if (proc->p_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) { + ret = ESRCH; + goto fail; + } + PGRP_UNLOCK(pgrp); + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + if (pgrp->pg_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + proc = NULL; + } + funsetown(sigiop); + if (pgid > 0) { + PROC_LOCK(proc); + /* + * since funsetownlst() is called without the proctree + * locked we need to check for P_WEXIT. + * XXX: is ESRCH correct? + */ + if ((proc->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(proc); + ret = ESRCH; + goto fail; + } + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + PROC_UNLOCK(proc); + } else { + PGRP_LOCK(pgrp); + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + PGRP_UNLOCK(pgrp); + } + sx_sunlock(&proctree_lock); + SIGIO_LOCK(); + *sigiop = sigio; + SIGIO_UNLOCK(); + return (0); + +fail: + sx_sunlock(&proctree_lock); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); + return (ret); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(sigio) + struct sigio *sigio; +{ + return (sigio != NULL ? sigio->sio_pgid : 0); +} + +/* + * Close a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +close(td, uap) + struct thread *td; + struct close_args *uap; +{ + register struct filedesc *fdp; + register struct file *fp; + register int fd = uap->fd; + int error = 0; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + goto done2; + } +#if 0 + if (fdp->fd_ofileflags[fd] & UF_MAPPED) + (void) munmapfd(td, fd); +#endif + fdp->fd_ofiles[fd] = NULL; + fdp->fd_ofileflags[fd] = 0; + + /* + * we now hold the fp reference that used to be owned by the descriptor + * array. 
+ */ + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + if (fd < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, fd); + } else + FILEDESC_UNLOCK(fdp); + + error = closef(fp, td); +done2: + mtx_unlock(&Giant); + return(error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ofstat(td, uap) + struct thread *td; + register struct ofstat_args *uap; +{ + struct file *fp; + struct stat ub; + struct ostat oub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) { + cvtstat(&ub, &oub); + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fstat(td, uap) + struct thread *td; + struct fstat_args *uap; +{ + struct file *fp; + struct stat ub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nfstat(td, uap) + struct thread *td; + register struct nfstat_args *uap; +{ + struct file *fp; + struct stat ub; + struct nstat nub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) { + cvtnstat(&ub, &nub); + error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fpathconf(td, uap) + struct thread *td; + register struct fpathconf_args *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + + switch (fp->f_type) { + case DTYPE_PIPE: + case DTYPE_SOCKET: + if (uap->name != _PC_PIPE_BUF) { + error = EINVAL; + } else { + td->td_retval[0] = PIPE_BUF; + error = 0; + } + break; + case DTYPE_FIFO: + case DTYPE_VNODE: + vp = (struct vnode *)fp->f_data; + mtx_lock(&Giant); + error = VOP_PATHCONF(vp, uap->name, td->td_retval); + mtx_unlock(&Giant); + break; + default: + error = EOPNOTSUPP; + break; + } + fdrop(fp, td); + return(error); +} + +/* + * Allocate a file descriptor for the process. 
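+ *
+ * The search starts at the higher of the requested descriptor and
+ * fd_freefile; if no free slot is found the ofile array is grown,
+ * first to NDEXTENT entries and then by doubling.  The debug.fdexpand
+ * sysctl below counts how many times such an expansion has happened.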
+ */ +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); + +int +fdalloc(td, want, result) + struct thread *td; + int want; + int *result; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register int i; + int lim, last, nfiles; + struct file **newofile, **oldofile; + char *newofileflags; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + /* + * Search for a free descriptor starting at the higher + * of want or fd_freefile. If that fails, consider + * expanding the ofile array. + */ + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + for (;;) { + last = min(fdp->fd_nfiles, lim); + if ((i = want) < fdp->fd_freefile) + i = fdp->fd_freefile; + for (; i < last; i++) { + if (fdp->fd_ofiles[i] == NULL) { + fdp->fd_ofileflags[i] = 0; + if (i > fdp->fd_lastfile) + fdp->fd_lastfile = i; + if (want <= fdp->fd_freefile) + fdp->fd_freefile = i; + *result = i; + return (0); + } + } + + /* + * No space in current array. Expand? + */ + if (fdp->fd_nfiles >= lim) + return (EMFILE); + if (fdp->fd_nfiles < NDEXTENT) + nfiles = NDEXTENT; + else + nfiles = 2 * fdp->fd_nfiles; + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + MALLOC(newofile, struct file **, nfiles * OFILESIZE, + M_FILEDESC, M_WAITOK); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + + /* + * deal with file-table extend race that might have occured + * when malloc was blocked. + */ + if (fdp->fd_nfiles >= nfiles) { + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + FREE(newofile, M_FILEDESC); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + continue; + } + newofileflags = (char *) &newofile[nfiles]; + /* + * Copy the existing ofile and ofileflags arrays + * and zero the new portion of each array. + */ + bcopy(fdp->fd_ofiles, newofile, + (i = sizeof(struct file *) * fdp->fd_nfiles)); + bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); + bcopy(fdp->fd_ofileflags, newofileflags, + (i = sizeof(char) * fdp->fd_nfiles)); + bzero(newofileflags + i, nfiles * sizeof(char) - i); + if (fdp->fd_nfiles > NDFILE) + oldofile = fdp->fd_ofiles; + else + oldofile = NULL; + fdp->fd_ofiles = newofile; + fdp->fd_ofileflags = newofileflags; + fdp->fd_nfiles = nfiles; + fdexpand++; + if (oldofile != NULL) { + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + FREE(oldofile, M_FILEDESC); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + } + } + return (0); +} + +/* + * Check to see whether n user file descriptors + * are available to the process p. + */ +int +fdavail(td, n) + struct thread *td; + register int n; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register struct file **fpp; + register int i, lim, last; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + + last = min(fdp->fd_nfiles, lim); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { + if (*fpp == NULL && --n <= 0) + return (1); + } + return (0); +} + +/* + * Create a new open file structure and allocate + * a file decriptor for the process that refers to it. 
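+ *
+ * A typical caller (compare fdcheckstd() later in this file and
+ * kqueue() in kern_event.c) looks roughly like:
+ *
+ *    error = falloc(td, &fp, &fd);
+ *    if (error)
+ *            return (error);
+ *    ... initialize fp->f_data, fp->f_flag, fp->f_ops, fp->f_type ...
+ *    td->td_retval[0] = fd;
+ *
+ * The new file is created with f_count == 1 and f_ops pointing at
+ * badfileops until the caller installs real file operations.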
+ */ +int +falloc(td, resultfp, resultfd) + register struct thread *td; + struct file **resultfp; + int *resultfd; +{ + struct proc *p = td->td_proc; + register struct file *fp, *fq; + int error, i; + + sx_xlock(&filelist_lock); + if (nfiles >= maxfiles) { + sx_xunlock(&filelist_lock); + tablefull("file"); + return (ENFILE); + } + nfiles++; + sx_xunlock(&filelist_lock); + /* + * Allocate a new file descriptor. + * If the process has file descriptor zero open, add to the list + * of open files at that point, otherwise put it at the front of + * the list of open files. + */ + fp = uma_zalloc(file_zone, M_WAITOK); + bzero(fp, sizeof(*fp)); + + /* + * wait until after malloc (which may have blocked) returns before + * allocating the slot, else a race might have shrunk it if we had + * allocated it before the malloc. + */ + FILEDESC_LOCK(p->p_fd); + if ((error = fdalloc(td, 0, &i))) { + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + nfiles--; + sx_xunlock(&filelist_lock); + uma_zfree(file_zone, fp); + return (error); + } + fp->f_mtxp = mtx_pool_alloc(); + fp->f_gcflag = 0; + fp->f_count = 1; + fp->f_cred = crhold(td->td_ucred); + fp->f_ops = &badfileops; + fp->f_seqcount = 1; + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + FILEDESC_LOCK(p->p_fd); + if ((fq = p->p_fd->fd_ofiles[0])) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + p->p_fd->fd_ofiles[i] = fp; + FILEDESC_UNLOCK(p->p_fd); + sx_xunlock(&filelist_lock); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Free a file descriptor. + */ +void +ffree(fp) + register struct file *fp; +{ + + KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!")); + sx_xlock(&filelist_lock); + LIST_REMOVE(fp, f_list); + nfiles--; + sx_xunlock(&filelist_lock); + crfree(fp->f_cred); + uma_zfree(file_zone, fp); +} + +/* + * Build a new filedesc structure. + */ +struct filedesc * +fdinit(td) + struct thread *td; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = td->td_proc->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK | M_ZERO); + mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + FILEDESC_LOCK(&newfdp->fd_fd); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + if (newfdp->fd_fd.fd_cdir) + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + if (newfdp->fd_fd.fd_rdir) + VREF(newfdp->fd_fd.fd_rdir); + newfdp->fd_fd.fd_jdir = fdp->fd_jdir; + if (newfdp->fd_fd.fd_jdir) + VREF(newfdp->fd_fd.fd_jdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + newfdp->fd_fd.fd_knlistsize = -1; + FILEDESC_UNLOCK(&newfdp->fd_fd); + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + FILEDESC_LOCK(p->p_fd); + p->p_fd->fd_refcnt++; + FILEDESC_UNLOCK(p->p_fd); + return (p->p_fd); +} + +/* + * Copy a filedesc structure. + */ +struct filedesc * +fdcopy(td) + struct thread *td; +{ + register struct filedesc *newfdp, *fdp = td->td_proc->p_fd; + register struct file **fpp; + register int i, j; + + /* Certain daemons might not have file descriptors. 
*/ + if (fdp == NULL) + return (NULL); + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + FILEDESC_UNLOCK(fdp); + MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + bcopy(fdp, newfdp, sizeof(struct filedesc)); + FILEDESC_UNLOCK(fdp); + bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx)); + mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + if (newfdp->fd_cdir) + VREF(newfdp->fd_cdir); + if (newfdp->fd_rdir) + VREF(newfdp->fd_rdir); + if (newfdp->fd_jdir) + VREF(newfdp->fd_jdir); + newfdp->fd_refcnt = 1; + + /* + * If the number of open files fits in the internal arrays + * of the open file structure, use them, otherwise allocate + * additional memory for the number of descriptors currently + * in use. + */ + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + if (newfdp->fd_lastfile < NDFILE) { + newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; + newfdp->fd_ofileflags = + ((struct filedesc0 *) newfdp)->fd_dfileflags; + i = NDFILE; + } else { + /* + * Compute the smallest multiple of NDEXTENT needed + * for the file descriptors currently in use, + * allowing the table to shrink. + */ +retry: + i = newfdp->fd_nfiles; + while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) + i /= 2; + FILEDESC_UNLOCK(fdp); + MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, + M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + j = newfdp->fd_nfiles; + while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2) + j /= 2; + if (i != j) { + /* + * The size of the original table has changed. + * Go over once again. + */ + FILEDESC_UNLOCK(fdp); + FREE(newfdp->fd_ofiles, M_FILEDESC); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + goto retry; + } + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; + } + newfdp->fd_nfiles = i; + bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); + bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); + + /* + * kq descriptors cannot be copied. + */ + if (newfdp->fd_knlistsize != -1) { + fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; + for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { + if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { + *fpp = NULL; + if (i < newfdp->fd_freefile) + newfdp->fd_freefile = i; + } + if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) + newfdp->fd_lastfile--; + } + newfdp->fd_knlist = NULL; + newfdp->fd_knlistsize = -1; + newfdp->fd_knhash = NULL; + newfdp->fd_knhashmask = 0; + } + + fpp = newfdp->fd_ofiles; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp != NULL) { + fhold(*fpp); + } + } + return (newfdp); +} + +/* + * Release a filedesc structure. + */ +void +fdfree(td) + struct thread *td; +{ + register struct filedesc *fdp; + struct file **fpp; + register int i; + + fdp = td->td_proc->p_fd; + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + FILEDESC_LOCK(fdp); + if (--fdp->fd_refcnt > 0) { + FILEDESC_UNLOCK(fdp); + return; + } + /* + * we are the last reference to the structure, we can + * safely assume it will not change out from under us. 
+ */ + FILEDESC_UNLOCK(fdp); + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp) + (void) closef(*fpp, td); + } + + PROC_LOCK(td->td_proc); + td->td_proc->p_fd = NULL; + PROC_UNLOCK(td->td_proc); + + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + if (fdp->fd_cdir) + vrele(fdp->fd_cdir); + if (fdp->fd_rdir) + vrele(fdp->fd_rdir); + if (fdp->fd_jdir) + vrele(fdp->fd_jdir); + if (fdp->fd_knlist) + FREE(fdp->fd_knlist, M_KQUEUE); + if (fdp->fd_knhash) + FREE(fdp->fd_knhash, M_KQUEUE); + mtx_destroy(&fdp->fd_mtx); + FREE(fdp, M_FILEDESC); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. + * + * This is a gross hack to plug the hole. A better solution would involve + * a special vop or other form of generalized access control mechanism. We + * go ahead and just reject all procfs filesystems accesses as dangerous. + * + * Since setugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. We also don't for check setugidness since we know we are. + */ +static int +is_unsafe(struct file *fp) +{ + if (fp->f_type == DTYPE_VNODE && + ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS) + return (1); + return (0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +setugidsafety(td) + struct thread *td; +{ + struct filedesc *fdp = td->td_proc->p_fd; + register int i; + + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + /* + * note: fdp->fd_ofiles may be reallocated out from under us while + * we are blocked in a close. Be careful! + */ + FILEDESC_LOCK(fdp); + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (i > 2) + break; + if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { + struct file *fp; + +#if 0 + if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0) + (void) munmapfd(td, i); +#endif + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_LOCK(fdp); + } + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(td) + struct thread *td; +{ + struct filedesc *fdp = td->td_proc->p_fd; + register int i; + + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + FILEDESC_LOCK(fdp); + + /* + * We cannot cache fd_ofiles or fd_ofileflags since operations + * may block and rip them out from under us. + */ + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (fdp->fd_ofiles[i] != NULL && + (fdp->fd_ofileflags[i] & UF_EXCLOSE)) { + struct file *fp; + +#if 0 + if (fdp->fd_ofileflags[i] & UF_MAPPED) + (void) munmapfd(td, i); +#endif + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. 
+ */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_LOCK(fdp); + } + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(td) + struct thread *td; +{ + struct nameidata nd; + struct filedesc *fdp; + struct file *fp; + register_t retval; + int fd, i, error, flags, devnull; + + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return (0); + devnull = -1; + error = 0; + for (i = 0; i < 3; i++) { + if (fdp->fd_ofiles[i] != NULL) + continue; + if (devnull < 0) { + error = falloc(td, &fp, &fd); + if (error != 0) + break; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null", + td); + flags = FREAD | FWRITE; + error = vn_open(&nd, &flags, 0); + if (error != 0) { + FILEDESC_LOCK(fdp); + fdp->fd_ofiles[i] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + break; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + fp->f_data = (caddr_t)nd.ni_vp; + fp->f_flag = flags; + fp->f_ops = &vnops; + fp->f_type = DTYPE_VNODE; + VOP_UNLOCK(nd.ni_vp, 0, td); + devnull = fd; + } else { + FILEDESC_LOCK(fdp); + error = fdalloc(td, 0, &fd); + if (error != 0) { + FILEDESC_UNLOCK(fdp); + break; + } + error = do_dup(fdp, devnull, fd, &retval, td); + if (error != 0) + break; + } + } + return (error); +} + +/* + * Internal form of close. + * Decrement reference count on file structure. + * Note: td may be NULL when closing a file + * that was being passed in a message. + */ +int +closef(fp, td) + register struct file *fp; + register struct thread *td; +{ + struct vnode *vp; + struct flock lf; + + if (fp == NULL) + return (0); + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor. + */ + if (td && (td->td_proc->p_flag & P_ADVLOCK) && + fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + return (fdrop(fp, td)); +} + +/* + * Drop reference on struct file passed in, may call closef if the + * reference hits zero. + */ +int +fdrop(fp, td) + struct file *fp; + struct thread *td; +{ + + FILE_LOCK(fp); + return (fdrop_locked(fp, td)); +} + +/* + * Extract the file pointer associated with the specified descriptor for + * the current user process. + * + * If the descriptor doesn't exist, EBADF is returned. + * + * If the descriptor exists but doesn't match 'flags' then + * return EBADF for read attempts and EINVAL for write attempts. + * + * If 'hold' is set (non-zero) the file's refcount will be bumped on return. + * It should be droped with fdrop(). + * If it is not set, then the refcount will not be bumped however the + * thread's filedesc struct will be returned locked (for fgetsock). 
+ * + * If an error occured the non-zero error is returned and *fpp is set to NULL. + * Otherwise *fpp is set and zero is returned. + */ +static __inline +int +_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) +{ + struct filedesc *fdp; + struct file *fp; + + *fpp = NULL; + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return(EBADF); + FILEDESC_LOCK(fdp); + if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { + FILEDESC_UNLOCK(fdp); + return(EBADF); + } + + /* + * Note: FREAD failures returns EBADF to maintain backwards + * compatibility with what routines returned before. + * + * Only one flag, or 0, may be specified. + */ + if (flags == FREAD && (fp->f_flag & FREAD) == 0) { + FILEDESC_UNLOCK(fdp); + return(EBADF); + } + if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { + FILEDESC_UNLOCK(fdp); + return(EINVAL); + } + if (hold) { + fhold(fp); + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return(0); +} + +int +fget(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, 0, 1)); +} + +int +fget_read(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, FREAD, 1)); +} + +int +fget_write(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, FWRITE, 1)); +} + +/* + * Like fget() but loads the underlying vnode, or returns an error if + * the descriptor does not represent a vnode. Note that pipes use vnodes + * but never have VM objects (so VOP_GETVOBJECT() calls will return an + * error). The returned vnode will be vref()d. + */ + +static __inline +int +_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) +{ + struct file *fp; + int error; + + *vpp = NULL; + if ((error = _fget(td, fd, &fp, 0, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + error = EINVAL; + } else { + *vpp = (struct vnode *)fp->f_data; + vref(*vpp); + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (error); +} + +int +fgetvp(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, 0)); +} + +int +fgetvp_read(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, FREAD)); +} + +int +fgetvp_write(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, FWRITE)); +} + +/* + * Like fget() but loads the underlying socket, or returns an error if + * the descriptor does not represent a socket. + * + * We bump the ref count on the returned socket. XXX Also obtain the SX lock in + * the future. + */ +int +fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) +{ + struct file *fp; + int error; + + *spp = NULL; + if (fflagp) + *fflagp = 0; + if ((error = _fget(td, fd, &fp, 0, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_SOCKET) { + error = ENOTSOCK; + } else { + *spp = (struct socket *)fp->f_data; + if (fflagp) + *fflagp = fp->f_flag; + soref(*spp); + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + return(error); +} + +/* + * Drop the reference count on the the socket and XXX release the SX lock in + * the future. The last reference closes the socket. + */ +void +fputsock(struct socket *so) +{ + sorele(so); +} + +/* + * Drop reference on struct file passed in, may call closef if the + * reference hits zero. + * Expects struct file locked, and will unlock it. 
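+ *
+ * When the final reference is dropped, any flock()-style lock still
+ * held on the file (FHASLOCK) is released, fo_close() is called unless
+ * the file never got real file operations (f_ops == &badfileops), and
+ * the struct file itself is released with ffree().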
+ */ +int +fdrop_locked(fp, td) + struct file *fp; + struct thread *td; +{ + struct flock lf; + struct vnode *vp; + int error; + + FILE_LOCK_ASSERT(fp, MA_OWNED); + + if (--fp->f_count > 0) { + FILE_UNLOCK(fp); + return (0); + } + mtx_lock(&Giant); + if (fp->f_count < 0) + panic("fdrop: count < 0"); + if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + FILE_UNLOCK(fp); + (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + } else + FILE_UNLOCK(fp); + if (fp->f_ops != &badfileops) + error = fo_close(fp, td); + else + error = 0; + ffree(fp); + mtx_unlock(&Giant); + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on + * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +flock(td, uap) + struct thread *td; + register struct flock_args *uap; +{ + struct file *fp; + struct vnode *vp; + struct flock lf; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (EOPNOTSUPP); + } + + mtx_lock(&Giant); + vp = (struct vnode *)fp->f_data; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + FILE_LOCK(fp); + fp->f_flag &= ~FHASLOCK; + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + goto done2; + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else { + error = EBADF; + goto done2; + } + FILE_LOCK(fp); + fp->f_flag |= FHASLOCK; + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); +done2: + fdrop(fp, td); + mtx_unlock(&Giant); + return (error); +} + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + */ +/* ARGSUSED */ +static int +fdopen(dev, mode, type, td) + dev_t dev; + int mode, type; + struct thread *td; +{ + + /* + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(td, fdp, indx, dfd, mode, error) + struct thread *td; + struct filedesc *fdp; + int indx, dfd; + int mode; + int error; +{ + register struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, then reject. + */ + FILEDESC_LOCK(fdp); + if ((u_int)dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + + /* + * There are two cases of interest here. 
+ * + * For ENODEV simply dup (dfd) to file descriptor + * (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and + * store it in (indx). (dfd) is effectively closed by + * this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + FILE_LOCK(wfp); + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + FILE_UNLOCK(wfp); + FILEDESC_UNLOCK(fdp); + return (EACCES); + } + fp = fdp->fd_ofiles[indx]; +#if 0 + if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) + (void) munmapfd(td, indx); +#endif + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fhold_locked(wfp); + FILE_UNLOCK(wfp); + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd, and stuff it into indx. + */ + fp = fdp->fd_ofiles[indx]; +#if 0 + if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) + (void) munmapfd(td, indx); +#endif + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + + /* + * Complete the clean up of the filedesc structure by + * recomputing the various hints. + */ + if (indx > fdp->fd_lastfile) { + fdp->fd_lastfile = indx; + } else { + while (fdp->fd_lastfile > 0 && + fdp->fd_ofiles[fdp->fd_lastfile] == NULL) { + fdp->fd_lastfile--; + } + if (dfd < fdp->fd_freefile) + fdp->fd_freefile = dfd; + } + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); + return (0); + + default: + FILEDESC_UNLOCK(fdp); + return (error); + } + /* NOTREACHED */ +} + +/* + * Get file structures. 
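+ *
+ * The handler below exports the filehead pointer followed by one
+ * struct file per open file.  A userland consumer of the kern.file
+ * sysctl would typically size the buffer first; an illustrative
+ * sketch (error handling omitted, not taken from any real program):
+ *
+ *    int mib[2] = { CTL_KERN, KERN_FILE };
+ *    size_t len;
+ *    sysctl(mib, 2, NULL, &len, NULL, 0);    -- returns an overestimate
+ *    buf = malloc(len);
+ *    sysctl(mib, 2, buf, &len, NULL, 0);     -- filehead, then the files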
+ */ +static int +sysctl_kern_file(SYSCTL_HANDLER_ARGS) +{ + int error; + struct file *fp; + + sx_slock(&filelist_lock); + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + error = SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file)); + sx_sunlock(&filelist_lock); + return (error); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) { + sx_sunlock(&filelist_lock); + return (error); + } + + /* + * followed by an array of file structures + */ + LIST_FOREACH(fp, &filehead, f_list) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) { + sx_sunlock(&filelist_lock); + return (error); + } + } + sx_sunlock(&filelist_lock); + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", "Entire file table"); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + &nfiles, 0, "System-wide number of open files"); + +static void +fildesc_drvinit(void *unused) +{ + dev_t dev; + + dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); + if (!devfs_present) { + int fd; + + for (fd = 3; fd < NUMFDESC; fd++) + make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + } +} + +struct fileops badfileops = { + badfo_readwrite, + badfo_readwrite, + badfo_ioctl, + badfo_poll, + badfo_kqfilter, + badfo_stat, + badfo_close +}; + +static int +badfo_readwrite(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + + return (EBADF); +} + +static int +badfo_ioctl(fp, com, data, td) + struct file *fp; + u_long com; + caddr_t data; + struct thread *td; +{ + + return (EBADF); +} + +static int +badfo_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + + return (0); +} + +static int +badfo_kqfilter(fp, kn) + struct file *fp; + struct knote *kn; +{ + + return (0); +} + +static int +badfo_stat(fp, sb, td) + struct file *fp; + struct stat *sb; + struct thread *td; +{ + + return (EBADF); +} + +static int +badfo_close(fp, td) + struct file *fp; + struct thread *td; +{ + + return (EBADF); +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + +static void filelistinit(void *); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) + +/* ARGSUSED*/ +static void +filelistinit(dummy) + void *dummy; +{ + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + + sx_init(&filelist_lock, "filelist lock"); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); +} diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c new file mode 100644 index 0000000..a33b0c7 --- /dev/null +++ b/sys/kern/kern_environment.c @@ -0,0 +1,461 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The unified bootloader passes us a pointer to a preserved copy of + * bootstrap/kernel environment variables. We convert them to a + * dynamic array of strings later when the VM subsystem is up. + * + * We make these available through the kenv(2) syscall for userland + * and through getenv()/freeenv() setenv() unsetenv() testenv() for + * the kernel. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/libkern.h> +#include <sys/kenv.h> + +MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); + +#define KENV_SIZE 512 /* Maximum number of environment strings */ + +/* pointer to the static environment */ +char *kern_envp; +static char *kernenv_next(char *); + +/* dynamic environment variables */ +char **kenvp; +struct sx kenv_lock; + +/* + * No need to protect this with a mutex + * since SYSINITS are single threaded. 
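+ *
+ * dynamic_kenv is flipped to 1 at the end of init_dynamic_kenv()
+ * below; from that point on getenv() and friends use the malloc'ed
+ * kenvp[] array (protected by kenv_lock) instead of the static buffer
+ * handed over by the loader.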
+ */ +int dynamic_kenv = 0; + +#define KENV_CHECK if (!dynamic_kenv) \ + panic("%s: called before SI_SUB_KMEM", __func__) + +int +kenv(td, uap) + struct thread *td; + struct kenv_args /* { + syscallarg(int) what; + syscallarg(const char *) name; + syscallarg(char *) value; + syscallarg(int) len; + } */ *uap; +{ + char *name, *value; + size_t len, done; + int error, i; + + KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0")); + + error = 0; + if (SCARG(uap, what) == KENV_DUMP) { + len = 0; + /* Return the size if called with a NULL buffer */ + if (SCARG(uap, value) == NULL) { + sx_slock(&kenv_lock); + for (i = 0; kenvp[i] != NULL; i++) + len += strlen(kenvp[i]) + 1; + sx_sunlock(&kenv_lock); + td->td_retval[0] = len; + return (0); + } + done = 0; + sx_slock(&kenv_lock); + for (i = 0; kenvp[i] != NULL && done < SCARG(uap, len); i++) { + len = min(strlen(kenvp[i]) + 1, SCARG(uap, len) - done); + error = copyout(kenvp[i], SCARG(uap, value) + done, + len); + if (error) { + sx_sunlock(&kenv_lock); + return (error); + } + done += len; + } + sx_sunlock(&kenv_lock); + return (0); + } + + if ((SCARG(uap, what) == KENV_SET) || + (SCARG(uap, what) == KENV_UNSET)) { + error = suser(td); + if (error) + return (error); + } + + name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK); + + error = copyinstr(SCARG(uap, name), name, KENV_MNAMELEN, NULL); + if (error) + goto done; + + switch (SCARG(uap, what)) { + case KENV_GET: + value = getenv(name); + if (value == NULL) { + error = ENOENT; + goto done; + } + len = strlen(value) + 1; + if (len > SCARG(uap, len)) + len = SCARG(uap, len); + error = copyout(value, SCARG(uap, value), len); + freeenv(value); + if (error) + goto done; + td->td_retval[0] = len; + break; + case KENV_SET: + len = SCARG(uap, len); + if (len < 1) { + error = EINVAL; + goto done; + } + if (len > KENV_MVALLEN) + len = KENV_MVALLEN; + value = malloc(len, M_TEMP, M_WAITOK); + error = copyinstr(SCARG(uap, value), value, len, NULL); + if (error) { + free(value, M_TEMP); + goto done; + } + setenv(name, value); + free(value, M_TEMP); + break; + case KENV_UNSET: + error = unsetenv(name); + if (error) + error = ENOENT; + break; + default: + error = EINVAL; + break; + } +done: + free(name, M_TEMP); + return (error); +} + +/* + * Setup the dynamic kernel environment. + */ +static void +init_dynamic_kenv(void *data __unused) +{ + char *cp; + int len, i; + + kenvp = malloc(KENV_SIZE * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); + i = 0; + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + len = strlen(cp) + 1; + kenvp[i] = malloc(len, M_KENV, M_WAITOK); + strcpy(kenvp[i++], cp); + } + kenvp[i] = NULL; + + sx_init(&kenv_lock, "kernel environment"); + dynamic_kenv = 1; +} +SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL); + +void +freeenv(char *env) +{ + + if (dynamic_kenv) + free(env, M_KENV); +} + +/* + * Internal functions for string lookup. 
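+ *
+ * Both helpers match the name up to the '=' separator: _getenv_static
+ * walks the loader-supplied buffer of consecutive "name=value"
+ * strings, while _getenv_dynamic searches the kenvp[] array and can
+ * also report the matching index for use by setenv() and unsetenv().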
+ */ +static char * +_getenv_dynamic(const char *name, int *idx) +{ + char *cp; + int len, i; + + sx_assert(&kenv_lock, SX_LOCKED); + len = strlen(name); + for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { + if ((cp[len] == '=') && + (strncmp(cp, name, len) == 0)) { + if (idx != NULL) + *idx = i; + return (cp + len + 1); + } + } + return (NULL); +} + +static char * +_getenv_static(const char *name) +{ + char *cp, *ep; + int len; + + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + for (ep = cp; (*ep != '=') && (*ep != 0); ep++) + ; + if (*ep != '=') + continue; + len = ep - cp; + ep++; + if (!strncmp(name, cp, len) && name[len] == 0) + return (ep); + } + return (NULL); +} + +/* + * Look up an environment variable by name. + * Return a pointer to the string if found. + * The pointer has to be freed with freeenv() + * after use. + */ +char * +getenv(const char *name) +{ + char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; + char *ret, *cp; + int len; + + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = _getenv_dynamic(name, NULL); + if (cp != NULL) { + strcpy(buf, cp); + sx_sunlock(&kenv_lock); + len = strlen(buf) + 1; + ret = malloc(len, M_KENV, M_WAITOK); + strcpy(ret, buf); + } else { + sx_sunlock(&kenv_lock); + ret = NULL; + } + } else + ret = _getenv_static(name); + return (ret); +} + +/* + * Test if an environment variable is defined. + */ +int +testenv(const char *name) +{ + char *cp; + + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = _getenv_dynamic(name, NULL); + sx_sunlock(&kenv_lock); + } else + cp = _getenv_static(name); + if (cp != NULL) + return (1); + return (0); +} + +/* + * Set an environment variable by name. + */ +int +setenv(const char *name, const char *value) +{ + char *buf, *cp, *oldenv; + int namelen, vallen, i; + + KENV_CHECK; + + namelen = strlen(name) + 1; + if (namelen > KENV_MNAMELEN) + return (-1); + vallen = strlen(value) + 1; + if (vallen > KENV_MVALLEN) + return (-1); + buf = malloc(namelen + vallen, M_KENV, M_WAITOK); + sprintf(buf, "%s=%s", name, value); + + sx_xlock(&kenv_lock); + cp = _getenv_dynamic(name, &i); + if (cp != NULL) { + oldenv = kenvp[i]; + kenvp[i] = buf; + sx_xunlock(&kenv_lock); + free(oldenv, M_KENV); + } else { + /* We add the option if it wasn't found */ + for (i = 0; (cp = kenvp[i]) != NULL; i++) + ; + kenvp[i] = buf; + kenvp[i + 1] = NULL; + sx_xunlock(&kenv_lock); + } + return (0); +} + +/* + * Unset an environment variable string. + */ +int +unsetenv(const char *name) +{ + char *cp, *oldenv; + int i, j; + + KENV_CHECK; + + sx_xlock(&kenv_lock); + cp = _getenv_dynamic(name, &i); + if (cp != NULL) { + oldenv = kenvp[i]; + for (j = i + 1; kenvp[j] != NULL; j++) + kenvp[i++] = kenvp[j]; + kenvp[i] = NULL; + sx_xunlock(&kenv_lock); + free(oldenv, M_KENV); + return (0); + } + sx_xunlock(&kenv_lock); + return (-1); +} + +/* + * Return a string value from an environment variable. + */ +int +getenv_string(const char *name, char *data, int size) +{ + char *tmp; + + tmp = getenv(name); + if (tmp != NULL) { + strncpy(data, tmp, size); + freeenv(tmp); + data[size - 1] = 0; + return (1); + } else + return (0); +} + +/* + * Return an integer value from an environment variable. + */ +int +getenv_int(const char *name, int *data) +{ + quad_t tmp; + int rval; + + rval = getenv_quad(name, &tmp); + if (rval) + *data = (int) tmp; + return (rval); +} + +/* + * Return a quad_t value from an environment variable. 
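+ *
+ * The string must parse completely (strtoq() has to consume it all),
+ * otherwise 0 is returned and *data is left untouched.  A hypothetical
+ * caller (the variable name is only an example):
+ *
+ *    quad_t maxmem;
+ *    if (getenv_quad("hw.maxmem", &maxmem))
+ *            ... use maxmem ...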
+ */ +int +getenv_quad(const char *name, quad_t *data) +{ + char *value; + char *vtp; + quad_t iv; + + value = getenv(name); + if (value == NULL) + return (0); + iv = strtoq(value, &vtp, 0); + if ((vtp == value) || (*vtp != '\0')) { + freeenv(value); + return (0); + } + freeenv(value); + *data = iv; + return (1); +} + +/* + * Find the next entry after the one which (cp) falls within, return a + * pointer to its start or NULL if there are no more. + */ +static char * +kernenv_next(char *cp) +{ + + if (cp != NULL) { + while (*cp != 0) + cp++; + cp++; + if (*cp == 0) + cp = NULL; + } + return (cp); +} + +void +tunable_int_init(void *data) +{ + struct tunable_int *d = (struct tunable_int *)data; + + TUNABLE_INT_FETCH(d->path, d->var); +} + +void +tunable_quad_init(void *data) +{ + struct tunable_quad *d = (struct tunable_quad *)data; + + TUNABLE_QUAD_FETCH(d->path, d->var); +} + +void +tunable_str_init(void *data) +{ + struct tunable_str *d = (struct tunable_str *)data; + + TUNABLE_STR_FETCH(d->path, d->var, d->size); +} diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c new file mode 100644 index 0000000..46d57c9 --- /dev/null +++ b/sys/kern/kern_event.c @@ -0,0 +1,1082 @@ +/*- + * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/selinfo.h> +#include <sys/queue.h> +#include <sys/event.h> +#include <sys/eventvar.h> +#include <sys/poll.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/uio.h> + +#include <vm/uma.h> + +MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); + +static int kqueue_scan(struct file *fp, int maxevents, + struct kevent *ulistp, const struct timespec *timeout, + struct thread *td); +static int kqueue_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int kqueue_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int kqueue_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int kqueue_kqfilter(struct file *fp, struct knote *kn); +static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td); +static int kqueue_close(struct file *fp, struct thread *td); +static void kqueue_wakeup(struct kqueue *kq); + +static struct fileops kqueueops = { + kqueue_read, + kqueue_write, + kqueue_ioctl, + kqueue_poll, + kqueue_kqfilter, + kqueue_stat, + kqueue_close +}; + +static void knote_attach(struct knote *kn, struct filedesc *fdp); +static void knote_drop(struct knote *kn, struct thread *td); +static void knote_enqueue(struct knote *kn); +static void knote_dequeue(struct knote *kn); +static void knote_init(void); +static struct knote *knote_alloc(void); +static void knote_free(struct knote *kn); + +static void filt_kqdetach(struct knote *kn); +static int filt_kqueue(struct knote *kn, long hint); +static int filt_procattach(struct knote *kn); +static void filt_procdetach(struct knote *kn); +static int filt_proc(struct knote *kn, long hint); +static int filt_fileattach(struct knote *kn); +static void filt_timerexpire(void *knx); +static int filt_timerattach(struct knote *kn); +static void filt_timerdetach(struct knote *kn); +static int filt_timer(struct knote *kn, long hint); + +static struct filterops file_filtops = + { 1, filt_fileattach, NULL, NULL }; +static struct filterops kqread_filtops = + { 1, NULL, filt_kqdetach, filt_kqueue }; +static struct filterops proc_filtops = + { 0, filt_procattach, filt_procdetach, filt_proc }; +static struct filterops timer_filtops = + { 0, filt_timerattach, filt_timerdetach, filt_timer }; + +static uma_zone_t knote_zone; +static int kq_ncallouts = 0; +static int kq_calloutmax = (4 * 1024); +SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, + &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); + +#define KNOTE_ACTIVATE(kn) do { \ + kn->kn_status |= KN_ACTIVE; \ + if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ + knote_enqueue(kn); \ +} while(0) + +#define KN_HASHSIZE 64 /* XXX should be tunable */ +#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) + +static int +filt_nullattach(struct knote *kn) +{ + + return (ENXIO); +}; + +struct filterops null_filtops = + { 0, filt_nullattach, NULL, NULL }; + +extern struct filterops sig_filtops; + +/* + * Table for for all system-defined filters. 
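+ *
+ * System filters are small negative numbers and the table is indexed
+ * by their one's complement, so ~EVFILT_READ == ~(-1) == 0 selects the
+ * first entry and so on down the list.  kqueue_register() and
+ * kqueue_add_filteropts()/kqueue_del_filteropts() below all index the
+ * table as sysfilt_ops[~filt].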
+ */ +static struct filterops *sysfilt_ops[] = { + &file_filtops, /* EVFILT_READ */ + &file_filtops, /* EVFILT_WRITE */ + &null_filtops, /* EVFILT_AIO */ + &file_filtops, /* EVFILT_VNODE */ + &proc_filtops, /* EVFILT_PROC */ + &sig_filtops, /* EVFILT_SIGNAL */ + &timer_filtops, /* EVFILT_TIMER */ + &file_filtops, /* EVFILT_NETDEV */ +}; + +static int +filt_fileattach(struct knote *kn) +{ + + return (fo_kqfilter(kn->kn_fp, kn)); +} + +/*ARGSUSED*/ +static int +kqueue_kqfilter(struct file *fp, struct knote *kn) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + if (kn->kn_filter != EVFILT_READ) + return (1); + + kn->kn_fop = &kqread_filtops; + SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); + return (0); +} + +static void +filt_kqdetach(struct knote *kn) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); +} + +/*ARGSUSED*/ +static int +filt_kqueue(struct knote *kn, long hint) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + kn->kn_data = kq->kq_count; + return (kn->kn_data > 0); +} + +static int +filt_procattach(struct knote *kn) +{ + struct proc *p; + int error; + + p = pfind(kn->kn_id); + if (p == NULL) + return (ESRCH); + if ((error = p_cansee(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + /* + * internal flag indicating registration done by kernel + */ + if (kn->kn_flags & EV_FLAG1) { + kn->kn_data = kn->kn_sdata; /* ppid */ + kn->kn_fflags = NOTE_CHILD; + kn->kn_flags &= ~EV_FLAG1; + } + + SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); + PROC_UNLOCK(p); + + return (0); +} + +/* + * The knote may be attached to a different process, which may exit, + * leaving nothing for the knote to be attached to. So when the process + * exits, the knote is marked as DETACHED and also flagged as ONESHOT so + * it will be deleted when read out. However, as part of the knote deletion, + * this routine is called, so a check is needed to avoid actually performing + * a detach, because the original process does not exist any more. + */ +static void +filt_procdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + if (kn->kn_status & KN_DETACHED) + return; + + PROC_LOCK(p); + SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); + PROC_UNLOCK(p); +} + +static int +filt_proc(struct knote *kn, long hint) +{ + u_int event; + + /* + * mask off extra data + */ + event = (u_int)hint & NOTE_PCTRLMASK; + + /* + * if the user is interested in this event, record it. + */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* + * process is gone, so flag the event as finished. + */ + if (event == NOTE_EXIT) { + kn->kn_status |= KN_DETACHED; + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + /* + * process forked, and user wants to track the new process, + * so attach a new knote to it, and immediately report an + * event with the parent's pid. + */ + if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { + struct kevent kev; + int error; + + /* + * register knote with new process. 
+ */ + kev.ident = hint & NOTE_PDATAMASK; /* pid */ + kev.filter = kn->kn_filter; + kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; + kev.fflags = kn->kn_sfflags; + kev.data = kn->kn_id; /* parent */ + kev.udata = kn->kn_kevent.udata; /* preserve udata */ + error = kqueue_register(kn->kn_kq, &kev, NULL); + if (error) + kn->kn_fflags |= NOTE_TRACKERR; + } + + return (kn->kn_fflags != 0); +} + +static void +filt_timerexpire(void *knx) +{ + struct knote *kn = knx; + struct callout *calloutp; + struct timeval tv; + int tticks; + + kn->kn_data++; + KNOTE_ACTIVATE(kn); + + if ((kn->kn_flags & EV_ONESHOT) == 0) { + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + calloutp = (struct callout *)kn->kn_hook; + callout_reset(calloutp, tticks, filt_timerexpire, kn); + } +} + +/* + * data contains amount of time to sleep, in milliseconds + */ +static int +filt_timerattach(struct knote *kn) +{ + struct callout *calloutp; + struct timeval tv; + int tticks; + + if (kq_ncallouts >= kq_calloutmax) + return (ENOMEM); + kq_ncallouts++; + + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + + kn->kn_flags |= EV_CLEAR; /* automatically set */ + MALLOC(calloutp, struct callout *, sizeof(*calloutp), + M_KQUEUE, M_WAITOK); + callout_init(calloutp, 0); + callout_reset(calloutp, tticks, filt_timerexpire, kn); + kn->kn_hook = calloutp; + + return (0); +} + +static void +filt_timerdetach(struct knote *kn) +{ + struct callout *calloutp; + + calloutp = (struct callout *)kn->kn_hook; + callout_stop(calloutp); + FREE(calloutp, M_KQUEUE); + kq_ncallouts--; +} + +static int +filt_timer(struct knote *kn, long hint) +{ + + return (kn->kn_data != 0); +} + +/* + * MPSAFE + */ +int +kqueue(struct thread *td, struct kqueue_args *uap) +{ + struct filedesc *fdp; + struct kqueue *kq; + struct file *fp; + int fd, error; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + error = falloc(td, &fp, &fd); + if (error) + goto done2; + kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + TAILQ_INIT(&kq->kq_head); + FILE_LOCK(fp); + fp->f_flag = FREAD | FWRITE; + fp->f_type = DTYPE_KQUEUE; + fp->f_ops = &kqueueops; + TAILQ_INIT(&kq->kq_head); + fp->f_data = kq; + FILE_UNLOCK(fp); + FILEDESC_LOCK(fdp); + td->td_retval[0] = fd; + if (fdp->fd_knlistsize < 0) + fdp->fd_knlistsize = 0; /* this process has a kq */ + FILEDESC_UNLOCK(fdp); + kq->kq_fdp = fdp; +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kevent_args { + int fd; + const struct kevent *changelist; + int nchanges; + struct kevent *eventlist; + int nevents; + const struct timespec *timeout; +}; +#endif +/* + * MPSAFE + */ +int +kevent(struct thread *td, struct kevent_args *uap) +{ + struct kevent *kevp; + struct kqueue *kq; + struct file *fp; + struct timespec ts; + int i, n, nerrors, error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_KQUEUE) { + fdrop(fp, td); + return (EBADF); + } + if (uap->timeout != NULL) { + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + goto done_nogiant; + uap->timeout = &ts; + } + mtx_lock(&Giant); + + kq = (struct kqueue *)fp->f_data; + nerrors = 0; + + while (uap->nchanges > 0) { + n = uap->nchanges > KQ_NEVENTS ? 
KQ_NEVENTS : uap->nchanges; + error = copyin(uap->changelist, kq->kq_kev, + n * sizeof(struct kevent)); + if (error) + goto done; + for (i = 0; i < n; i++) { + kevp = &kq->kq_kev[i]; + kevp->flags &= ~EV_SYSFLAGS; + error = kqueue_register(kq, kevp, td); + if (error) { + if (uap->nevents != 0) { + kevp->flags = EV_ERROR; + kevp->data = error; + (void) copyout(kevp, + uap->eventlist, + sizeof(*kevp)); + uap->eventlist++; + uap->nevents--; + nerrors++; + } else { + goto done; + } + } + } + uap->nchanges -= n; + uap->changelist += n; + } + if (nerrors) { + td->td_retval[0] = nerrors; + error = 0; + goto done; + } + + error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); +done: + mtx_unlock(&Giant); +done_nogiant: + if (fp != NULL) + fdrop(fp, td); + return (error); +} + +int +kqueue_add_filteropts(int filt, struct filterops *filtops) +{ + + if (filt > 0) + panic("filt(%d) > 0", filt); + if (filt + EVFILT_SYSCOUNT < 0) + panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", + filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); + if (sysfilt_ops[~filt] != &null_filtops) + panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); + sysfilt_ops[~filt] = filtops; + return (0); +} + +int +kqueue_del_filteropts(int filt) +{ + + if (filt > 0) + panic("filt(%d) > 0", filt); + if (filt + EVFILT_SYSCOUNT < 0) + panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", + filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); + if (sysfilt_ops[~filt] == &null_filtops) + panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); + sysfilt_ops[~filt] = &null_filtops; + return (0); +} + +int +kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td) +{ + struct filedesc *fdp = kq->kq_fdp; + struct filterops *fops; + struct file *fp = NULL; + struct knote *kn = NULL; + int s, error = 0; + + if (kev->filter < 0) { + if (kev->filter + EVFILT_SYSCOUNT < 0) + return (EINVAL); + fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ + } else { + /* + * XXX + * filter attach routine is responsible for insuring that + * the identifier can be attached to it. + */ + printf("unknown filter: %d\n", kev->filter); + return (EINVAL); + } + + FILEDESC_LOCK(fdp); + if (fops->f_isfd) { + /* validate descriptor */ + if ((u_int)kev->ident >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[kev->ident]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + fhold(fp); + + if (kev->ident < fdp->fd_knlistsize) { + SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) + if (kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } else { + if (fdp->fd_knhashmask != 0) { + struct klist *list; + + list = &fdp->fd_knhash[ + KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; + SLIST_FOREACH(kn, list, kn_link) + if (kev->ident == kn->kn_id && + kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } + FILEDESC_UNLOCK(fdp); + + if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { + error = ENOENT; + goto done; + } + + /* + * kn now contains the matching knote, or NULL if no match + */ + if (kev->flags & EV_ADD) { + + if (kn == NULL) { + kn = knote_alloc(); + if (kn == NULL) { + error = ENOMEM; + goto done; + } + kn->kn_fp = fp; + kn->kn_kq = kq; + kn->kn_fop = fops; + + /* + * apply reference count to knote structure, and + * do not release it at the end of this routine. 
+ */ + fp = NULL; + + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kev->fflags = 0; + kev->data = 0; + kn->kn_kevent = *kev; + + knote_attach(kn, fdp); + if ((error = fops->f_attach(kn)) != 0) { + knote_drop(kn, td); + goto done; + } + } else { + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. + */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kn->kn_kevent.udata = kev->udata; + } + + s = splhigh(); + if (kn->kn_fop->f_event(kn, 0)) + KNOTE_ACTIVATE(kn); + splx(s); + + } else if (kev->flags & EV_DELETE) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + goto done; + } + + if ((kev->flags & EV_DISABLE) && + ((kn->kn_status & KN_DISABLED) == 0)) { + s = splhigh(); + kn->kn_status |= KN_DISABLED; + splx(s); + } + + if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { + s = splhigh(); + kn->kn_status &= ~KN_DISABLED; + if ((kn->kn_status & KN_ACTIVE) && + ((kn->kn_status & KN_QUEUED) == 0)) + knote_enqueue(kn); + splx(s); + } + +done: + if (fp != NULL) + fdrop(fp, td); + return (error); +} + +static int +kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, + const struct timespec *tsp, struct thread *td) +{ + struct kqueue *kq; + struct kevent *kevp; + struct timeval atv, rtv, ttv; + struct knote *kn, marker; + int s, count, timeout, nkev = 0, error = 0; + + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); + + kq = (struct kqueue *)fp->f_data; + count = maxevents; + if (count == 0) + goto done; + + if (tsp != NULL) { + TIMESPEC_TO_TIMEVAL(&atv, tsp); + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) + timeout = -1; + else + timeout = atv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&atv); + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + timeout = 0; + } + goto start; + +retry: + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timeout = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + +start: + kevp = kq->kq_kev; + s = splhigh(); + if (kq->kq_count == 0) { + if (timeout < 0) { + error = EWOULDBLOCK; + } else { + kq->kq_state |= KQ_SLEEP; + error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); + } + splx(s); + if (error == 0) + goto retry; + /* don't restart after signals... 
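ERESTART is turned into EINTR, and EWOULDBLOCK (the poll case) is reported as success with no events.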
*/ + if (error == ERESTART) + error = EINTR; + else if (error == EWOULDBLOCK) + error = 0; + goto done; + } + + TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); + while (count) { + kn = TAILQ_FIRST(&kq->kq_head); + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + if (kn == &marker) { + splx(s); + if (count == maxevents) + goto retry; + goto done; + } + if (kn->kn_status & KN_DISABLED) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + continue; + } + if ((kn->kn_flags & EV_ONESHOT) == 0 && + kn->kn_fop->f_event(kn, 0) == 0) { + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + continue; + } + *kevp = kn->kn_kevent; + kevp++; + nkev++; + if (kn->kn_flags & EV_ONESHOT) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + s = splhigh(); + } else if (kn->kn_flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + } else { + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + } + count--; + if (nkev == KQ_NEVENTS) { + splx(s); + error = copyout(&kq->kq_kev, ulistp, + sizeof(struct kevent) * nkev); + ulistp += nkev; + nkev = 0; + kevp = kq->kq_kev; + s = splhigh(); + if (error) + break; + } + } + TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); + splx(s); +done: + if (nkev != 0) + error = copyout(&kq->kq_kev, ulistp, + sizeof(struct kevent) * nkev); + td->td_retval[0] = maxevents - count; + return (error); +} + +/* + * XXX + * This could be expanded to call kqueue_scan, if desired. + */ +/*ARGSUSED*/ +static int +kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct thread *td) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct thread *td) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct kqueue *kq; + int revents = 0; + int s = splnet(); + + kq = (struct kqueue *)fp->f_data; + if (events & (POLLIN | POLLRDNORM)) { + if (kq->kq_count) { + revents |= events & (POLLIN | POLLRDNORM); + } else { + selrecord(td, &kq->kq_sel); + kq->kq_state |= KQ_SEL; + } + } + splx(s); + return (revents); +} + +/*ARGSUSED*/ +static int +kqueue_stat(struct file *fp, struct stat *st, struct thread *td) +{ + struct kqueue *kq; + + kq = (struct kqueue *)fp->f_data; + bzero((void *)st, sizeof(*st)); + st->st_size = kq->kq_count; + st->st_blksize = sizeof(struct kevent); + st->st_mode = S_IFIFO; + return (0); +} + +/*ARGSUSED*/ +static int +kqueue_close(struct file *fp, struct thread *td) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + struct filedesc *fdp = td->td_proc->p_fd; + struct knote **knp, *kn, *kn0; + int i; + + FILEDESC_LOCK(fdp); + for (i = 0; i < fdp->fd_knlistsize; i++) { + knp = &SLIST_FIRST(&fdp->fd_knlist[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == kn->kn_kq) { + kn->kn_fop->f_detach(kn); + *knp = kn0; + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(kn->kn_fp, td); + knote_free(kn); + FILEDESC_LOCK(fdp); + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + if (fdp->fd_knhashmask != 0) { + for (i = 0; i < fdp->fd_knhashmask + 1; i++) { + knp = &SLIST_FIRST(&fdp->fd_knhash[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == 
kn->kn_kq) { + kn->kn_fop->f_detach(kn); + *knp = kn0; + /* XXX non-fd release of kn->kn_ptr */ + FILEDESC_UNLOCK(fdp); + knote_free(kn); + FILEDESC_LOCK(fdp); + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + } + FILEDESC_UNLOCK(fdp); + free(kq, M_KQUEUE); + fp->f_data = NULL; + + return (0); +} + +static void +kqueue_wakeup(struct kqueue *kq) +{ + + if (kq->kq_state & KQ_SLEEP) { + kq->kq_state &= ~KQ_SLEEP; + wakeup(kq); + } + if (kq->kq_state & KQ_SEL) { + kq->kq_state &= ~KQ_SEL; + selwakeup(&kq->kq_sel); + } + KNOTE(&kq->kq_sel.si_note, 0); +} + +/* + * walk down a list of knotes, activating them if their event has triggered. + */ +void +knote(struct klist *list, long hint) +{ + struct knote *kn; + + SLIST_FOREACH(kn, list, kn_selnext) + if (kn->kn_fop->f_event(kn, hint)) + KNOTE_ACTIVATE(kn); +} + +/* + * remove all knotes from a specified klist + */ +void +knote_remove(struct thread *td, struct klist *list) +{ + struct knote *kn; + + while ((kn = SLIST_FIRST(list)) != NULL) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + } +} + +/* + * remove all knotes referencing a specified fd + */ +void +knote_fdclose(struct thread *td, int fd) +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct klist *list; + + FILEDESC_LOCK(fdp); + list = &fdp->fd_knlist[fd]; + FILEDESC_UNLOCK(fdp); + knote_remove(td, list); +} + +static void +knote_attach(struct knote *kn, struct filedesc *fdp) +{ + struct klist *list, *oldlist; + int size, newsize; + + FILEDESC_LOCK(fdp); + + if (! kn->kn_fop->f_isfd) { + if (fdp->fd_knhashmask == 0) + fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, + &fdp->fd_knhashmask); + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + goto done; + } + + if (fdp->fd_knlistsize <= kn->kn_id) { +retry: + size = fdp->fd_knlistsize; + while (size <= kn->kn_id) + size += KQEXTENT; + FILEDESC_UNLOCK(fdp); + MALLOC(list, struct klist *, + size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + FILEDESC_LOCK(fdp); + newsize = fdp->fd_knlistsize; + while (newsize <= kn->kn_id) + newsize += KQEXTENT; + if (newsize != size) { + FILEDESC_UNLOCK(fdp); + free(list, M_TEMP); + FILEDESC_LOCK(fdp); + goto retry; + } + bcopy(fdp->fd_knlist, list, + fdp->fd_knlistsize * sizeof(struct klist *)); + bzero((caddr_t)list + + fdp->fd_knlistsize * sizeof(struct klist *), + (size - fdp->fd_knlistsize) * sizeof(struct klist *)); + if (fdp->fd_knlist != NULL) + oldlist = fdp->fd_knlist; + else + oldlist = NULL; + fdp->fd_knlistsize = size; + fdp->fd_knlist = list; + FILEDESC_UNLOCK(fdp); + if (oldlist != NULL) + FREE(oldlist, M_KQUEUE); + FILEDESC_LOCK(fdp); + } + list = &fdp->fd_knlist[kn->kn_id]; +done: + FILEDESC_UNLOCK(fdp); + SLIST_INSERT_HEAD(list, kn, kn_link); + kn->kn_status = 0; +} + +/* + * should be called at spl == 0, since we don't want to hold spl + * while calling fdrop and free. 
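+ * knote_dequeue() raises spl itself around the queue manipulation.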
+ */ +static void +knote_drop(struct knote *kn, struct thread *td) +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct klist *list; + + FILEDESC_LOCK(fdp); + if (kn->kn_fop->f_isfd) + list = &fdp->fd_knlist[kn->kn_id]; + else + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + if (kn->kn_fop->f_isfd) + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + + SLIST_REMOVE(list, kn, knote, kn_link); + if (kn->kn_status & KN_QUEUED) + knote_dequeue(kn); + if (kn->kn_fop->f_isfd) + fdrop_locked(kn->kn_fp, td); + knote_free(kn); +} + + +static void +knote_enqueue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); + + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + kn->kn_status |= KN_QUEUED; + kq->kq_count++; + splx(s); + kqueue_wakeup(kq); +} + +static void +knote_dequeue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); + + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); +} + +static void +knote_init(void) +{ + knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + +} +SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) + +static struct knote * +knote_alloc(void) +{ + return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK)); +} + +static void +knote_free(struct knote *kn) +{ + uma_zfree(knote_zone, kn); +} diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c new file mode 100644 index 0000000..bc773df --- /dev/null +++ b/sys/kern/kern_exec.c @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/namei.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+
+#include <machine/reg.h>
+
+MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+
+static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");
+
+/*
+ * callout list for things to do at exec time
+ */
+struct execlist {
+ execlist_fn function;
+ TAILQ_ENTRY(execlist) next;
+};
+
+TAILQ_HEAD(exec_list_head, execlist);
+static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
+
+static register_t *exec_copyout_strings(struct image_params *);
+
+/* XXX This should be vm_size_t. */
+static u_long ps_strings = PS_STRINGS;
+SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
+
+/* XXX This should be vm_size_t. */
+static u_long usrstack = USRSTACK;
+SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
+
+u_long ps_arg_cache_limit = PAGE_SIZE / 16;
+SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
+ &ps_arg_cache_limit, 0, "");
+
+int ps_argsopen = 1;
+SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
+
+#ifdef __ia64__
+/* XXX HACK */
+static int regstkpages = 256;
+SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
+#endif
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+/*
+ * execve() system call.
+ *
+ * MPSAFE
+ */
+int
+execve(td, uap)
+ struct thread *td;
+ register struct execve_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct nameidata nd, *ndp;
+ struct ucred *newcred = NULL, *oldcred;
+ struct uidinfo *euip;
+ register_t *stack_base;
+ int error, len, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+ int (*img_first)(struct image_params *);
+ struct pargs *oldargs = NULL, *newargs = NULL;
+ struct procsig *oldprocsig, *newprocsig;
+#ifdef KTRACE
+ struct vnode *tracevp = NULL;
+#endif
+ struct vnode *textvp = NULL;
+
+ imgp = &image_params;
+
+ /*
+ * Lock the process and set the P_INEXEC flag to indicate that
+ * it should be left alone until we're done here. This is
+ * necessary to avoid race conditions - e.g. in ptrace() -
+ * that might allow a local user to illicitly obtain elevated
+ * privileges.
+ */
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ KASSERT((p->p_flag & P_INEXEC) == 0,
+ ("%s(): process already has P_INEXEC flag", __func__));
+ p->p_flag |= P_INEXEC;
+ PROC_UNLOCK(p);
+
+/* XXXKSE */
+/* !!!!!!!! we need to abort all the other threads of this process before we */
+/* proceed beyond this point! 
*/ + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->argc = imgp->envc = 0; + imgp->argv0 = NULL; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + imgp->vp = NULL; + imgp->firstpage = NULL; + imgp->ps_strings = 0; + imgp->auxarg_size = 0; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + imgp->image_header = imgp->stringbase + ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, td); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + imgp->fname = uap->fname; + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(imgp->vp, 0, td); + goto exec_fail_dealloc; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(imgp->vp, 0, td); + if (error) + goto exec_fail_dealloc; + + /* + * If the current process has a special image activator it + * wants to try first, call it. For example, emulating shell + * scripts differently. + */ + error = -1; + if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) + error = img_first(imgp); + + /* + * Loop through the list of image activators, calling each one. + * An activator returns -1 if there is no match, 0 on success, + * and an error otherwise. + */ + for (i = 0; error == -1 && execsw[i]; ++i) { + if (execsw[i]->ex_imgact == NULL || + execsw[i]->ex_imgact == img_first) { + continue; + } + error = (*execsw[i]->ex_imgact)(imgp); + } + + if (error) { + if (error == -1) + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Special interpreter operation, cleanup and loop up to try to + * activate the interpreter. + */ + if (imgp->interpreted) { + exec_unmap_first_page(imgp); + /* free name buffer and old vnode */ + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(ndp->ni_vp); + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, td); + goto interpret; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. + * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* + * For security and other reasons, the file descriptor table cannot + * be shared after an exec. + */ + FILEDESC_LOCK(p->p_fd); + if (p->p_fd->fd_refcnt > 1) { + struct filedesc *tmp; + + tmp = fdcopy(td); + FILEDESC_UNLOCK(p->p_fd); + fdfree(td); + p->p_fd = tmp; + } else + FILEDESC_UNLOCK(p->p_fd); + + /* + * Malloc things before we need locks. 
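+ * crget(), uifind() and pargs_alloc() may all sleep, so do them
+ * before the process lock is taken again below.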
+ */ + newcred = crget(); + euip = uifind(attr.va_uid); + i = imgp->endargs - imgp->stringbase; + if (ps_arg_cache_limit >= i + sizeof(struct pargs)) + newargs = pargs_alloc(i); + + /* close files on exec */ + fdcloseexec(td); + + /* + * For security and other reasons, signal handlers cannot + * be shared after an exec. The new process gets a copy of the old + * handlers. In execsigs(), the new process will have its signals + * reset. + */ + PROC_LOCK(p); + mp_fixme("procsig needs a lock"); + if (p->p_procsig->ps_refcnt > 1) { + oldprocsig = p->p_procsig; + PROC_UNLOCK(p); + MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + bcopy(oldprocsig, newprocsig, sizeof(*newprocsig)); + newprocsig->ps_refcnt = 1; + oldprocsig->ps_refcnt--; + PROC_LOCK(p); + p->p_procsig = newprocsig; + if (p->p_sigacts == &p->p_uarea->u_sigacts) + panic("shared procsig but private sigacts?"); + + p->p_uarea->u_sigacts = *p->p_sigacts; + p->p_sigacts = &p->p_uarea->u_sigacts; + } + /* Stop profiling */ + stopprofclock(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has its own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. + * + * Don't honor setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + oldcred = p->p_ucred; + if ((((attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid) || + ((attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid)) && + (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. Record any set-id flags first to make sure that + * we do not regain any tracing during a possible block. + */ + setsugid(p); +#ifdef KTRACE + if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) { + mtx_lock(&ktrace_mtx); + p->p_traceflag = 0; + tracevp = p->p_tracep; + p->p_tracep = NULL; + mtx_unlock(&ktrace_mtx); + } +#endif + /* Make sure file descriptors 0..2 are in use. */ + error = fdcheckstd(td); + if (error != 0) { + oldcred = NULL; + goto done1; + } + /* + * Set the new credentials. + */ + crcopy(newcred, oldcred); + if (attr.va_mode & VSUID) + change_euid(newcred, euip); + if (attr.va_mode & VSGID) + change_egid(newcred, attr.va_gid); + setugidsafety(td); + /* + * Implement correct POSIX saved-id behavior. + */ + change_svuid(newcred, newcred->cr_uid); + change_svgid(newcred, newcred->cr_gid); + p->p_ucred = newcred; + newcred = NULL; + } else { + if (oldcred->cr_uid == oldcred->cr_ruid && + oldcred->cr_gid == oldcred->cr_rgid) + p->p_flag &= ~P_SUGID; + /* + * Implement correct POSIX saved-id behavior. + * + * XXX: It's not clear that the existing behavior is + * POSIX-compliant. A number of sources indicate that the + * saved uid/gid should only be updated if the new ruid is + * not equal to the old ruid, or the new euid is not equal + * to the old euid and the new euid is not equal to the old + * ruid. The FreeBSD code always updates the saved uid/gid. + * Also, this code uses the new (replaced) euid and egid as + * the source, which may or may not be the right ones to use. 
+ */ + if (oldcred->cr_svuid != oldcred->cr_uid || + oldcred->cr_svgid != oldcred->cr_gid) { + crcopy(newcred, oldcred); + change_svuid(newcred, newcred->cr_uid); + change_svgid(newcred, newcred->cr_gid); + p->p_ucred = newcred; + newcred = NULL; + } + } + + /* + * Store the vp for use in procfs + */ + textvp = p->p_textvp; + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * Notify others that we exec'd, and clear the P_INEXEC flag + * as we're now a bona fide freshly-execed process. + */ + KNOTE(&p->p_klist, NOTE_EXEC); + p->p_flag &= ~P_INEXEC; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. + */ + _STOPEVENT(p, S_EXEC, 0); + + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Free any previous argument cache */ + oldargs = p->p_args; + p->p_args = NULL; + + /* Set values passed into the program in registers. */ + setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, + imgp->ps_strings); + + /* Cache arguments if they fit inside our allowance */ + if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { + bcopy(imgp->stringbase, newargs->ar_args, i); + p->p_args = newargs; + newargs = NULL; + } +done1: + PROC_UNLOCK(p); + + /* + * Free any resources malloc'd earlier that we didn't use. + */ + uifree(euip); + if (newcred == NULL) + crfree(oldcred); + else + crfree(newcred); + /* + * Handle deferred decrement of ref counts. + */ + if (textvp != NULL) + vrele(textvp); +#ifdef KTRACE + if (tracevp != NULL) + vrele(tracevp); +#endif + if (oldargs != NULL) + pargs_drop(oldargs); + if (newargs != NULL) + pargs_drop(newargs); + +exec_fail_dealloc: + + /* + * free various allocated resources + */ + if (imgp->firstpage) + exec_unmap_first_page(imgp); + + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + + if (imgp->vp) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(imgp->vp); + } + + if (error == 0) + goto done2; + +exec_fail: + /* we're done here, clear P_INEXEC */ + PROC_LOCK(p); + p->p_flag &= ~P_INEXEC; + PROC_UNLOCK(p); + + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. 
exit gracefully */ + exit1(td, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + error = 0; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +int +exec_map_first_page(imgp) + struct image_params *imgp; +{ + int rv, i; + int initial_pagein; + vm_page_t ma[VM_INITIAL_PAGEIN]; + vm_object_t object; + + GIANT_REQUIRED; + + if (imgp->firstpage) { + exec_unmap_first_page(imgp); + } + + VOP_GETVOBJECT(imgp->vp, &object); + + ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + + if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + initial_pagein = VM_INITIAL_PAGEIN; + if (initial_pagein > object->size) + initial_pagein = object->size; + for (i = 1; i < initial_pagein; i++) { + if ((ma[i] = vm_page_lookup(object, i)) != NULL) { + if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) + break; + if (ma[i]->valid) + break; + vm_page_busy(ma[i]); + } else { + ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); + if (ma[i] == NULL) + break; + } + } + initial_pagein = i; + + rv = vm_pager_get_pages(object, ma, initial_pagein, 0); + ma[0] = vm_page_lookup(object, 0); + + if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { + if (ma[0]) { + vm_page_protect(ma[0], VM_PROT_NONE); + vm_page_free(ma[0]); + } + return EIO; + } + } + + vm_page_wire(ma[0]); + vm_page_wakeup(ma[0]); + + pmap_qenter((vm_offset_t)imgp->image_header, ma, 1); + imgp->firstpage = ma[0]; + + return 0; +} + +void +exec_unmap_first_page(imgp) + struct image_params *imgp; +{ + GIANT_REQUIRED; + + if (imgp->firstpage) { + pmap_qremove((vm_offset_t)imgp->image_header, 1); + vm_page_unwire(imgp->firstpage, 1); + imgp->firstpage = NULL; + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. + */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct execlist *ep; + struct proc *p = imgp->proc; + struct vmspace *vmspace = p->p_vmspace; + vm_offset_t stack_addr = USRSTACK - maxssiz; + + GIANT_REQUIRED; + + imgp->vmspace_destroyed = 1; + + /* + * Perform functions registered with at_exec(). + */ + TAILQ_FOREACH(ep, &exec_list, next) + (*ep->function)(p); + + /* + * Blow away entire process VM, if address space not shared, + * otherwise, create a new VM space so that other threads are + * not disrupted + */ + if (vmspace->vm_refcnt == 1) { + if (vmspace->vm_shm) + shmexit(p); + pmap_remove_pages(vmspace_pmap(vmspace), 0, VM_MAXUSER_ADDRESS); + vm_map_remove(&vmspace->vm_map, 0, VM_MAXUSER_ADDRESS); + } else { + vmspace_exec(p); + vmspace = p->p_vmspace; + } + + /* Allocate a new stack */ + error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + +#ifdef __ia64__ + { + /* + * Allocate backing store. We really need something + * similar to vm_map_stack which can allow the backing + * store to grow upwards. This will do for now. + */ + vm_offset_t bsaddr; + bsaddr = USRSTACK - 2*maxssiz; + error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr, + regstkpages * PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr; + } +#endif + + /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the + * VM_STACK case, but they are still used to monitor the size of the + * process stack so we can check the stack rlimit. 
+ */ + vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; + vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz; + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. + */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error; + size_t length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + argp = (caddr_t) (intptr_t) fuword(argv); + if (argp == (caddr_t) -1) + return (EFAULT); + if (argp) + argv++; + if (imgp->argv0) + argp = imgp->argv0; + if (argp) { + do { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } while ((argp = (caddr_t) (intptr_t) fuword(argv++))); + } + } + + imgp->endargs = imgp->stringp; + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +register_t * +exec_copyout_strings(imgp) + struct image_params *imgp; +{ + int argc, envc; + char **vectp; + char *stringp, *destp; + register_t *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = (struct ps_strings *)PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) { + /* + * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for + * lower compatibility. + */ + imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size + : (AT_COUNT * 2); + /* + * The '+ 2' is for the null pointers at the end of each of + * the arg and env vector sets,and imgp->auxarg_size is room + * for argument of Runtime loader. + */ + vectp = (char **) (destp - (imgp->argc + imgp->envc + 2 + + imgp->auxarg_size) * sizeof(char *)); + + } else + /* + * The '+ 2' is for the null pointers at the end of each of + * the arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char *)); + + /* + * vectp also becomes our initial stack base + */ + stack_base = (register_t *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. 
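+ * ps_strings records the addresses and counts of the argv and env
+ * vectors written onto the new user stack below.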
+ */ + suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. + */ + for (; argc > 0; --argc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer separates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. + */ + for (; envc > 0; --envc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Called with imgp->vp locked. + * Return 0 for success or error code on failure. + */ +int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + struct thread *td; + int error; + + td = curthread; /* XXXKSE */ + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, td->td_ucred, td); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) + return (EACCES); + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + */ + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + return (error); + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) + return (ETXTBSY); + + /* + * Call filesystem specific open routine (which does nothing in the + * general case). 
+ */ + error = VOP_OPEN(vp, FREAD, td->td_ucred, td); + return (error); +} + +/* + * Exec handler registration + */ +int +exec_register(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 2; /* New slot and trailing NULL */ + + if (execsw) + for (es = execsw; *es; es++) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + if (execsw) + for (es = execsw; *es; es++) + *xs++ = *es; + *xs++ = execsw_arg; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +exec_unregister(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 1; + + if (execsw == NULL) + panic("unregister with no handlers left?\n"); + + for (es = execsw; *es; es++) { + if (*es == execsw_arg) + break; + } + if (*es == NULL) + return ENOENT; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + *xs++ = *es; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +at_exec(function) + execlist_fn function; +{ + struct execlist *ep; + +#ifdef INVARIANTS + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exec(function)) + printf("WARNING: exec callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + TAILQ_INSERT_TAIL(&exec_list, ep, next); + return (0); +} + +/* + * Scan the exec callout list for the given item and remove it. + * Returns the number of items removed (0 or 1) + */ +int +rm_at_exec(function) + execlist_fn function; +{ + struct execlist *ep; + + TAILQ_FOREACH(ep, &exec_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&exec_list, ep, next); + free(ep, M_ATEXEC); + return(1); + } + } + return (0); +} + diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c new file mode 100644 index 0000000..fab9437 --- /dev/null +++ b/sys/kern/kern_exit.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/tty.h> +#include <sys/wait.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <sys/jail.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <sys/user.h> + +/* Required to be non-static for SysVR4 emulator */ +MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); + +static MALLOC_DEFINE(M_ATEXIT, "atexit", "atexit callback"); + +static int wait1(struct thread *, struct wait_args *, int); + +/* + * callout list for things to do at exit time + */ +struct exitlist { + exitlist_fn function; + TAILQ_ENTRY(exitlist) next; +}; + +TAILQ_HEAD(exit_list_head, exitlist); +static struct exit_list_head exit_list = TAILQ_HEAD_INITIALIZER(exit_list); + +/* + * exit -- + * Death of process. + * + * MPSAFE + */ +void +sys_exit(td, uap) + struct thread *td; + struct sys_exit_args /* { + int rval; + } */ *uap; +{ + + mtx_lock(&Giant); + exit1(td, W_EXITCODE(uap->rval, 0)); + /* NOTREACHED */ +} + +/* + * Exit: deallocate address space and other resources, change proc state + * to zombie, and unlink proc from allproc and parent's lists. Save exit + * status and rusage for wait(). Check for child processes and orphan them. + */ +void +exit1(td, rv) + register struct thread *td; + int rv; +{ + struct exitlist *ep; + struct proc *p, *nq, *q; + struct tty *tp; + struct vnode *ttyvp; + register struct vmspace *vm; + struct vnode *vtmp; +#ifdef KTRACE + struct vnode *tracevp; +#endif + + GIANT_REQUIRED; + + p = td->td_proc; + if (p == initproc) { + printf("init died (signal %d, exit %d)\n", + WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } + + /* + * XXXXKSE: MUST abort all other threads before proceeding past here. + */ + + /* Are we a task leader? 
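If so, send SIGKILL to every peer and wait for them all to exit.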
*/ + PROC_LOCK(p); + if (p == p->p_leader) { + q = p->p_peers; + while (q != NULL) { + PROC_LOCK(q); + psignal(q, SIGKILL); + PROC_UNLOCK(q); + q = q->p_peers; + } + while (p->p_peers) + msleep((caddr_t)p, &p->p_mtx, PWAIT, "exit1", 0); + } + PROC_UNLOCK(p); + +#ifdef PGINPROF + vmsizmon(); +#endif + STOPEVENT(p, S_EXIT, rv); + wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ + + /* + * Check if any loadable modules need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + TAILQ_FOREACH(ep, &exit_list, next) + (*ep->function)(p); + + stopprofclock(p); + + MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), + M_ZOMBIE, M_WAITOK); + /* + * If parent is waiting for us to exit or exec, + * P_PPWAIT is set; we will wakeup the parent below. + */ + PROC_LOCK(p); + p->p_flag &= ~(P_TRACED | P_PPWAIT); + p->p_flag |= P_WEXIT; + SIGEMPTYSET(p->p_siglist); + PROC_UNLOCK(p); + if (timevalisset(&p->p_realtimer.it_value)) + callout_stop(&p->p_itcallout); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pid. + */ + funsetownlst(&p->p_sigiolst); + + /* + * Close open files and release open-file table. + * This may block! + */ + fdfree(td); /* XXXKSE *//* may not be the one in proc */ + + /* + * Remove ourself from our leader's peer list and wake our leader. + */ + PROC_LOCK(p->p_leader); + if (p->p_leader->p_peers) { + q = p->p_leader; + while (q->p_peers != p) + q = q->p_peers; + q->p_peers = p->p_peers; + wakeup((caddr_t)p->p_leader); + } + PROC_UNLOCK(p->p_leader); + + /* The next two chunks should probably be moved to vmspace_exit. */ + vm = p->p_vmspace; + /* + * Release user portion of address space. + * This releases references to vnodes, + * which could cause I/O if the file has been unlinked. + * Need to do this early enough that we can still sleep. + * Can't free the entire vmspace as the kernel stack + * may be mapped within that space also. + */ + if (--vm->vm_refcnt == 0) { + if (vm->vm_shm) + shmexit(p); + pmap_remove_pages(vmspace_pmap(vm), VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + vm->vm_freer = p; + } + + sx_xlock(&proctree_lock); + if (SESS_LEADER(p)) { + register struct session *sp; + + sp = p->p_session; + if (sp->s_ttyvp) { + /* + * Controlling process. + * Signal foreground pgrp, + * drain controlling terminal + * and revoke access to controlling terminal. + */ + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { + tp = sp->s_ttyp; + if (sp->s_ttyp->t_pgrp) { + PGRP_LOCK(sp->s_ttyp->t_pgrp); + pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); + PGRP_UNLOCK(sp->s_ttyp->t_pgrp); + } + /* XXX tp should be locked. */ + sx_xunlock(&proctree_lock); + (void) ttywait(tp); + sx_xlock(&proctree_lock); + /* + * The tty could have been revoked + * if we blocked. + */ + if (sp->s_ttyvp) { + ttyvp = sp->s_ttyvp; + SESS_LOCK(p->p_session); + sp->s_ttyvp = NULL; + SESS_UNLOCK(p->p_session); + sx_xunlock(&proctree_lock); + VOP_REVOKE(ttyvp, REVOKEALL); + vrele(ttyvp); + sx_xlock(&proctree_lock); + } + } + if (sp->s_ttyvp) { + ttyvp = sp->s_ttyvp; + SESS_LOCK(p->p_session); + sp->s_ttyvp = NULL; + SESS_UNLOCK(p->p_session); + vrele(ttyvp); + } + /* + * s_ttyp is not zero'd; we use this to indicate + * that the session once had a controlling terminal. 
+ * (for logging and informational purposes) + */ + } + SESS_LOCK(p->p_session); + sp->s_leader = NULL; + SESS_UNLOCK(p->p_session); + } + fixjobc(p, p->p_pgrp, 0); + sx_xunlock(&proctree_lock); + (void)acct_process(td); +#ifdef KTRACE + /* + * release trace file + */ + PROC_LOCK(p); + mtx_lock(&ktrace_mtx); + p->p_traceflag = 0; /* don't trace the vrele() */ + tracevp = p->p_tracep; + p->p_tracep = NULL; + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + if (tracevp != NULL) + vrele(tracevp); +#endif + /* + * Release reference to text vnode + */ + if ((vtmp = p->p_textvp) != NULL) { + p->p_textvp = NULL; + vrele(vtmp); + } + + /* + * Release our limits structure. + */ + mtx_assert(&Giant, MA_OWNED); + if (--p->p_limit->p_refcnt == 0) { + FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } + + /* + * Release this thread's reference to the ucred. The actual proc + * reference will stay around until the proc is harvested by + * wait(). At this point the ucred is immutable (no other threads + * from this proc are around that can change it) so we leave the + * per-thread ucred pointer intact in case it is needed although + * in theory nothing should be using it at this point. + */ + crfree(td->td_ucred); + + /* + * Remove proc from allproc queue and pidhash chain. + * Place onto zombproc. Unlink from parent's child list. + */ + sx_xlock(&allproc_lock); + LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); + LIST_REMOVE(p, p_hash); + sx_xunlock(&allproc_lock); + + sx_xlock(&proctree_lock); + q = LIST_FIRST(&p->p_children); + if (q != NULL) /* only need this if any child is S_ZOMB */ + wakeup((caddr_t) initproc); + for (; q != NULL; q = nq) { + nq = LIST_NEXT(q, p_sibling); + PROC_LOCK(q); + proc_reparent(q, initproc); + q->p_sigparent = SIGCHLD; + /* + * Traced processes are killed + * since their existence means someone is screwing up. + */ + if (q->p_flag & P_TRACED) { + q->p_flag &= ~P_TRACED; + psignal(q, SIGKILL); + } + PROC_UNLOCK(q); + } + + /* + * Save exit status and final rusage info, adding in child rusage + * info and self times. + */ + PROC_LOCK(p); + p->p_xstat = rv; + *p->p_ru = p->p_stats->p_ru; + mtx_lock_spin(&sched_lock); + calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); + mtx_unlock_spin(&sched_lock); + ruadd(p->p_ru, &p->p_stats->p_cru); + + /* + * Notify interested parties of our demise. + */ + KNOTE(&p->p_klist, NOTE_EXIT); + + /* + * Notify parent that we're gone. If parent has the PS_NOCLDWAIT + * flag set, or if the handler is set to SIG_IGN, notify process + * 1 instead (and hope it will handle this situation). + */ + PROC_LOCK(p->p_pptr); + if (p->p_pptr->p_procsig->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { + struct proc *pp; + + pp = p->p_pptr; + PROC_UNLOCK(pp); + proc_reparent(p, initproc); + PROC_LOCK(p->p_pptr); + /* + * If this was the last child of our parent, notify + * parent, so in case he was wait(2)ing, he will + * continue. + */ + if (LIST_EMPTY(&pp->p_children)) + wakeup((caddr_t)pp); + } + + if (p->p_sigparent && p->p_pptr != initproc) + psignal(p->p_pptr, p->p_sigparent); + else + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + + /* + * If this is a kthread, then wakeup anyone waiting for it to exit. + */ + if (p->p_flag & P_KTHREAD) + wakeup((caddr_t)p); + PROC_UNLOCK(p); + + /* + * Finally, call machine-dependent code to release the remaining + * resources including address space, the kernel stack and pcb. + * The address space is released by "vmspace_exitfree(p)" in + * vm_waitproc(). 
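+ * cpu_throw() at the end of this function switches away for the
+ * last time and does not return.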
+ */ + cpu_exit(td); + + PROC_LOCK(p); + PROC_LOCK(p->p_pptr); + sx_xunlock(&proctree_lock); + mtx_lock_spin(&sched_lock); + while (mtx_owned(&Giant)) + mtx_unlock(&Giant); + + /* + * We have to wait until after releasing all locks before + * changing p_stat. If we block on a mutex then we will be + * back at SRUN when we resume and our parent will never + * harvest us. + */ + p->p_stat = SZOMB; + + wakeup(p->p_pptr); + PROC_UNLOCK(p->p_pptr); + PROC_UNLOCK(p); + + cnt.v_swtch++; + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + cpu_sched_exit(td); + cpu_throw(); + panic("exit1"); +} + +#ifdef COMPAT_43 +/* + * MPSAFE. The dirty work is handled by wait1(). + */ +int +owait(td, uap) + struct thread *td; + register struct owait_args /* { + int dummy; + } */ *uap; +{ + struct wait_args w; + + w.options = 0; + w.rusage = NULL; + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(td, &w, 1)); +} +#endif /* COMPAT_43 */ + +/* + * MPSAFE. The dirty work is handled by wait1(). + */ +int +wait4(td, uap) + struct thread *td; + struct wait_args *uap; +{ + + return (wait1(td, uap, 0)); +} + +/* + * MPSAFE + */ +static int +wait1(td, uap, compat) + register struct thread *td; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; + int compat; +{ + struct rusage ru; + register int nfound; + register struct proc *p, *q, *t; + int status, error; + + q = td->td_proc; + if (uap->pid == 0) { + PROC_LOCK(q); + uap->pid = -q->p_pgid; + PROC_UNLOCK(q); + } + if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) + return (EINVAL); + mtx_lock(&Giant); +loop: + nfound = 0; + sx_xlock(&proctree_lock); + LIST_FOREACH(p, &q->p_children, p_sibling) { + PROC_LOCK(p); + if (uap->pid != WAIT_ANY && + p->p_pid != uap->pid && p->p_pgid != -uap->pid) { + PROC_UNLOCK(p); + continue; + } + + /* + * This special case handles a kthread spawned by linux_clone + * (see linux_misc.c). The linux_wait4 and linux_waitpid + * functions need to be able to distinguish between waiting + * on a process and waiting on a thread. It is a thread if + * p_sigparent is not SIGCHLD, and the WLINUXCLONE option + * signifies we want to wait for threads and not processes. + */ + if ((p->p_sigparent != SIGCHLD) ^ + ((uap->options & WLINUXCLONE) != 0)) { + PROC_UNLOCK(p); + continue; + } + + nfound++; + if (p->p_stat == SZOMB) { + /* + * charge childs scheduling cpu usage to parent + * XXXKSE assume only one thread & kse & ksegrp + * keep estcpu in each ksegrp + * so charge it to the ksegrp that did the wait + * since process estcpu is sum of all ksegrps, + * this is strictly as expected. + * Assume that the child process aggregated all + * tke estcpu into the 'build-in' ksegrp. 
+ * XXXKSE + */ + if (curthread->td_proc->p_pid != 1) { + mtx_lock_spin(&sched_lock); + curthread->td_ksegrp->kg_estcpu = + ESTCPULIM(curthread->td_ksegrp->kg_estcpu + + p->p_ksegrp.kg_estcpu); + mtx_unlock_spin(&sched_lock); + } + + td->td_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) + td->td_retval[1] = p->p_xstat; + else +#endif + if (uap->status) { + status = p->p_xstat; /* convert to int */ + PROC_UNLOCK(p); + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (error); + } + PROC_LOCK(p); + } + if (uap->rusage) { + bcopy(p->p_ru, &ru, sizeof(ru)); + PROC_UNLOCK(p); + if ((error = copyout((caddr_t)&ru, + (caddr_t)uap->rusage, + sizeof (struct rusage)))) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (error); + } + } else + PROC_UNLOCK(p); + /* + * If we got the child via a ptrace 'attach', + * we need to give it back to the old parent. + */ + if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { + PROC_LOCK(p); + p->p_oppid = 0; + proc_reparent(p, t); + PROC_UNLOCK(p); + psignal(t, SIGCHLD); + wakeup((caddr_t)t); + PROC_UNLOCK(t); + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (0); + } + /* + * Remove other references to this process to ensure + * we have an exclusive reference. + */ + leavepgrp(p); + + sx_xlock(&allproc_lock); + LIST_REMOVE(p, p_list); /* off zombproc */ + sx_xunlock(&allproc_lock); + + LIST_REMOVE(p, p_sibling); + sx_xunlock(&proctree_lock); + + /* + * As a side effect of this lock, we know that + * all other writes to this proc are visible now, so + * no more locking is needed for p. + */ + PROC_LOCK(p); + p->p_xstat = 0; /* XXX: why? */ + PROC_UNLOCK(p); + PROC_LOCK(q); + ruadd(&q->p_stats->p_cru, p->p_ru); + PROC_UNLOCK(q); + FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; + + /* + * Decrement the count of procs running with this uid. + */ + (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); + + /* + * Free up credentials. + */ + crfree(p->p_ucred); + p->p_ucred = NULL; /* XXX: why? */ + + /* + * Remove unused arguments + */ + pargs_drop(p->p_args); + p->p_args = NULL; + + if (--p->p_procsig->ps_refcnt == 0) { + if (p->p_sigacts != &p->p_uarea->u_sigacts) + FREE(p->p_sigacts, M_SUBPROC); + FREE(p->p_procsig, M_SUBPROC); + p->p_procsig = NULL; + } + + /* + * Give vm and machine-dependent layer a chance + * to free anything that cpu_exit couldn't + * release while still running in process context. 
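+ * After that the proc structure itself is returned to proc_zone.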
+ */ + vm_waitproc(p); + mtx_destroy(&p->p_mtx); + uma_zfree(proc_zone, p); + sx_xlock(&allproc_lock); + nprocs--; + sx_xunlock(&allproc_lock); + mtx_unlock(&Giant); + return (0); + } + if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { + p->p_flag |= P_WAITED; + sx_xunlock(&proctree_lock); + td->td_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) { + td->td_retval[1] = W_STOPCODE(p->p_xstat); + PROC_UNLOCK(p); + error = 0; + } else +#endif + if (uap->status) { + status = W_STOPCODE(p->p_xstat); + PROC_UNLOCK(p); + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else { + PROC_UNLOCK(p); + error = 0; + } + mtx_unlock(&Giant); + return (error); + } + if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) { + sx_xunlock(&proctree_lock); + td->td_retval[0] = p->p_pid; + p->p_flag &= ~P_CONTINUED; + PROC_UNLOCK(p); + + if (uap->status) { + status = SIGCONT; + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else + error = 0; + + mtx_unlock(&Giant); + return (error); + } + PROC_UNLOCK(p); + } + if (nfound == 0) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (ECHILD); + } + if (uap->options & WNOHANG) { + sx_xunlock(&proctree_lock); + td->td_retval[0] = 0; + mtx_unlock(&Giant); + return (0); + } + PROC_LOCK(q); + sx_xunlock(&proctree_lock); + error = msleep((caddr_t)q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); + PROC_UNLOCK(q); + if (error) { + mtx_unlock(&Giant); + return (error); + } + goto loop; +} + +/* + * Make process 'parent' the new parent of process 'child'. + * Must be called with an exclusive hold of proctree lock. + */ +void +proc_reparent(child, parent) + register struct proc *child; + register struct proc *parent; +{ + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(child, MA_OWNED); + if (child->p_pptr == parent) + return; + + LIST_REMOVE(child, p_sibling); + LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); + child->p_pptr = parent; +} + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. + */ + +int +at_exit(function) + exitlist_fn function; +{ + struct exitlist *ep; + +#ifdef INVARIANTS + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("WARNING: exit callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATEXIT, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + TAILQ_INSERT_TAIL(&exit_list, ep, next); + return (0); +} + +/* + * Scan the exit callout list for the given item and remove it. + * Returns the number of items removed (0 or 1) + */ +int +rm_at_exit(function) + exitlist_fn function; +{ + struct exitlist *ep; + + TAILQ_FOREACH(ep, &exit_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&exit_list, ep, next); + free(ep, M_ATEXIT); + return (1); + } + } + return (0); +} diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c new file mode 100644 index 0000000..016653b --- /dev/null +++ b/sys/kern/kern_fork.c @@ -0,0 +1,866 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/acct.h> +#include <sys/ktr.h> +#include <sys/ktrace.h> +#include <sys/kthread.h> +#include <sys/unistd.h> +#include <sys/jail.h> +#include <sys/sx.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +#include <sys/vmmeter.h> +#include <sys/user.h> +#include <machine/critical.h> + +static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +struct forklist { + forklist_fn function; + TAILQ_ENTRY(forklist) next; +}; + +static struct sx fork_list_lock; + +TAILQ_HEAD(forklist_head, forklist); +static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list); + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif + +int forksleep; /* Place for fork1() to sleep on. 
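Only its address matters; it is used as a tsleep() wait channel.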
*/ + +static void +init_fork_list(void *data __unused) +{ + + sx_init(&fork_list_lock, "fork list"); +} +SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL); + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fork(td, uap) + struct thread *td; + struct fork_args *uap; +{ + int error; + struct proc *p2; + + mtx_lock(&Giant); + error = fork1(td, RFFDG | RFPROC, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +vfork(td, uap) + struct thread *td; + struct vfork_args *uap; +{ + int error; + struct proc *p2; + + mtx_lock(&Giant); + error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + +/* + * MPSAFE + */ +int +rfork(td, uap) + struct thread *td; + struct rfork_args *uap; +{ + int error; + struct proc *p2; + + /* Don't allow kernel only flags. */ + if ((uap->flags & RFKERNELONLY) != 0) + return (EINVAL); + mtx_lock(&Giant); + error = fork1(td, uap->flags, &p2); + if (error == 0) { + td->td_retval[0] = p2 ? p2->p_pid : 0; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + + +int nprocs = 1; /* process 0 */ +int lastpid = 0; +SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, + "Last used PID"); + +/* + * Random component to lastpid generation. We mix in a random factor to make + * it a little harder to predict. We sanity check the modulus value to avoid + * doing it in critical paths. Don't let it be too small or we pointlessly + * waste randomness entropy, and don't let it be impossibly large. Using a + * modulus that is too big causes a LOT more process table scans and slows + * down fork processing as the pidchecked caching is defeated. + */ +static int randompid = 0; + +static int +sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) +{ + int error, pid; + + sx_xlock(&allproc_lock); + pid = randompid; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error == 0 && req->newptr != NULL) { + if (pid < 0 || pid > PID_MAX - 100) /* out of range */ + pid = PID_MAX - 100; + else if (pid < 2) /* NOP */ + pid = 0; + else if (pid < 100) /* Make it reasonable */ + pid = 100; + randompid = pid; + } + sx_xunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); + +#if 0 +void +kse_init(struct kse *kse1, struct kse *kse2) +{ +} + +void +thread_init(struct thread *thread1, struct thread *thread2) +{ +} + +void +ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) +{ +} +#endif + +int +fork1(td, flags, procp) + struct thread *td; /* parent proc */ + int flags; + struct proc **procp; /* child proc */ +{ + struct proc *p2, *pptr; + uid_t uid; + struct proc *newproc; + int trypid; + int ok; + static int pidchecked = 0; + struct forklist *ep; + struct filedesc *fd; + struct proc *p1 = td->td_proc; + struct thread *td2; + struct kse *ke2; + struct ksegrp *kg2; + struct sigacts *newsigacts; + struct procsig *newprocsig; + + GIANT_REQUIRED; + + /* Can't copy and clear */ + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); + + /* + * Here we don't create a new process, but we divorce + * certain parts of a process from itself. + */ + if ((flags & RFPROC) == 0) { + vm_forkproc(td, NULL, NULL, flags); + + /* + * Close all file descriptors. 
+ */ + if (flags & RFCFDG) { + struct filedesc *fdtmp; + fdtmp = fdinit(td); /* XXXKSE */ + PROC_LOCK(p1); + fdfree(td); /* XXXKSE */ + p1->p_fd = fdtmp; + PROC_UNLOCK(p1); + } + + /* + * Unshare file descriptors (from parent.) + */ + if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + if (p1->p_fd->fd_refcnt > 1) { + struct filedesc *newfd; + + newfd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); + PROC_LOCK(p1); + fdfree(td); + p1->p_fd = newfd; + PROC_UNLOCK(p1); + } else + FILEDESC_UNLOCK(p1->p_fd); + } + *procp = NULL; + return (0); + } + + /* Allocate new proc. */ + newproc = uma_zalloc(proc_zone, M_WAITOK); + + /* + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last process; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + sx_xlock(&allproc_lock); + uid = td->td_ucred->cr_ruid; + if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { + sx_xunlock(&allproc_lock); + uma_zfree(proc_zone, newproc); + tsleep(&forksleep, PUSER, "fork", hz / 2); + return (EAGAIN); + } + /* + * Increment the count of procs running with this uid. Don't allow + * a nonprivileged user to exceed their current limit. + */ + PROC_LOCK(p1); + ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, + (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); + PROC_UNLOCK(p1); + if (!ok) { + sx_xunlock(&allproc_lock); + uma_zfree(proc_zone, newproc); + tsleep(&forksleep, PUSER, "fork", hz / 2); + return (EAGAIN); + } + + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; + + /* + * Find an unused process ID. We remember a range of unused IDs + * ready to use (from lastpid+1 through pidchecked-1). + * + * If RFHIGHPID is set (used during system boot), do not allocate + * low-numbered pids. + */ + trypid = lastpid + 1; + if (flags & RFHIGHPID) { + if (trypid < 10) { + trypid = 10; + } + } else { + if (randompid) + trypid += arc4random() % randompid; + } +retry: + /* + * If the process ID prototype has wrapped around, + * restart somewhat above 0, as the low-numbered procs + * tend to include daemons that don't exit. + */ + if (trypid >= PID_MAX) { + trypid = trypid % PID_MAX; + if (trypid < 100) + trypid += 100; + pidchecked = 0; + } + if (trypid >= pidchecked) { + int doingzomb = 0; + + pidchecked = PID_MAX; + /* + * Scan the active and zombie procs to check whether this pid + * is in use. Remember the lowest pid that's greater + * than trypid, so we can avoid checking for a while. + */ + p2 = LIST_FIRST(&allproc); +again: + for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { + PROC_LOCK(p2); + while (p2->p_pid == trypid || + p2->p_pgrp->pg_id == trypid || + p2->p_session->s_sid == trypid) { + trypid++; + if (trypid >= pidchecked) { + PROC_UNLOCK(p2); + goto retry; + } + } + if (p2->p_pid > trypid && pidchecked > p2->p_pid) + pidchecked = p2->p_pid; + if (p2->p_pgrp->pg_id > trypid && + pidchecked > p2->p_pgrp->pg_id) + pidchecked = p2->p_pgrp->pg_id; + if (p2->p_session->s_sid > trypid && + pidchecked > p2->p_session->s_sid) + pidchecked = p2->p_session->s_sid; + PROC_UNLOCK(p2); + } + if (!doingzomb) { + doingzomb = 1; + p2 = LIST_FIRST(&zombproc); + goto again; + } + } + + /* + * RFHIGHPID does not mess with the lastpid counter during boot. 
+ */ + if (flags & RFHIGHPID) + pidchecked = 0; + else + lastpid = trypid; + + p2 = newproc; + p2->p_stat = SIDL; /* protect against others */ + p2->p_pid = trypid; + LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + sx_xunlock(&allproc_lock); + + /* + * Malloc things while we don't hold any locks. + */ + if (flags & RFSIGSHARE) { + MALLOC(newsigacts, struct sigacts *, + sizeof(struct sigacts), M_SUBPROC, M_WAITOK); + newprocsig = NULL; + } else { + newsigacts = NULL; + MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + } + + /* + * Copy filedesc. + * XXX: This is busted. fd*() need to not take proc + * arguments or something. + */ + if (flags & RFCFDG) + fd = fdinit(td); + else if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + fd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); + } else + fd = fdshare(p1); + + /* + * Make a proc table entry for the new process. + * Start by zeroing the section of proc that is zero-initialized, + * then copy the section that is copied directly from the parent. + */ + td2 = thread_get(p2); + ke2 = &p2->p_kse; + kg2 = &p2->p_ksegrp; + +#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) + + bzero(&p2->p_startzero, + (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); + bzero(&ke2->ke_startzero, + (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); + bzero(&td2->td_startzero, + (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); + bzero(&kg2->kg_startzero, + (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); + + mtx_init(&p2->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + PROC_LOCK(p2); + PROC_LOCK(p1); + + bcopy(&p1->p_startcopy, &p2->p_startcopy, + (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); + bcopy(&td->td_kse->ke_startcopy, &ke2->ke_startcopy, + (unsigned) RANGEOF(struct kse, ke_startcopy, ke_endcopy)); + bcopy(&td->td_startcopy, &td2->td_startcopy, + (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); + bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, + (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); +#undef RANGEOF + + /* + * XXXKSE Theoretically only the running thread would get copied + * Others in the kernel would be 'aborted' in the child. + * i.e return E*something* + */ + proc_linkup(p2, kg2, ke2, td2); + + /* note.. XXXKSE no pcb or u-area yet */ + + /* + * Duplicate sub-structures as needed. + * Increase reference counts on shared objects. + * The p_stats and p_sigacts substructs are set in vm_forkproc. + */ + p2->p_flag = 0; + mtx_lock_spin(&sched_lock); + p2->p_sflag = PS_INMEM; + if (p1->p_sflag & PS_PROFIL) + startprofclock(p2); + mtx_unlock_spin(&sched_lock); + p2->p_ucred = crhold(td->td_ucred); + td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ + + /* + * Setup linkage for kernel based threading + */ + if((flags & RFTHREAD) != 0) { + /* + * XXX: This assumes a leader is a parent or grandparent of + * all processes in a task. + */ + if (p1->p_leader != p1) + PROC_LOCK(p1->p_leader); + p2->p_peers = p1->p_peers; + p1->p_peers = p2; + p2->p_leader = p1->p_leader; + if (p1->p_leader != p1) + PROC_UNLOCK(p1->p_leader); + } else { + p2->p_peers = NULL; + p2->p_leader = p2; + } + + pargs_hold(p2->p_args); + + if (flags & RFSIGSHARE) { + p2->p_procsig = p1->p_procsig; + p2->p_procsig->ps_refcnt++; + if (p1->p_sigacts == &p1->p_uarea->u_sigacts) { + /* + * Set p_sigacts to the new shared structure. 
+ * Note that this is updating p1->p_sigacts at the + * same time, since p_sigacts is just a pointer to + * the shared p_procsig->ps_sigacts. + */ + p2->p_sigacts = newsigacts; + newsigacts = NULL; + *p2->p_sigacts = p1->p_uarea->u_sigacts; + } + } else { + p2->p_procsig = newprocsig; + newprocsig = NULL; + bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig)); + p2->p_procsig->ps_refcnt = 1; + p2->p_sigacts = NULL; /* finished in vm_forkproc() */ + } + if (flags & RFLINUXTHPN) + p2->p_sigparent = SIGUSR1; + else + p2->p_sigparent = SIGCHLD; + + /* Bump references to the text vnode (for procfs) */ + p2->p_textvp = p1->p_textvp; + if (p2->p_textvp) + VREF(p2->p_textvp); + p2->p_fd = fd; + PROC_UNLOCK(p1); + PROC_UNLOCK(p2); + + /* + * If p_limit is still copy-on-write, bump refcnt, + * otherwise get a copy that won't be modified. + * (If PL_SHAREMOD is clear, the structure is shared + * copy-on-write.) + */ + if (p1->p_limit->p_lflags & PL_SHAREMOD) + p2->p_limit = limcopy(p1->p_limit); + else { + p2->p_limit = p1->p_limit; + p2->p_limit->p_refcnt++; + } + + sx_xlock(&proctree_lock); + PGRP_LOCK(p1->p_pgrp); + PROC_LOCK(p2); + PROC_LOCK(p1); + + /* + * Preserve some more flags in subprocess. PS_PROFIL has already + * been preserved. + */ + p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); + SESS_LOCK(p1->p_session); + if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) + p2->p_flag |= P_CONTROLT; + SESS_UNLOCK(p1->p_session); + if (flags & RFPPWAIT) + p2->p_flag |= P_PPWAIT; + + LIST_INSERT_AFTER(p1, p2, p_pglist); + PGRP_UNLOCK(p1->p_pgrp); + LIST_INIT(&p2->p_children); + LIST_INIT(&td2->td_contested); /* XXXKSE only 1 thread? */ + + callout_init(&p2->p_itcallout, 0); + callout_init(&td2->td_slpcallout, 1); /* XXXKSE */ + +#ifdef KTRACE + /* + * Copy traceflag and tracefile if enabled. + */ + mtx_lock(&ktrace_mtx); + KASSERT(p2->p_tracep == NULL, ("new process has a ktrace vnode")); + if (p1->p_traceflag & KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracep = p1->p_tracep) != NULL) + VREF(p2->p_tracep); + } + mtx_unlock(&ktrace_mtx); +#endif + + /* + * set priority of child to be that of parent + * XXXKSE hey! copying the estcpu seems dodgy.. should split it.. + */ + mtx_lock_spin(&sched_lock); + p2->p_ksegrp.kg_estcpu = p1->p_ksegrp.kg_estcpu; + mtx_unlock_spin(&sched_lock); + + /* + * This begins the section where we must prevent the parent + * from being swapped. + */ + _PHOLD(p1); + PROC_UNLOCK(p1); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); + PROC_UNLOCK(p2); + sx_xunlock(&proctree_lock); + + /* + * XXXKSE: In KSE, there would be a race here if one thread was + * dieing due to a signal (or calling exit1() for that matter) while + * another thread was calling fork1(). Not sure how KSE wants to work + * around that. The problem is that up until the point above, if p1 + * gets killed, it won't find p2 in its list in order for it to be + * reparented. Alternatively, we could add a new p_flag that gets set + * before we reparent all the children that we check above and just + * use init as our parent if that if that flag is set. (Either that + * or abort the fork if the flag is set since our parent died trying + * to fork us (which is evil)). 
+ */ + + KASSERT(newprocsig == NULL, ("unused newprocsig")); + if (newsigacts != NULL) + FREE(newsigacts, M_SUBPROC); + /* + * Finish creating the child process. It will return via a different + * execution path later. (ie: directly into user mode) + */ + vm_forkproc(td, p2, td2, flags); + + if (flags == (RFFDG | RFPROC)) { + cnt.v_forks++; + cnt.v_forkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { + cnt.v_vforks++; + cnt.v_vforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (p1 == &proc0) { + cnt.v_kthreads++; + cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else { + cnt.v_rforks++; + cnt.v_rforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } + + /* + * Both processes are set up, now check if any loadable modules want + * to adjust anything. + * What if they have an error? XXX + */ + sx_slock(&fork_list_lock); + TAILQ_FOREACH(ep, &fork_list, next) { + (*ep->function)(p1, p2, flags); + } + sx_sunlock(&fork_list_lock); + + /* + * If RFSTOPPED not requested, make child runnable and add to + * run queue. + */ + microtime(&(p2->p_stats->p_start)); + p2->p_acflag = AFORK; + if ((flags & RFSTOPPED) == 0) { + mtx_lock_spin(&sched_lock); + p2->p_stat = SRUN; + setrunqueue(td2); + mtx_unlock_spin(&sched_lock); + } + + /* + * Now can be swapped. + */ + PROC_LOCK(p1); + _PRELE(p1); + + /* + * tell any interested parties about the new process + */ + KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); + PROC_UNLOCK(p1); + + /* + * Preserve synchronization semantics of vfork. If waiting for + * child to exec or exit, set P_PPWAIT on child, and sleep on our + * proc (in case of exit). + */ + PROC_LOCK(p2); + while (p2->p_flag & P_PPWAIT) + msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); + PROC_UNLOCK(p2); + + /* + * Return child proc pointer to parent. + */ + *procp = p2; + return (0); +} + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. + * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ + +int +at_fork(function) + forklist_fn function; +{ + struct forklist *ep; + +#ifdef INVARIANTS + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("WARNING: fork callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + sx_xlock(&fork_list_lock); + TAILQ_INSERT_TAIL(&fork_list, ep, next); + sx_xunlock(&fork_list_lock); + return (0); +} + +/* + * Scan the exit callout list for the given item and remove it.. + * Returns the number of items removed (0 or 1) + */ + +int +rm_at_fork(function) + forklist_fn function; +{ + struct forklist *ep; + + sx_xlock(&fork_list_lock); + TAILQ_FOREACH(ep, &fork_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&fork_list, ep, next); + sx_xunlock(&fork_list_lock); + free(ep, M_ATFORK); + return(1); + } + } + sx_xunlock(&fork_list_lock); + return (0); +} + +/* + * Handle the return of a child process from fork1(). This function + * is called from the MD fork_trampoline() entry point. 
+ */ +void +fork_exit(callout, arg, frame) + void (*callout)(void *, struct trapframe *); + void *arg; + struct trapframe *frame; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + + td->td_kse->ke_oncpu = PCPU_GET(cpuid); + /* + * Finish setting up thread glue. We need to initialize + * the thread into a td_critnest=1 state. Some platforms + * may have already partially or fully initialized td_critnest + * and/or td_md.md_savecrit (when applciable). + * + * see <arch>/<arch>/critical.c + */ + sched_lock.mtx_lock = (uintptr_t)td; + sched_lock.mtx_recurse = 0; + cpu_critical_fork_exit(); + CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + if (PCPU_GET(switchtime.sec) == 0) + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + mtx_unlock_spin(&sched_lock); + + /* + * cpu_set_fork_handler intercepts this function call to + * have this call a non-return function to stay in kernel mode. + * initproc has its own fork handler, but it does return. + */ + KASSERT(callout != NULL, ("NULL callout in fork_exit")); + callout(arg, frame); + + /* + * Check if a kernel thread misbehaved and returned from its main + * function. + */ + PROC_LOCK(p); + if (p->p_flag & P_KTHREAD) { + PROC_UNLOCK(p); + mtx_lock(&Giant); + printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", + p->p_comm, p->p_pid); + kthread_exit(0); + } + PROC_UNLOCK(p); + mtx_assert(&Giant, MA_NOTOWNED); +} + +/* + * Simplified back end of syscall(), used when returning from fork() + * directly into user mode. Giant is not held on entry, and must not + * be held on return. This function is passed in to fork_exit() as the + * first parameter and is called when returning to a new userland process. + */ +void +fork_return(td, frame) + struct thread *td; + struct trapframe *frame; +{ + + userret(td, frame, 0); +#ifdef KTRACE + if (KTRPOINT(td, KTR_SYSRET)) + ktrsysret(SYS_fork, 0, 0); +#endif + mtx_assert(&Giant, MA_NOTOWNED); +} diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c new file mode 100644 index 0000000..29194b7 --- /dev/null +++ b/sys/kern/kern_idle.c @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 2000, All rights reserved. See /usr/src/COPYRIGHT + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> +#include <sys/unistd.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +static void idle_setup(void *dummy); +SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) + +static void idle_proc(void *dummy); + +/* + * Setup per-cpu idle process contexts. The AP's shouldn't be running or + * accessing their idle processes at this point, so don't bother with + * locking. 
+ */ +static void +idle_setup(void *dummy) +{ +#ifdef SMP + struct pcpu *pc; +#endif + struct proc *p; + int error; + +#ifdef SMP + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + error = kthread_create(idle_proc, NULL, &p, + RFSTOPPED | RFHIGHPID, "idle: cpu%d", pc->pc_cpuid); + pc->pc_idlethread = FIRST_THREAD_IN_PROC(p); + if (pc->pc_curthread == NULL) { + pc->pc_curthread = pc->pc_idlethread; + pc->pc_idlethread->td_critnest = 0; + } +#else + error = kthread_create(idle_proc, NULL, &p, + RFSTOPPED | RFHIGHPID, "idle"); + PCPU_SET(idlethread, FIRST_THREAD_IN_PROC(p)); +#endif + if (error) + panic("idle_setup: kthread_create error %d\n", error); + + p->p_flag |= P_NOLOAD; + p->p_stat = SRUN; +#ifdef SMP + } +#endif +} + +/* + * idle process context + */ +static void +idle_proc(void *dummy) +{ +#ifdef DIAGNOSTIC + int count; +#endif + + for (;;) { + mtx_assert(&Giant, MA_NOTOWNED); + +#ifdef DIAGNOSTIC + count = 0; + + while (count >= 0 && procrunnable() == 0) { +#else + while (procrunnable() == 0) { +#endif + /* + * This is a good place to put things to be done in + * the background, including sanity checks. + */ + +#ifdef DIAGNOSTIC + if (count++ < 0) + CTR0(KTR_PROC, "idle_proc: timed out waiting" + " for a process"); +#endif + +#ifdef __i386__ + cpu_idle(); +#endif + } + + mtx_lock_spin(&sched_lock); + curproc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } +} diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c new file mode 100644 index 0000000..d65dc82 --- /dev/null +++ b/sys/kern/kern_intr.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 1997, Stefan Esser <se@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/rtprio.h> +#include <sys/systm.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/random.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/unistd.h> +#include <sys/vmmeter.h> +#include <machine/atomic.h> +#include <machine/cpu.h> +#include <machine/md_var.h> +#include <machine/stdarg.h> + +#include <net/netisr.h> /* prototype for legacy_setsoftnet */ + +struct int_entropy { + struct proc *proc; + int vector; +}; + +void *net_ih; +void *vm_ih; +void *softclock_ih; +struct ithd *clk_ithd; +struct ithd *tty_ithd; + +static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); + +static void ithread_update(struct ithd *); +static void ithread_loop(void *); +static void start_softintr(void *); +static void swi_net(void *); + +u_char +ithread_priority(enum intr_type flags) +{ + u_char pri; + + flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | + INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); + switch (flags) { + case INTR_TYPE_TTY: + pri = PI_TTYLOW; + break; + case INTR_TYPE_BIO: + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; + break; + case INTR_TYPE_NET: + pri = PI_NET; + break; + case INTR_TYPE_CAM: + pri = PI_DISK; /* XXX or PI_CAM? */ + break; + case INTR_TYPE_AV: /* Audio/video */ + pri = PI_AV; + break; + case INTR_TYPE_CLK: + pri = PI_REALTIME; + break; + case INTR_TYPE_MISC: + pri = PI_DULL; /* don't care */ + break; + default: + /* We didn't specify an interrupt level. */ + panic("ithread_priority: no interrupt type in flags"); + } + + return pri; +} + +/* + * Regenerate the name (p_comm) and priority for a threaded interrupt thread. + */ +static void +ithread_update(struct ithd *ithd) +{ + struct intrhand *ih; + struct thread *td; + struct proc *p; + int entropy; + + mtx_assert(&ithd->it_lock, MA_OWNED); + td = ithd->it_td; + if (td == NULL) + return; + p = td->td_proc; + + strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name)); + ih = TAILQ_FIRST(&ithd->it_handlers); + if (ih == NULL) { + mtx_lock_spin(&sched_lock); + td->td_priority = PRI_MAX_ITHD; + td->td_base_pri = PRI_MAX_ITHD; + mtx_unlock_spin(&sched_lock); + ithd->it_flags &= ~IT_ENTROPY; + return; + } + entropy = 0; + mtx_lock_spin(&sched_lock); + td->td_priority = ih->ih_pri; + td->td_base_pri = ih->ih_pri; + mtx_unlock_spin(&sched_lock); + TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { + if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 < + sizeof(p->p_comm)) { + strcat(p->p_comm, " "); + strcat(p->p_comm, ih->ih_name); + } else if (strlen(p->p_comm) + 1 == sizeof(p->p_comm)) { + if (p->p_comm[sizeof(p->p_comm) - 2] == '+') + p->p_comm[sizeof(p->p_comm) - 2] = '*'; + else + p->p_comm[sizeof(p->p_comm) - 2] = '+'; + } else + strcat(p->p_comm, "+"); + if (ih->ih_flags & IH_ENTROPY) + entropy++; + } + if (entropy) + ithd->it_flags |= IT_ENTROPY; + else + ithd->it_flags &= ~IT_ENTROPY; + CTR2(KTR_INTR, "%s: updated %s\n", __func__, p->p_comm); +} + +int +ithread_create(struct ithd **ithread, int vector, int flags, + void (*disable)(int), void (*enable)(int), const char *fmt, ...) +{ + struct ithd *ithd; + struct thread *td; + struct proc *p; + int error; + va_list ap; + + /* The only valid flag during creation is IT_SOFT. 
*/ + if ((flags & ~IT_SOFT) != 0) + return (EINVAL); + + ithd = malloc(sizeof(struct ithd), M_ITHREAD, M_WAITOK | M_ZERO); + ithd->it_vector = vector; + ithd->it_disable = disable; + ithd->it_enable = enable; + ithd->it_flags = flags; + TAILQ_INIT(&ithd->it_handlers); + mtx_init(&ithd->it_lock, "ithread", NULL, MTX_DEF); + + va_start(ap, fmt); + vsnprintf(ithd->it_name, sizeof(ithd->it_name), fmt, ap); + va_end(ap); + + error = kthread_create(ithread_loop, ithd, &p, RFSTOPPED | RFHIGHPID, + "%s", ithd->it_name); + if (error) { + mtx_destroy(&ithd->it_lock); + free(ithd, M_ITHREAD); + return (error); + } + td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ + td->td_ksegrp->kg_pri_class = PRI_ITHD; + td->td_priority = PRI_MAX_ITHD; + p->p_stat = SWAIT; + ithd->it_td = td; + td->td_ithd = ithd; + if (ithread != NULL) + *ithread = ithd; + + CTR2(KTR_INTR, "%s: created %s", __func__, ithd->it_name); + return (0); +} + +int +ithread_destroy(struct ithd *ithread) +{ + + struct thread *td; + struct proc *p; + if (ithread == NULL) + return (EINVAL); + + td = ithread->it_td; + p = td->td_proc; + mtx_lock(&ithread->it_lock); + if (!TAILQ_EMPTY(&ithread->it_handlers)) { + mtx_unlock(&ithread->it_lock); + return (EINVAL); + } + ithread->it_flags |= IT_DEAD; + mtx_lock_spin(&sched_lock); + if (p->p_stat == SWAIT) { + p->p_stat = SRUN; /* XXXKSE */ + setrunqueue(td); + } + mtx_unlock_spin(&sched_lock); + mtx_unlock(&ithread->it_lock); + CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_name); + return (0); +} + +int +ithread_add_handler(struct ithd* ithread, const char *name, + driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, + void **cookiep) +{ + struct intrhand *ih, *temp_ih; + + if (ithread == NULL || name == NULL || handler == NULL) + return (EINVAL); + if ((flags & INTR_FAST) !=0) + flags |= INTR_EXCL; + + ih = malloc(sizeof(struct intrhand), M_ITHREAD, M_WAITOK | M_ZERO); + ih->ih_handler = handler; + ih->ih_argument = arg; + ih->ih_name = name; + ih->ih_ithread = ithread; + ih->ih_pri = pri; + if (flags & INTR_FAST) + ih->ih_flags = IH_FAST | IH_EXCLUSIVE; + else if (flags & INTR_EXCL) + ih->ih_flags = IH_EXCLUSIVE; + if (flags & INTR_MPSAFE) + ih->ih_flags |= IH_MPSAFE; + if (flags & INTR_ENTROPY) + ih->ih_flags |= IH_ENTROPY; + + mtx_lock(&ithread->it_lock); + if ((flags & INTR_EXCL) !=0 && !TAILQ_EMPTY(&ithread->it_handlers)) + goto fail; + if (!TAILQ_EMPTY(&ithread->it_handlers) && + (TAILQ_FIRST(&ithread->it_handlers)->ih_flags & IH_EXCLUSIVE) != 0) + goto fail; + + TAILQ_FOREACH(temp_ih, &ithread->it_handlers, ih_next) + if (temp_ih->ih_pri > ih->ih_pri) + break; + if (temp_ih == NULL) + TAILQ_INSERT_TAIL(&ithread->it_handlers, ih, ih_next); + else + TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); + ithread_update(ithread); + mtx_unlock(&ithread->it_lock); + + if (cookiep != NULL) + *cookiep = ih; + CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, + ithread->it_name); + return (0); + +fail: + mtx_unlock(&ithread->it_lock); + free(ih, M_ITHREAD); + return (EINVAL); +} + +int +ithread_remove_handler(void *cookie) +{ + struct intrhand *handler = (struct intrhand *)cookie; + struct ithd *ithread; +#ifdef INVARIANTS + struct intrhand *ih; +#endif + + if (handler == NULL) + return (EINVAL); + ithread = handler->ih_ithread; + KASSERT(ithread != NULL, + ("interrupt handler \"%s\" has a NULL interrupt thread", + handler->ih_name)); + CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, + ithread->it_name); + mtx_lock(&ithread->it_lock); +#ifdef INVARIANTS + 
TAILQ_FOREACH(ih, &ithread->it_handlers, ih_next) + if (ih == handler) + goto ok; + mtx_unlock(&ithread->it_lock); + panic("interrupt handler \"%s\" not found in interrupt thread \"%s\"", + ih->ih_name, ithread->it_name); +ok: +#endif + /* + * If the interrupt thread is already running, then just mark this + * handler as being dead and let the ithread do the actual removal. + */ + mtx_lock_spin(&sched_lock); + if (ithread->it_td->td_proc->p_stat != SWAIT) { + handler->ih_flags |= IH_DEAD; + + /* + * Ensure that the thread will process the handler list + * again and remove this handler if it has already passed + * it on the list. + */ + ithread->it_need = 1; + } else + TAILQ_REMOVE(&ithread->it_handlers, handler, ih_next); + mtx_unlock_spin(&sched_lock); + if ((handler->ih_flags & IH_DEAD) != 0) + msleep(handler, &ithread->it_lock, PUSER, "itrmh", 0); + ithread_update(ithread); + mtx_unlock(&ithread->it_lock); + free(handler, M_ITHREAD); + return (0); +} + +int +ithread_schedule(struct ithd *ithread, int do_switch) +{ + struct int_entropy entropy; + struct thread *td; + struct proc *p; + + /* + * If no ithread or no handlers, then we have a stray interrupt. + */ + if ((ithread == NULL) || TAILQ_EMPTY(&ithread->it_handlers)) + return (EINVAL); + + /* + * If any of the handlers for this ithread claim to be good + * sources of entropy, then gather some. + */ + if (harvest.interrupt && ithread->it_flags & IT_ENTROPY) { + entropy.vector = ithread->it_vector; + entropy.proc = curthread->td_proc;; + random_harvest(&entropy, sizeof(entropy), 2, 0, + RANDOM_INTERRUPT); + } + + td = ithread->it_td; + p = td->td_proc; + KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name)); + CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm, + ithread->it_need); + + /* + * Set it_need to tell the thread to keep running if it is already + * running. Then, grab sched_lock and see if we actually need to + * put this thread on the runqueue. If so and the do_switch flag is + * true and it is safe to switch, then switch to the ithread + * immediately. Otherwise, set the needresched flag to guarantee + * that this ithread will run before any userland processes. + */ + ithread->it_need = 1; + mtx_lock_spin(&sched_lock); + if (p->p_stat == SWAIT) { + CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid); + p->p_stat = SRUN; + setrunqueue(td); /* XXXKSE */ + if (do_switch && curthread->td_critnest == 1 && + curthread->td_proc->p_stat == SRUN) { + if (curthread != PCPU_GET(idlethread)) + setrunqueue(curthread); + curthread->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } else { + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; + } + } else { + CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d", + __func__, p->p_pid, ithread->it_need, p->p_stat); + } + mtx_unlock_spin(&sched_lock); + + return (0); +} + +int +swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler, + void *arg, int pri, enum intr_type flags, void **cookiep) +{ + struct ithd *ithd; + int error; + + if (flags & (INTR_FAST | INTR_ENTROPY)) + return (EINVAL); + + ithd = (ithdp != NULL) ? *ithdp : NULL; + + if (ithd != NULL) { + if ((ithd->it_flags & IT_SOFT) == 0) + return(EINVAL); + } else { + error = ithread_create(&ithd, pri, IT_SOFT, NULL, NULL, + "swi%d:", pri); + if (error) + return (error); + + if (ithdp != NULL) + *ithdp = ithd; + } + return (ithread_add_handler(ithd, name, handler, arg, + (pri * RQ_PPQ) + PI_SOFT, flags, cookiep)); +} + + +/* + * Schedule a heavyweight software interrupt process. 
+ */ +void +swi_sched(void *cookie, int flags) +{ + struct intrhand *ih = (struct intrhand *)cookie; + struct ithd *it = ih->ih_ithread; + int error; + + atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ + + CTR3(KTR_INTR, "swi_sched pid %d(%s) need=%d", + it->it_td->td_proc->p_pid, it->it_td->td_proc->p_comm, it->it_need); + + /* + * Set ih_need for this handler so that if the ithread is already + * running it will execute this handler on the next pass. Otherwise, + * it will execute it the next time it runs. + */ + atomic_store_rel_int(&ih->ih_need, 1); + if (!(flags & SWI_DELAY)) { + error = ithread_schedule(it, !cold); + KASSERT(error == 0, ("stray software interrupt")); + } +} + +/* + * This is the main code for interrupt threads. + */ +void +ithread_loop(void *arg) +{ + struct ithd *ithd; /* our thread context */ + struct intrhand *ih; /* and our interrupt handler chain */ + struct thread *td; + struct proc *p; + + td = curthread; + p = td->td_proc; + ithd = (struct ithd *)arg; /* point to myself */ + KASSERT(ithd->it_td == td && td->td_ithd == ithd, + ("%s: ithread and proc linkage out of sync", __func__)); + + /* + * As long as we have interrupts outstanding, go through the + * list of handlers, giving each one a go at it. + */ + for (;;) { + /* + * If we are an orphaned thread, then just die. + */ + if (ithd->it_flags & IT_DEAD) { + CTR3(KTR_INTR, "%s: pid %d: (%s) exiting", __func__, + p->p_pid, p->p_comm); + td->td_ithd = NULL; + mtx_destroy(&ithd->it_lock); + mtx_lock(&Giant); + free(ithd, M_ITHREAD); + kthread_exit(0); + } + + CTR4(KTR_INTR, "%s: pid %d: (%s) need=%d", __func__, + p->p_pid, p->p_comm, ithd->it_need); + while (ithd->it_need) { + /* + * Service interrupts. If another interrupt + * arrives while we are running, they will set + * it_need to denote that we should make + * another pass. + */ + atomic_store_rel_int(&ithd->it_need, 0); +restart: + TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { + if (ithd->it_flags & IT_SOFT && !ih->ih_need) + continue; + atomic_store_rel_int(&ih->ih_need, 0); + CTR6(KTR_INTR, + "%s: pid %d ih=%p: %p(%p) flg=%x", __func__, + p->p_pid, (void *)ih, + (void *)ih->ih_handler, ih->ih_argument, + ih->ih_flags); + + if ((ih->ih_flags & IH_DEAD) != 0) { + mtx_lock(&ithd->it_lock); + TAILQ_REMOVE(&ithd->it_handlers, ih, + ih_next); + wakeup(ih); + mtx_unlock(&ithd->it_lock); + goto restart; + } + if ((ih->ih_flags & IH_MPSAFE) == 0) + mtx_lock(&Giant); + ih->ih_handler(ih->ih_argument); + if ((ih->ih_flags & IH_MPSAFE) == 0) + mtx_unlock(&Giant); + } + } + + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + if (!ithd->it_need) { + /* + * Should we call this earlier in the loop above? 
+ */ + if (ithd->it_enable != NULL) + ithd->it_enable(ithd->it_vector); + p->p_stat = SWAIT; /* we're idle */ + p->p_stats->p_ru.ru_nvcsw++; + CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid); + mi_switch(); + CTR2(KTR_INTR, "%s: pid %d: resumed", __func__, p->p_pid); + } + mtx_unlock_spin(&sched_lock); + } +} + +/* + * Start standard software interrupt threads + */ +static void +start_softintr(void *dummy) +{ + + if (swi_add(NULL, "net", swi_net, NULL, SWI_NET, 0, &net_ih) || + swi_add(&clk_ithd, "clock", softclock, NULL, SWI_CLOCK, + INTR_MPSAFE, &softclock_ih) || + swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, 0, &vm_ih)) + panic("died while creating standard software ithreads"); + + PROC_LOCK(clk_ithd->it_td->td_proc); + clk_ithd->it_td->td_proc->p_flag |= P_NOLOAD; + PROC_UNLOCK(clk_ithd->it_td->td_proc); +} +SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) + +void +legacy_setsoftnet(void) +{ + swi_sched(net_ih, 0); +} + +/* + * XXX: This should really be in the network code somewhere and installed + * via a SI_SUB_SOFINTR, SI_ORDER_MIDDLE sysinit. + */ +void (*netisrs[32])(void); +volatile unsigned int netisr; /* scheduling bits for network */ + +int +register_netisr(num, handler) + int num; + netisr_t *handler; +{ + + if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { + printf("register_netisr: bad isr number: %d\n", num); + return (EINVAL); + } + netisrs[num] = handler; + return (0); +} + +int +unregister_netisr(num) + int num; +{ + + if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { + printf("unregister_netisr: bad isr number: %d\n", num); + return (EINVAL); + } + netisrs[num] = NULL; + return (0); +} + +#ifdef DEVICE_POLLING + void netisr_pollmore(void); +#endif + +static void +swi_net(void *dummy) +{ + u_int bits; + int i; + +#ifdef DEVICE_POLLING + for (;;) { + int pollmore; +#endif + bits = atomic_readandclear_int(&netisr); +#ifdef DEVICE_POLLING + if (bits == 0) + return; + pollmore = bits & (1 << NETISR_POLL); +#endif + while ((i = ffs(bits)) != 0) { + i--; + if (netisrs[i] != NULL) + netisrs[i](); + else + printf("swi_net: unregistered isr number: %d.\n", i); + bits &= ~(1 << i); + } +#ifdef DEVICE_POLLING + if (pollmore) + netisr_pollmore(); + } +#endif +} + +/* + * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. + * The data for this machine dependent, and the declarations are in machine + * dependent code. The layout of intrnames and intrcnt however is machine + * independent. + * + * We do not know the length of intrcnt and intrnames at compile time, so + * calculate things at run time. + */ +static int +sysctl_intrnames(SYSCTL_HANDLER_ARGS) +{ + return (sysctl_handle_opaque(oidp, intrnames, eintrnames - intrnames, + req)); +} + +SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, + NULL, 0, sysctl_intrnames, "", "Interrupt Names"); + +static int +sysctl_intrcnt(SYSCTL_HANDLER_ARGS) +{ + return (sysctl_handle_opaque(oidp, intrcnt, + (char *)eintrcnt - (char *)intrcnt, req)); +} + +SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, + NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c new file mode 100644 index 0000000..cf3b03c --- /dev/null +++ b/sys/kern/kern_jail.c @@ -0,0 +1,256 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. 
As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/jail.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <netinet/in.h> + +MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); + +SYSCTL_DECL(_security); +SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, + "Jail rules"); + +mp_fixme("these variables need a lock") + +int jail_set_hostname_allowed = 1; +SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW, + &jail_set_hostname_allowed, 0, + "Processes in jail can set their hostnames"); + +int jail_socket_unixiproute_only = 1; +SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW, + &jail_socket_unixiproute_only, 0, + "Processes in jail are limited to creating UNIX/IPv4/route sockets only"); + +int jail_sysvipc_allowed = 0; +SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW, + &jail_sysvipc_allowed, 0, + "Processes in jail can use System V IPC primitives"); + +/* + * MPSAFE + */ +int +jail(td, uap) + struct thread *td; + struct jail_args /* { + syscallarg(struct jail *) jail; + } */ *uap; +{ + struct proc *p = td->td_proc; + int error; + struct prison *pr; + struct jail j; + struct chroot_args ca; + struct ucred *newcred = NULL, *oldcred; + + error = copyin(uap->jail, &j, sizeof j); + if (error) + return (error); + if (j.version != 0) + return (EINVAL); + + MALLOC(pr, struct prison *, sizeof *pr , M_PRISON, M_WAITOK | M_ZERO); + mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); + pr->pr_securelevel = securelevel; + error = copyinstr(j.hostname, &pr->pr_host, sizeof pr->pr_host, 0); + if (error) + goto bail; + ca.path = j.path; + error = chroot(td, &ca); + if (error) + goto bail; + newcred = crget(); + pr->pr_ip = j.ip_number; + PROC_LOCK(p); + /* Implicitly fail if already in jail. 
*/ + error = suser_cred(p->p_ucred, 0); + if (error) + goto badcred; + oldcred = p->p_ucred; + crcopy(newcred, oldcred); + p->p_ucred = newcred; + p->p_ucred->cr_prison = pr; + pr->pr_ref = 1; + PROC_UNLOCK(p); + crfree(oldcred); + return (0); +badcred: + PROC_UNLOCK(p); + crfree(newcred); +bail: + FREE(pr, M_PRISON); + return (error); +} + +void +prison_free(struct prison *pr) +{ + + mtx_lock(&pr->pr_mtx); + pr->pr_ref--; + if (pr->pr_ref == 0) { + mtx_unlock(&pr->pr_mtx); + mtx_destroy(&pr->pr_mtx); + if (pr->pr_linux != NULL) + FREE(pr->pr_linux, M_PRISON); + FREE(pr, M_PRISON); + return; + } + mtx_unlock(&pr->pr_mtx); +} + +void +prison_hold(struct prison *pr) +{ + + mtx_lock(&pr->pr_mtx); + pr->pr_ref++; + mtx_unlock(&pr->pr_mtx); +} + +u_int32_t +prison_getip(struct ucred *cred) +{ + + return (cred->cr_prison->pr_ip); +} + +int +prison_ip(struct ucred *cred, int flag, u_int32_t *ip) +{ + u_int32_t tmp; + + if (!jailed(cred)) + return (0); + if (flag) + tmp = *ip; + else + tmp = ntohl(*ip); + if (tmp == INADDR_ANY) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return (0); + } + if (tmp == INADDR_LOOPBACK) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return (0); + } + if (cred->cr_prison->pr_ip != tmp) + return (1); + return (0); +} + +void +prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip) +{ + u_int32_t tmp; + + if (!jailed(cred)) + return; + if (flag) + tmp = *ip; + else + tmp = ntohl(*ip); + if (tmp == INADDR_LOOPBACK) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return; + } + return; +} + +int +prison_if(struct ucred *cred, struct sockaddr *sa) +{ + struct sockaddr_in *sai = (struct sockaddr_in*) sa; + int ok; + + if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only) + ok = 1; + else if (sai->sin_family != AF_INET) + ok = 0; + else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr)) + ok = 1; + else + ok = 0; + return (ok); +} + +/* + * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. + */ +int +prison_check(cred1, cred2) + struct ucred *cred1, *cred2; +{ + + if (jailed(cred1)) { + if (!jailed(cred2)) + return (ESRCH); + if (cred2->cr_prison != cred1->cr_prison) + return (ESRCH); + } + + return (0); +} + +/* + * Return 1 if the passed credential is in a jail, otherwise 0. + */ +int +jailed(cred) + struct ucred *cred; +{ + + return (cred->cr_prison != NULL); +} + +/* + * Return the correct hostname for the passed credential. + */ +void +getcredhostname(cred, buf, size) + struct ucred *cred; + char *buf; + size_t size; +{ + + if (jailed(cred)) { + mtx_lock(&cred->cr_prison->pr_mtx); + strncpy(buf, cred->cr_prison->pr_host, size); + mtx_unlock(&cred->cr_prison->pr_mtx); + } + else + strncpy(buf, hostname, size); + buf[size - 1] = '\0'; +} diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c new file mode 100644 index 0000000..a456a86 --- /dev/null +++ b/sys/kern/kern_kthread.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/wait.h> + +#include <machine/stdarg.h> + +/* + * Start a kernel process. This is called after a fork() call in + * mi_startup() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons and intended + * to be called from SYSINIT(). + */ +void +kproc_start(udata) + const void *udata; +{ + const struct kproc_desc *kp = udata; + int error; + + error = kthread_create((void (*)(void *))kp->func, NULL, + kp->global_procpp, 0, "%s", kp->arg0); + if (error) + panic("kproc_start: %s: error %d", kp->arg0, error); +} + +/* + * Create a kernel process/thread/whatever. It shares its address space + * with proc0 - ie: kernel only. + * + * func is the function to start. + * arg is the parameter to pass to function on first startup. + * newpp is the return value pointing to the thread's struct proc. + * flags are flags to fork1 (in unistd.h) + * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). + */ +int +kthread_create(void (*func)(void *), void *arg, + struct proc **newpp, int flags, const char *fmt, ...) +{ + int error; + va_list ap; + struct proc *p2; + + if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */) + panic("kthread_create called too soon"); + + error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags, + &p2); + if (error) + return error; + + /* save a global descriptor, if desired */ + if (newpp != NULL) + *newpp = p2; + + /* this is a non-swapped system process */ + PROC_LOCK(p2); + p2->p_flag |= P_SYSTEM | P_KTHREAD; + p2->p_procsig->ps_flag |= PS_NOCLDWAIT; + _PHOLD(p2); + PROC_UNLOCK(p2); + + /* set up arg0 for 'ps', et al */ + va_start(ap, fmt); + vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap); + va_end(ap); + + /* call the processes' main()... */ + cpu_set_fork_handler(FIRST_THREAD_IN_PROC(p2), func, arg); + + /* Delay putting it on the run queue until now. 
*/ + mtx_lock_spin(&sched_lock); + p2->p_sflag |= PS_INMEM; + if (!(flags & RFSTOPPED)) { + p2->p_stat = SRUN; + setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */ + } + mtx_unlock_spin(&sched_lock); + + return 0; +} + +void +kthread_exit(int ecode) +{ + struct thread *td; + struct proc *p; + + td = curthread; + p = td->td_proc; + sx_xlock(&proctree_lock); + PROC_LOCK(p); + proc_reparent(p, initproc); + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + exit1(td, W_EXITCODE(ecode, 0)); +} + +/* + * Advise a kernel process to suspend (or resume) in its main loop. + * Participation is voluntary. + */ +int +kthread_suspend(struct proc *p, int timo) +{ + /* + * Make sure this is indeed a system process and we can safely + * use the p_siglist field. + */ + PROC_LOCK(p); + if ((p->p_flag & P_KTHREAD) == 0) { + PROC_UNLOCK(p); + return (EINVAL); + } + SIGADDSET(p->p_siglist, SIGSTOP); + wakeup(p); + return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo); +} + +int +kthread_resume(struct proc *p) +{ + /* + * Make sure this is indeed a system process and we can safely + * use the p_siglist field. + */ + PROC_LOCK(p); + if ((p->p_flag & P_KTHREAD) == 0) { + PROC_UNLOCK(p); + return (EINVAL); + } + SIGDELSET(p->p_siglist, SIGSTOP); + PROC_UNLOCK(p); + wakeup(&p->p_siglist); + return (0); +} + +void +kthread_suspend_check(struct proc *p) +{ + PROC_LOCK(p); + while (SIGISMEMBER(p->p_siglist, SIGSTOP)) { + wakeup(&p->p_siglist); + msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "ktsusp", 0); + } + PROC_UNLOCK(p); +} diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c new file mode 100644 index 0000000..719d5e4 --- /dev/null +++ b/sys/kern/kern_ktr.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2000 + * John Baldwin <jhb@FreeBSD.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This module holds the global variables used by KTR and the ktr_tracepoint() + * function that does the actual tracing. 
+ */ + +#include "opt_ddb.h" +#include "opt_ktr.h" + +#include <sys/param.h> +#include <sys/cons.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/libkern.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/time.h> + +#include <machine/cpu.h> +#ifdef __sparc64__ +#include <machine/ktr.h> +#endif + +#include <ddb/ddb.h> + +#ifndef KTR_ENTRIES +#define KTR_ENTRIES 1024 +#endif + +#ifndef KTR_MASK +#define KTR_MASK (KTR_GEN) +#endif + +#ifndef KTR_CPUMASK +#define KTR_CPUMASK (~0) +#endif + +#ifndef KTR_TIME +#define KTR_TIME get_cyclecount() +#endif + +#ifndef KTR_CPU +#define KTR_CPU PCPU_GET(cpuid) +#endif + +SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options"); + +int ktr_cpumask = KTR_CPUMASK; +TUNABLE_INT("debug.ktr.cpumask", &ktr_cpumask); +SYSCTL_INT(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RW, &ktr_cpumask, 0, ""); + +int ktr_mask = KTR_MASK; +TUNABLE_INT("debug.ktr.mask", &ktr_mask); +SYSCTL_INT(_debug_ktr, OID_AUTO, mask, CTLFLAG_RW, &ktr_mask, 0, ""); + +int ktr_entries = KTR_ENTRIES; +SYSCTL_INT(_debug_ktr, OID_AUTO, entries, CTLFLAG_RD, &ktr_entries, 0, ""); + +int ktr_version = KTR_VERSION; +SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, ""); + +volatile int ktr_idx = 0; +struct ktr_entry ktr_buf[KTR_ENTRIES]; + +#ifdef KTR_VERBOSE +int ktr_verbose = KTR_VERBOSE; +TUNABLE_INT("debug.ktr.verbose", &ktr_verbose); +SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, ""); +#endif + +void +ktr_tracepoint(u_int mask, const char *file, int line, const char *format, + u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5, + u_long arg6) +{ + struct ktr_entry *entry; + int newindex, saveindex; +#ifdef KTR_VERBOSE + struct thread *td; +#endif + int cpu; + + if (panicstr) + return; + if ((ktr_mask & mask) == 0) + return; + cpu = KTR_CPU; + if (((1 << cpu) & ktr_cpumask) == 0) + return; +#ifdef KTR_VERBOSE + td = curthread; + if (td->td_inktr) + return; + td->td_inktr++; +#endif + do { + saveindex = ktr_idx; + newindex = (saveindex + 1) & (KTR_ENTRIES - 1); + } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0); + entry = &ktr_buf[saveindex]; + entry->ktr_timestamp = KTR_TIME; + entry->ktr_cpu = cpu; + entry->ktr_file = file; + entry->ktr_line = line; +#ifdef KTR_VERBOSE + if (ktr_verbose) { +#ifdef SMP + printf("cpu%d ", cpu); +#endif + if (ktr_verbose > 1) { + printf("%s.%d\t", entry->ktr_file, + entry->ktr_line); + } + printf(format, arg1, arg2, arg3, arg4, arg5, arg6); + printf("\n"); + } +#endif + entry->ktr_desc = format; + entry->ktr_parms[0] = arg1; + entry->ktr_parms[1] = arg2; + entry->ktr_parms[2] = arg3; + entry->ktr_parms[3] = arg4; + entry->ktr_parms[4] = arg5; + entry->ktr_parms[5] = arg6; +#ifdef KTR_VERBOSE + td->td_inktr--; +#endif +} + +#ifdef DDB + +struct tstate { + int cur; + int first; +}; +static struct tstate tstate; +static int db_ktr_verbose; +static int db_mach_vtrace(void); + +#define NUM_LINES_PER_PAGE 18 + +DB_SHOW_COMMAND(ktr, db_ktr_all) +{ + int c, lines; + + lines = NUM_LINES_PER_PAGE; + tstate.cur = (ktr_idx - 1) & (KTR_ENTRIES - 1); + tstate.first = -1; + if (strcmp(modif, "v") == 0) + db_ktr_verbose = 1; + else + db_ktr_verbose = 0; + while (db_mach_vtrace()) + if (--lines == 0) { + db_printf("--More--"); + c = cngetc(); + db_printf("\r"); + switch (c) { + case '\n': /* one more line */ + lines = 1; + break; + case ' ': /* one more page */ + lines = NUM_LINES_PER_PAGE; + break; + default: + db_printf("\n"); + return; + } + 
} +} + +static int +db_mach_vtrace(void) +{ + struct ktr_entry *kp; + + if (tstate.cur == tstate.first) { + db_printf("--- End of trace buffer ---\n"); + return (0); + } + kp = &ktr_buf[tstate.cur]; + + /* Skip over unused entries. */ + if (kp->ktr_desc == NULL) { + db_printf("--- End of trace buffer ---\n"); + return (0); + } + db_printf("%d: ", tstate.cur); +#ifdef SMP + db_printf("cpu%d ", kp->ktr_cpu); +#endif + if (db_ktr_verbose) { + db_printf("%10.10lld %s.%d\t", (long long)kp->ktr_timestamp, + kp->ktr_file, kp->ktr_line); + } + db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1], + kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4], + kp->ktr_parms[5]); + db_printf("\n"); + + if (tstate.first == -1) + tstate.first = tstate.cur; + + if (--tstate.cur < 0) + tstate.cur = KTR_ENTRIES - 1; + + return (1); +} + +#endif /* DDB */ diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c new file mode 100644 index 0000000..b71f695 --- /dev/null +++ b/sys/kern/kern_ktrace.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/ktrace.h> +#include <sys/sema.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> + +static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); + +#ifdef KTRACE + +#ifndef KTRACE_REQUEST_POOL +#define KTRACE_REQUEST_POOL 100 +#endif + +struct ktr_request { + struct ktr_header ktr_header; + struct ucred *ktr_cred; + struct vnode *ktr_vp; + union { + struct ktr_syscall ktr_syscall; + struct ktr_sysret ktr_sysret; + struct ktr_genio ktr_genio; + struct ktr_psig ktr_psig; + struct ktr_csw ktr_csw; + } ktr_data; + int ktr_synchronous; + STAILQ_ENTRY(ktr_request) ktr_list; +}; + +static int data_lengths[] = { + 0, /* none */ + offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */ + sizeof(struct ktr_sysret), /* KTR_SYSRET */ + 0, /* KTR_NAMEI */ + sizeof(struct ktr_genio), /* KTR_GENIO */ + sizeof(struct ktr_psig), /* KTR_PSIG */ + sizeof(struct ktr_csw), /* KTR_CSW */ + 0 /* KTR_USER */ +}; + +static STAILQ_HEAD(, ktr_request) ktr_todo; +static STAILQ_HEAD(, ktr_request) ktr_free; + +static uint ktr_requestpool = KTRACE_REQUEST_POOL; +TUNABLE_INT("kern.ktrace_request_pool", &ktr_requestpool); + +static int print_message = 1; +struct mtx ktrace_mtx; +static struct sema ktrace_sema; + +static void ktrace_init(void *dummy); +static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); +static uint ktrace_resize_pool(uint newsize); +static struct ktr_request *ktr_getrequest(int type); +static void ktr_submitrequest(struct ktr_request *req); +static void ktr_freerequest(struct ktr_request *req); +static void ktr_loop(void *dummy); +static void ktr_writerequest(struct ktr_request *req); +static int ktrcanset(struct thread *,struct proc *); +static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); +static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); + +static void +ktrace_init(void *dummy) +{ + struct ktr_request *req; + int i; + + mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); + sema_init(&ktrace_sema, 0, "ktrace"); + STAILQ_INIT(&ktr_todo); + STAILQ_INIT(&ktr_free); + for (i = 0; i < ktr_requestpool; i++) { + req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + } + kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, "ktrace"); +} +SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); + +static int +sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) +{ + struct thread *td; + uint newsize, oldsize, wantsize; + int error; + + /* Handle easy read-only case first to avoid warnings from GCC. 
*/ + if (!req->newptr) { + mtx_lock(&ktrace_mtx); + oldsize = ktr_requestpool; + mtx_unlock(&ktrace_mtx); + return (SYSCTL_OUT(req, &oldsize, sizeof(uint))); + } + + error = SYSCTL_IN(req, &wantsize, sizeof(uint)); + if (error) + return (error); + td = curthread; + td->td_inktrace = 1; + mtx_lock(&ktrace_mtx); + oldsize = ktr_requestpool; + newsize = ktrace_resize_pool(wantsize); + mtx_unlock(&ktrace_mtx); + td->td_inktrace = 0; + error = SYSCTL_OUT(req, &oldsize, sizeof(uint)); + if (error) + return (error); + if (newsize != wantsize) + return (ENOSPC); + return (0); +} +SYSCTL_PROC(_kern, OID_AUTO, ktrace_request_pool, CTLTYPE_UINT|CTLFLAG_RW, + &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", ""); + +static uint +ktrace_resize_pool(uint newsize) +{ + struct ktr_request *req; + + mtx_assert(&ktrace_mtx, MA_OWNED); + print_message = 1; + if (newsize == ktr_requestpool) + return (newsize); + if (newsize < ktr_requestpool) + /* Shrink pool down to newsize if possible. */ + while (ktr_requestpool > newsize) { + req = STAILQ_FIRST(&ktr_free); + if (req == NULL) + return (ktr_requestpool); + STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); + ktr_requestpool--; + mtx_unlock(&ktrace_mtx); + free(req, M_KTRACE); + mtx_lock(&ktrace_mtx); + } + else + /* Grow pool up to newsize. */ + while (ktr_requestpool < newsize) { + mtx_unlock(&ktrace_mtx); + req = malloc(sizeof(struct ktr_request), M_KTRACE, + M_WAITOK); + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + ktr_requestpool++; + } + return (ktr_requestpool); +} + +static struct ktr_request * +ktr_getrequest(int type) +{ + struct ktr_request *req; + struct thread *td = curthread; + struct proc *p = td->td_proc; + int pm; + + td->td_inktrace = 1; + mtx_lock(&ktrace_mtx); + if (!KTRCHECK(td, type)) { + mtx_unlock(&ktrace_mtx); + td->td_inktrace = 0; + return (NULL); + } + req = STAILQ_FIRST(&ktr_free); + if (req != NULL) { + STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); + req->ktr_header.ktr_type = type; + KASSERT(p->p_tracep != NULL, ("ktrace: no trace vnode")); + req->ktr_vp = p->p_tracep; + VREF(p->p_tracep); + mtx_unlock(&ktrace_mtx); + microtime(&req->ktr_header.ktr_time); + req->ktr_header.ktr_pid = p->p_pid; + bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1); + req->ktr_cred = crhold(td->td_ucred); + req->ktr_header.ktr_buffer = NULL; + req->ktr_header.ktr_len = 0; + req->ktr_synchronous = 0; + } else { + pm = print_message; + print_message = 0; + mtx_unlock(&ktrace_mtx); + if (pm) + printf("Out of ktrace request objects.\n"); + td->td_inktrace = 0; + } + return (req); +} + +static void +ktr_submitrequest(struct ktr_request *req) +{ + + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list); + sema_post(&ktrace_sema); + if (req->ktr_synchronous) { + /* + * For a synchronous request, we wait for the ktrace thread + * to get to our item in the todo list and wake us up. Then + * we write the request out ourselves and wake the ktrace + * thread back up. 
+ */ + msleep(req, &ktrace_mtx, curthread->td_priority, "ktrsync", 0); + mtx_unlock(&ktrace_mtx); + ktr_writerequest(req); + mtx_lock(&ktrace_mtx); + wakeup(req); + } + mtx_unlock(&ktrace_mtx); + curthread->td_inktrace = 0; +} + +static void +ktr_freerequest(struct ktr_request *req) +{ + + crfree(req->ktr_cred); + mtx_lock(&Giant); + vrele(req->ktr_vp); + mtx_unlock(&Giant); + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + mtx_unlock(&ktrace_mtx); +} + +static void +ktr_loop(void *dummy) +{ + struct ktr_request *req; + struct thread *td; + struct ucred *cred; + + /* Only cache these values once. */ + td = curthread; + cred = td->td_ucred; + for (;;) { + sema_wait(&ktrace_sema); + mtx_lock(&ktrace_mtx); + req = STAILQ_FIRST(&ktr_todo); + STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list); + KASSERT(req != NULL, ("got a NULL request")); + if (req->ktr_synchronous) { + wakeup(req); + msleep(req, &ktrace_mtx, curthread->td_priority, + "ktrwait", 0); + mtx_unlock(&ktrace_mtx); + } else { + mtx_unlock(&ktrace_mtx); + /* + * It is not enough just to pass the cached cred + * to the VOP's in ktr_writerequest(). Some VFS + * operations use curthread->td_ucred, so we need + * to modify our thread's credentials as well. + * Evil. + */ + td->td_ucred = req->ktr_cred; + ktr_writerequest(req); + td->td_ucred = cred; + } + ktr_freerequest(req); + } +} + +/* + * MPSAFE + */ +void +ktrsyscall(code, narg, args) + int code, narg; + register_t args[]; +{ + struct ktr_request *req; + struct ktr_syscall *ktp; + size_t buflen; + + req = ktr_getrequest(KTR_SYSCALL); + if (req == NULL) + return; + ktp = &req->ktr_data.ktr_syscall; + ktp->ktr_code = code; + ktp->ktr_narg = narg; + buflen = sizeof(register_t) * narg; + if (buflen > 0) { + req->ktr_header.ktr_buffer = malloc(buflen, M_KTRACE, M_WAITOK); + bcopy(args, req->ktr_header.ktr_buffer, buflen); + req->ktr_header.ktr_len = buflen; + } + ktr_submitrequest(req); +} + +/* + * MPSAFE + */ +void +ktrsysret(code, error, retval) + int code, error; + register_t retval; +{ + struct ktr_request *req; + struct ktr_sysret *ktp; + + req = ktr_getrequest(KTR_SYSRET); + if (req == NULL) + return; + ktp = &req->ktr_data.ktr_sysret; + ktp->ktr_code = code; + ktp->ktr_error = error; + ktp->ktr_retval = retval; /* what about val2 ? */ + ktr_submitrequest(req); +} + +void +ktrnamei(path) + char *path; +{ + struct ktr_request *req; + int namelen; + + req = ktr_getrequest(KTR_NAMEI); + if (req == NULL) + return; + namelen = strlen(path); + if (namelen > 0) { + req->ktr_header.ktr_len = namelen; + req->ktr_header.ktr_buffer = malloc(namelen, M_KTRACE, + M_WAITOK); + bcopy(path, req->ktr_header.ktr_buffer, namelen); + } + ktr_submitrequest(req); +} + +/* + * Since the uio may not stay valid, we can not hand off this request to + * the thread and need to process it synchronously. However, we wish to + * keep the relative order of records in a trace file correct, so we + * do put this request on the queue (if it isn't empty) and then block. + * The ktrace thread waks us back up when it is time for this event to + * be posted and blocks until we have completed writing out the event + * and woken it back up. 
+ */ +void +ktrgenio(fd, rw, uio, error) + int fd; + enum uio_rw rw; + struct uio *uio; + int error; +{ + struct ktr_request *req; + struct ktr_genio *ktg; + + if (error) + return; + req = ktr_getrequest(KTR_GENIO); + if (req == NULL) + return; + ktg = &req->ktr_data.ktr_genio; + ktg->ktr_fd = fd; + ktg->ktr_rw = rw; + req->ktr_header.ktr_buffer = uio; + uio->uio_offset = 0; + uio->uio_rw = UIO_WRITE; + req->ktr_synchronous = 1; + ktr_submitrequest(req); +} + +void +ktrpsig(sig, action, mask, code) + int sig; + sig_t action; + sigset_t *mask; + int code; +{ + struct ktr_request *req; + struct ktr_psig *kp; + + req = ktr_getrequest(KTR_PSIG); + if (req == NULL) + return; + kp = &req->ktr_data.ktr_psig; + kp->signo = (char)sig; + kp->action = action; + kp->mask = *mask; + kp->code = code; + ktr_submitrequest(req); +} + +void +ktrcsw(out, user) + int out, user; +{ + struct ktr_request *req; + struct ktr_csw *kc; + + req = ktr_getrequest(KTR_CSW); + if (req == NULL) + return; + kc = &req->ktr_data.ktr_csw; + kc->out = out; + kc->user = user; + ktr_submitrequest(req); +} +#endif + +/* Interface and common routines */ + +/* + * ktrace system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif +/* ARGSUSED */ +int +ktrace(td, uap) + struct thread *td; + register struct ktrace_args *uap; +{ +#ifdef KTRACE + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; + int ret = 0; + int flags, error = 0; + struct nameidata nd; + + td->td_inktrace = 1; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. + */ + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td); + flags = FREAD | FWRITE | O_NOFOLLOW; + error = vn_open(&nd, &flags, 0); + if (error) { + td->td_inktrace = 0; + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + VOP_UNLOCK(vp, 0, td); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + td->td_inktrace = 0; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile. + */ + if (ops == KTROP_CLEARFILE) { + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_tracep == vp) { + if (ktrcanset(td, p)) { + mtx_lock(&ktrace_mtx); + p->p_tracep = NULL; + p->p_traceflag = 0; + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + (void) vn_close(vp, FREAD|FWRITE, + td->td_ucred, td); + } else { + PROC_UNLOCK(p); + error = EPERM; + } + } else + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) + */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (uap->pid < 0) { + /* + * by process group + */ + sx_slock(&proctree_lock); + pg = pgfind(-uap->pid); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + error = ESRCH; + goto done; + } + /* + * ktrops() may call vrele(). Lock pg_members + * by the proctree_lock rather than pg_mtx. 
+ */ + PGRP_UNLOCK(pg); + LIST_FOREACH(p, &pg->pg_members, p_pglist) + if (descend) + ret |= ktrsetchildren(td, p, ops, facs, vp); + else + ret |= ktrops(td, p, ops, facs, vp); + sx_sunlock(&proctree_lock); + } else { + /* + * by pid + */ + p = pfind(uap->pid); + if (p == NULL) { + error = ESRCH; + goto done; + } + PROC_UNLOCK(p); + /* XXX: UNLOCK above has a race */ + if (descend) + ret |= ktrsetchildren(td, p, ops, facs, vp); + else + ret |= ktrops(td, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, td->td_ucred, td); + td->td_inktrace = 0; + return (error); +#else + return ENOSYS; +#endif +} + +/* + * utrace system call + */ +/* ARGSUSED */ +int +utrace(td, uap) + struct thread *td; + register struct utrace_args *uap; +{ + +#ifdef KTRACE + struct ktr_request *req; + register caddr_t cp; + + if (uap->len > KTR_USER_MAXLEN) + return (EINVAL); + req = ktr_getrequest(KTR_USER); + if (req == NULL) + return (0); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + req->ktr_header.ktr_buffer = cp; + req->ktr_header.ktr_len = uap->len; + ktr_submitrequest(req); + } else { + ktr_freerequest(req); + td->td_inktrace = 0; + } + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int +ktrops(td, p, ops, facs, vp) + struct thread *td; + struct proc *p; + int ops, facs; + struct vnode *vp; +{ + struct vnode *tracevp = NULL; + + PROC_LOCK(p); + if (!ktrcanset(td, p)) { + PROC_UNLOCK(p); + return (0); + } + mtx_lock(&ktrace_mtx); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish below + */ + tracevp = p->p_tracep; + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (td->td_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + tracevp = p->p_tracep; + p->p_tracep = NULL; + } + } + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + if (tracevp != NULL) + vrele(tracevp); + + return (1); +} + +static int +ktrsetchildren(td, top, ops, facs, vp) + struct thread *td; + struct proc *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + sx_slock(&proctree_lock); + for (;;) { + ret |= ktrops(td, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). + */ + if (!LIST_EMPTY(&p->p_children)) + p = LIST_FIRST(&p->p_children); + else for (;;) { + if (p == top) { + sx_sunlock(&proctree_lock); + return (ret); + } + if (LIST_NEXT(p, p_sibling)) { + p = LIST_NEXT(p, p_sibling); + break; + } + p = p->p_pptr; + } + } + /*NOTREACHED*/ +} + +static void +ktr_writerequest(struct ktr_request *req) +{ + struct ktr_header *kth; + struct vnode *vp; + struct uio *uio = NULL; + struct proc *p; + struct thread *td; + struct ucred *cred; + struct uio auio; + struct iovec aiov[3]; + struct mount *mp; + int datalen, buflen, vrele_count; + int error; + + vp = req->ktr_vp; + /* + * If vp is NULL, the vp has been cleared out from under this + * request, so just drop it. 
+ */ + if (vp == NULL) + return; + kth = &req->ktr_header; + datalen = data_lengths[kth->ktr_type]; + buflen = kth->ktr_len; + cred = req->ktr_cred; + td = curthread; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_td = td; + if (datalen != 0) { + aiov[1].iov_base = (caddr_t)&req->ktr_data; + aiov[1].iov_len = datalen; + auio.uio_resid += datalen; + auio.uio_iovcnt++; + kth->ktr_len += datalen; + } + if (buflen != 0) { + KASSERT(kth->ktr_buffer != NULL, ("ktrace: nothing to write")); + aiov[auio.uio_iovcnt].iov_base = kth->ktr_buffer; + aiov[auio.uio_iovcnt].iov_len = buflen; + auio.uio_resid += buflen; + auio.uio_iovcnt++; + } else + uio = kth->ktr_buffer; + KASSERT((uio == NULL) ^ (kth->ktr_type == KTR_GENIO), + ("ktrace: uio and genio mismatch")); + if (uio != NULL) + kth->ktr_len += uio->uio_resid; + mtx_lock(&Giant); + vn_start_write(vp, &mp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); + error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); + if (error == 0 && uio != NULL) { + (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); + error = VOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, cred); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + mtx_unlock(&Giant); + if (buflen != 0) + free(kth->ktr_buffer, M_KTRACE); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. We defer + * all the vrele()'s on the vnode until after we are finished walking + * the various lists to avoid needlessly holding locks. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + vrele_count = 0; + /* + * First, clear this vnode from being used by any processes in the + * system. + * XXX - If one process gets an EPERM writing to the vnode, should + * we really do this? Other processes might have suitable + * credentials for the operation. + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_tracep == vp) { + mtx_lock(&ktrace_mtx); + p->p_tracep = NULL; + p->p_traceflag = 0; + mtx_unlock(&ktrace_mtx); + vrele_count++; + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + /* + * Second, clear this vnode from any pending requests. + */ + mtx_lock(&ktrace_mtx); + STAILQ_FOREACH(req, &ktr_todo, ktr_list) { + if (req->ktr_vp == vp) { + req->ktr_vp = NULL; + vrele_count++; + } + } + mtx_unlock(&ktrace_mtx); + mtx_lock(&Giant); + while (vrele_count-- > 0) + vrele(vp); + mtx_unlock(&Giant); +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + */ +static int +ktrcanset(td, targetp) + struct thread *td; + struct proc *targetp; +{ + + PROC_LOCK_ASSERT(targetp, MA_OWNED); + if (targetp->p_traceflag & KTRFAC_ROOT && + suser_cred(td->td_ucred, PRISON_ROOT)) + return (0); + + if (p_candebug(td, targetp) != 0) + return (0); + + return (1); +} + +#endif /* KTRACE */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c new file mode 100644 index 0000000..a506726 --- /dev/null +++ b/sys/kern/kern_linker.c @@ -0,0 +1,1812 @@ +/*- + * Copyright (c) 1997-2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/sysctl.h> + +#include "linker_if.h" + +#ifdef KLD_DEBUG +int kld_debug = 0; +#endif + +/* + * static char *linker_search_path(const char *name, struct mod_depend + * *verinfo); + */ +static const char *linker_basename(const char *path); +static int linker_load_module(const char *kldname, const char *modname, + struct linker_file *parent, struct mod_depend *verinfo, + struct linker_file **lfpp); + +/* Metadata from the static kernel */ +SET_DECLARE(modmetadata_set, struct mod_metadata); + +MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); + +linker_file_t linker_kernel_file; + +static struct mtx kld_mtx; /* kernel linker mutex */ + +static linker_class_list_t classes; +static linker_file_list_t linker_files; +static int next_file_id = 1; +static int linker_no_more_classes = 0; + +#define LINKER_GET_NEXT_FILE_ID(a) do { \ + linker_file_t lftmp; \ + \ +retry: \ + mtx_lock(&kld_mtx); \ + TAILQ_FOREACH(lftmp, &linker_files, link) { \ + if (next_file_id == lftmp->id) { \ + next_file_id++; \ + mtx_unlock(&kld_mtx); \ + goto retry; \ + } \ + } \ + (a) = next_file_id; \ + mtx_unlock(&kld_mtx); /* Hold for safe read of id variable */ \ +} while(0) + + +/* XXX wrong name; we're looking at version provision tags here, not modules */ +typedef TAILQ_HEAD(, modlist) modlisthead_t; +struct modlist { + TAILQ_ENTRY(modlist) link; /* chain together all modules */ + linker_file_t container; + const char *name; + int version; +}; +typedef struct modlist *modlist_t; +static modlisthead_t found_modules; + +static modlist_t modlist_lookup2(const char *name, + struct mod_depend *verinfo); + +static char * +linker_strdup(const char *str) +{ + char *result; + + if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL) + strcpy(result, str); + return (result); +} + +static void 
+linker_init(void *arg) +{ + + mtx_init(&kld_mtx, "kernel linker", NULL, MTX_DEF); + TAILQ_INIT(&classes); + TAILQ_INIT(&linker_files); +} + +SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0) + +static void +linker_stop_class_add(void *arg) +{ + + linker_no_more_classes = 1; +} + +SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL) + +int +linker_add_class(linker_class_t lc) +{ + + /* + * We disallow any class registration passt SI_ORDER_ANY + * of SI_SUB_KLD. + */ + if (linker_no_more_classes == 1) + return (EPERM); + kobj_class_compile((kobj_class_t) lc); + TAILQ_INSERT_TAIL(&classes, lc, link); + return (0); +} + +static void +linker_file_sysinit(linker_file_t lf) +{ + struct sysinit **start, **stop, **sipp, **xipp, *save; + + KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) + return; + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the operation + * which ensures continued function. + */ + for (sipp = start; sipp < stop; sipp++) { + for (xipp = sipp + 1; xipp < stop; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip */ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + */ + for (sipp = start; sipp < stop; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s) */ + + /* Call function */ + (*((*sipp)->func)) ((*sipp)->udata); + } +} + +static void +linker_file_sysuninit(linker_file_t lf) +{ + struct sysinit **start, **stop, **sipp, **xipp, *save; + + KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, + NULL) != 0) + return; + + /* + * Perform a reverse bubble sort of the system initialization objects + * by their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the operation + * which ensures continued function. + */ + for (sipp = start; sipp < stop; sipp++) { + for (xipp = sipp + 1; xipp < stop; xipp++) { + if ((*sipp)->subsystem > (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order >= (*xipp)->order)) + continue; /* skip */ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. 
+ */ + for (sipp = start; sipp < stop; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s) */ + + /* Call function */ + (*((*sipp)->func)) ((*sipp)->udata); + } +} + +static void +linker_file_register_sysctls(linker_file_t lf) +{ + struct sysctl_oid **start, **stop, **oidp; + + KLD_DPF(FILE, + ("linker_file_register_sysctls: registering SYSCTLs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) + return; + + for (oidp = start; oidp < stop; oidp++) + sysctl_register_oid(*oidp); +} + +static void +linker_file_unregister_sysctls(linker_file_t lf) +{ + struct sysctl_oid **start, **stop, **oidp; + + KLD_DPF(FILE, ("linker_file_unregister_sysctls: registering SYSCTLs" + " for %s\n", lf->filename)); + + if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) + return; + + for (oidp = start; oidp < stop; oidp++) + sysctl_unregister_oid(*oidp); +} + +static int +linker_file_register_modules(linker_file_t lf) +{ + struct mod_metadata **start, **stop, **mdp; + const moduledata_t *moddata; + int error; + + KLD_DPF(FILE, ("linker_file_register_modules: registering modules" + " in %s\n", lf->filename)); + + if (linker_file_lookup_set(lf, "modmetadata_set", &start, + &stop, 0) != 0) { + /* + * This fallback should be unnecessary, but if we get booted + * from boot2 instead of loader and we are missing our + * metadata then we have to try the best we can. + */ + if (lf == linker_kernel_file) { + start = SET_BEGIN(modmetadata_set); + stop = SET_LIMIT(modmetadata_set); + } else + return (0); + } + for (mdp = start; mdp < stop; mdp++) { + if ((*mdp)->md_type != MDT_MODULE) + continue; + moddata = (*mdp)->md_data; + KLD_DPF(FILE, ("Registering module %s in %s\n", + moddata->name, lf->filename)); + error = module_register(moddata, lf); + if (error) + printf("Module %s failed to register: %d\n", + moddata->name, error); + } + return (0); +} + +static void +linker_init_kernel_modules(void) +{ + + linker_file_register_modules(linker_kernel_file); +} + +SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0) + +int +linker_load_file(const char *filename, linker_file_t *result) +{ + linker_class_t lc; + linker_file_t lf; + int foundfile, error = 0; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) + return (EPERM); + + lf = linker_find_file_by_name(filename); + if (lf) { + KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," + " incrementing refs\n", filename)); + *result = lf; + lf->refs++; + goto out; + } + lf = NULL; + foundfile = 0; + + /* + * We do not need to protect (lock) classes here because there is + * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) + * and there is no class deregistration mechanism at this time. + */ + TAILQ_FOREACH(lc, &classes, link) { + KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", + filename)); + error = LINKER_LOAD_FILE(lc, filename, &lf); + /* + * If we got something other than ENOENT, then it exists but + * we cannot load it for some other reason. + */ + if (error != ENOENT) + foundfile = 1; + if (lf) { + linker_file_register_modules(lf); + linker_file_register_sysctls(lf); + linker_file_sysinit(lf); + lf->flags |= LINKER_FILE_LINKED; + *result = lf; + error = 0; + goto out; + } + } + /* + * Less than ideal, but tells the user whether it failed to load or + * the module was not found. + */ + if (foundfile) + /* Format not recognized (or unloadable). 
*/ + error = ENOEXEC; + else + error = ENOENT; /* Nothing found */ +out: + return (error); +} + +int +linker_reference_module(const char *modname, struct mod_depend *verinfo, + linker_file_t *result) +{ + modlist_t mod; + + if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { + *result = mod->container; + (*result)->refs++; + return (0); + } + + return (linker_load_module(NULL, modname, NULL, verinfo, result)); +} + +linker_file_t +linker_find_file_by_name(const char *filename) +{ + linker_file_t lf = 0; + char *koname; + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) + goto out; + sprintf(koname, "%s.ko", filename); + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + if (strcmp(lf->filename, koname) == 0) + break; + if (strcmp(lf->filename, filename) == 0) + break; + } + mtx_unlock(&kld_mtx); +out: + if (koname) + free(koname, M_LINKER); + return (lf); +} + +linker_file_t +linker_find_file_by_id(int fileid) +{ + linker_file_t lf = 0; + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) + if (lf->id == fileid) + break; + mtx_unlock(&kld_mtx); + return (lf); +} + +linker_file_t +linker_make_file(const char *pathname, linker_class_t lc) +{ + linker_file_t lf; + const char *filename; + + lf = NULL; + filename = linker_basename(pathname); + + KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename)); + lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); + if (lf == NULL) + goto out; + lf->refs = 1; + lf->userrefs = 0; + lf->flags = 0; + lf->filename = linker_strdup(filename); + LINKER_GET_NEXT_FILE_ID(lf->id); + lf->ndeps = 0; + lf->deps = NULL; + STAILQ_INIT(&lf->common); + TAILQ_INIT(&lf->modules); + mtx_lock(&kld_mtx); + TAILQ_INSERT_TAIL(&linker_files, lf, link); + mtx_unlock(&kld_mtx); +out: + return (lf); +} + +int +linker_file_unload(linker_file_t file) +{ + module_t mod, next; + modlist_t ml, nextml; + struct common_symbol *cp; + int error, i; + + error = 0; + + /* Refuse to unload modules if securelevel raised. */ + if (securelevel > 0) + return (EPERM); + + KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); + if (file->refs == 1) { + KLD_DPF(FILE, ("linker_file_unload: file is unloading," + " informing modules\n")); + + /* + * Inform any modules associated with this file. + */ + MOD_XLOCK; + for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { + next = module_getfnext(mod); + MOD_XUNLOCK; + + /* + * Give the module a chance to veto the unload. + */ + if ((error = module_unload(mod)) != 0) { + KLD_DPF(FILE, ("linker_file_unload: module %x" + " vetoes unload\n", mod)); + goto out; + } else + MOD_XLOCK; + module_release(mod); + } + MOD_XUNLOCK; + } + file->refs--; + if (file->refs > 0) { + goto out; + } + for (ml = TAILQ_FIRST(&found_modules); ml; ml = nextml) { + nextml = TAILQ_NEXT(ml, link); + if (ml->container == file) + TAILQ_REMOVE(&found_modules, ml, link); + } + + /* + * Don't try to run SYSUNINITs if we are unloaded due to a + * link error. 
+ */ + if (file->flags & LINKER_FILE_LINKED) { + linker_file_sysuninit(file); + linker_file_unregister_sysctls(file); + } + mtx_lock(&kld_mtx); + TAILQ_REMOVE(&linker_files, file, link); + mtx_unlock(&kld_mtx); + + if (file->deps) { + for (i = 0; i < file->ndeps; i++) + linker_file_unload(file->deps[i]); + free(file->deps, M_LINKER); + file->deps = NULL; + } + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_FIRST(&file->common)) { + STAILQ_REMOVE(&file->common, cp, common_symbol, link); + free(cp, M_LINKER); + } + + LINKER_UNLOAD(file); + if (file->filename) { + free(file->filename, M_LINKER); + file->filename = NULL; + } + kobj_delete((kobj_t) file, M_LINKER); +out: + return (error); +} + +int +linker_file_add_dependency(linker_file_t file, linker_file_t dep) +{ + linker_file_t *newdeps; + + newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *), + M_LINKER, M_WAITOK | M_ZERO); + if (newdeps == NULL) + return (ENOMEM); + + if (file->deps) { + bcopy(file->deps, newdeps, + file->ndeps * sizeof(linker_file_t *)); + free(file->deps, M_LINKER); + } + file->deps = newdeps; + file->deps[file->ndeps] = dep; + file->ndeps++; + return (0); +} + +/* + * Locate a linker set and its contents. This is a helper function to avoid + * linker_if.h exposure elsewhere. Note: firstp and lastp are really void *** + */ +int +linker_file_lookup_set(linker_file_t file, const char *name, + void *firstp, void *lastp, int *countp) +{ + + return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); +} + +caddr_t +linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) +{ + c_linker_sym_t sym; + linker_symval_t symval; + caddr_t address; + size_t common_size = 0; + int i; + + KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n", + file, name, deps)); + + if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { + LINKER_SYMBOL_VALUES(file, sym, &symval); + if (symval.value == 0) + /* + * For commons, first look them up in the + * dependencies and only allocate space if not found + * there. + */ + common_size = symval.size; + else { + KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" + ".value=%x\n", symval.value)); + return (symval.value); + } + } + if (deps) { + for (i = 0; i < file->ndeps; i++) { + address = linker_file_lookup_symbol(file->deps[i], + name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol:" + " deps value=%x\n", address)); + return (address); + } + } + } + if (common_size > 0) { + /* + * This is a common symbol which was not found in the + * dependencies. We maintain a simple common symbol table in + * the file object. + */ + struct common_symbol *cp; + + STAILQ_FOREACH(cp, &file->common, link) { + if (strcmp(cp->name, name) == 0) { + KLD_DPF(SYM, ("linker_file_lookup_symbol:" + " old common value=%x\n", cp->address)); + return (cp->address); + } + } + /* + * Round the symbol size up to align. 
+ */ + common_size = (common_size + sizeof(int) - 1) & -sizeof(int); + cp = malloc(sizeof(struct common_symbol) + + common_size + strlen(name) + 1, M_LINKER, + M_WAITOK | M_ZERO); + if (cp == NULL) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n")); + return (0); + } + cp->address = (caddr_t)(cp + 1); + cp->name = cp->address + common_size; + strcpy(cp->name, name); + bzero(cp->address, common_size); + STAILQ_INSERT_TAIL(&file->common, cp, link); + + KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" + " value=%x\n", cp->address)); + return (cp->address); + } + KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); + return (0); +} + +#ifdef DDB +/* + * DDB Helpers. DDB has to look across multiple files with their own symbol + * tables and string tables. + * + * Note that we do not obey list locking protocols here. We really don't need + * DDB to hang because somebody's got the lock held. We'll take the chance + * that the files list is inconsistant instead. + */ + +int +linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) +{ + linker_file_t lf; + + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) + return (0); + } + return (ENOENT); +} + +int +linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) +{ + linker_file_t lf; + c_linker_sym_t best, es; + u_long diff, bestdiff, off; + + best = 0; + off = (uintptr_t)value; + bestdiff = off; + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) + continue; + if (es != 0 && diff < bestdiff) { + best = es; + bestdiff = diff; + } + if (bestdiff == 0) + break; + } + if (best) { + *sym = best; + *diffp = bestdiff; + return (0); + } else { + *sym = 0; + *diffp = off; + return (ENOENT); + } +} + +int +linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) +{ + linker_file_t lf; + + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) + return (0); + } + return (ENOENT); +} +#endif + +/* + * Syscalls. + */ +/* + * MPSAFE + */ +int +kldload(struct thread *td, struct kldload_args *uap) +{ + char *kldname, *modname; + char *pathname = NULL; + linker_file_t lf; + int error = 0; + + td->td_retval[0] = -1; + + mtx_lock(&Giant); + + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + goto out; + + if ((error = suser(td)) != 0) + goto out; + + pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN, + NULL)) != 0) + goto out; + + /* + * If path do not contain qualified name or any dot in it + * (kldname.ko, or kldname.ver.ko) treat it as interface + * name. 
+ */ + if (index(pathname, '/') || index(pathname, '.')) { + kldname = pathname; + modname = NULL; + } else { + kldname = NULL; + modname = pathname; + } + error = linker_load_module(kldname, modname, NULL, NULL, &lf); + if (error) + goto out; + + lf->userrefs++; + td->td_retval[0] = lf->id; +out: + if (pathname) + free(pathname, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldunload(struct thread *td, struct kldunload_args *uap) +{ + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + goto out; + + if ((error = suser(td)) != 0) + goto out; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); + if (lf->userrefs == 0) { + printf("kldunload: attempt to unload file that was" + " loaded by the kernel\n"); + error = EBUSY; + goto out; + } + lf->userrefs--; + error = linker_file_unload(lf); + if (error) + lf->userrefs++; + } else + error = ENOENT; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldfind(struct thread *td, struct kldfind_args *uap) +{ + char *pathname; + const char *filename; + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + td->td_retval[0] = -1; + + pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN, + NULL)) != 0) + goto out; + + filename = linker_basename(pathname); + lf = linker_find_file_by_name(filename); + if (lf) + td->td_retval[0] = lf->id; + else + error = ENOENT; +out: + if (pathname) + free(pathname, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldnext(struct thread *td, struct kldnext_args *uap) +{ + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + + if (SCARG(uap, fileid) == 0) { + mtx_lock(&kld_mtx); + if (TAILQ_FIRST(&linker_files)) + td->td_retval[0] = TAILQ_FIRST(&linker_files)->id; + else + td->td_retval[0] = 0; + mtx_unlock(&kld_mtx); + goto out; + } + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_NEXT(lf, link)) + td->td_retval[0] = TAILQ_NEXT(lf, link)->id; + else + td->td_retval[0] = 0; + } else + error = ENOENT; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldstat(struct thread *td, struct kldstat_args *uap) +{ + linker_file_t lf; + int error = 0; + int namelen, version; + struct kld_file_stat *stat; + + mtx_lock(&Giant); + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
+ */ + if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) + goto out; + if (version != sizeof(struct kld_file_stat)) { + error = EINVAL; + goto out; + } + namelen = strlen(lf->filename) + 1; + if (namelen > MAXPATHLEN) + namelen = MAXPATHLEN; + if ((error = copyout(lf->filename, &stat->name[0], namelen)) != 0) + goto out; + if ((error = copyout(&lf->refs, &stat->refs, sizeof(int))) != 0) + goto out; + if ((error = copyout(&lf->id, &stat->id, sizeof(int))) != 0) + goto out; + if ((error = copyout(&lf->address, &stat->address, + sizeof(caddr_t))) != 0) + goto out; + if ((error = copyout(&lf->size, &stat->size, sizeof(size_t))) != 0) + goto out; + + td->td_retval[0] = 0; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) +{ + linker_file_t lf; + module_t mp; + int error = 0; + + mtx_lock(&Giant); + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + MOD_SLOCK; + mp = TAILQ_FIRST(&lf->modules); + if (mp != NULL) + td->td_retval[0] = module_getid(mp); + else + td->td_retval[0] = 0; + MOD_SUNLOCK; + } else + error = ENOENT; + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldsym(struct thread *td, struct kldsym_args *uap) +{ + char *symstr = NULL; + c_linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + struct kld_sym_lookup lookup; + int error = 0; + + mtx_lock(&Giant); + + if ((error = copyin(SCARG(uap, data), &lookup, sizeof(lookup))) != 0) + goto out; + if (lookup.version != sizeof(lookup) || + SCARG(uap, cmd) != KLDSYM_LOOKUP) { + error = EINVAL; + goto out; + } + symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) + goto out; + if (SCARG(uap, fileid) != 0) { + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && + LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { + lookup.symvalue = (uintptr_t) symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), + sizeof(lookup)); + } else + error = ENOENT; + } else { + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && + LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { + lookup.symvalue = (uintptr_t)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), + sizeof(lookup)); + break; + } + } + mtx_unlock(&kld_mtx); + if (lf == NULL) + error = ENOENT; + } +out: + if (symstr) + free(symstr, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * Preloaded module support + */ + +static modlist_t +modlist_lookup(const char *name, int ver) +{ + modlist_t mod; + + TAILQ_FOREACH(mod, &found_modules, link) { + if (strcmp(mod->name, name) == 0 && + (ver == 0 || mod->version == ver)) + return (mod); + } + return (NULL); +} + +static modlist_t +modlist_lookup2(const char *name, struct mod_depend *verinfo) +{ + modlist_t mod, bestmod; + int ver; + + if (verinfo == NULL) + return (modlist_lookup(name, 0)); + bestmod = NULL; + for (mod = TAILQ_FIRST(&found_modules); mod; + mod = TAILQ_NEXT(mod, link)) { + if (strcmp(mod->name, name) != 0) + continue; + ver = mod->version; + if (ver == verinfo->md_ver_preferred) + return (mod); + if (ver >= verinfo->md_ver_minimum && + ver <= verinfo->md_ver_maximum && + ver > bestmod->version) + bestmod = mod; + } + return (bestmod); +} + +static modlist_t +modlist_newmodule(const 
char *modname, int version, linker_file_t container) +{ + modlist_t mod; + + mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); + if (mod == NULL) + panic("no memory for module list"); + mod->container = container; + mod->name = modname; + mod->version = version; + TAILQ_INSERT_TAIL(&found_modules, mod, link); + return (mod); +} + +/* + * This routine is cheap and nasty but will work for data pointers. + */ +static void * +linker_reloc_ptr(linker_file_t lf, const void *offset) +{ + return (lf->address + (uintptr_t)offset); +} + +/* + * Dereference MDT_VERSION metadata into module name and version + */ +static void +linker_mdt_version(linker_file_t lf, struct mod_metadata *mp, + const char **modname, int *version) +{ + struct mod_version *mvp; + + if (modname) + *modname = linker_reloc_ptr(lf, mp->md_cval); + if (version) { + mvp = linker_reloc_ptr(lf, mp->md_data); + *version = mvp->mv_version; + } +} + +/* + * Dereference MDT_DEPEND metadata into module name and mod_depend structure + */ +static void +linker_mdt_depend(linker_file_t lf, struct mod_metadata *mp, + const char **modname, struct mod_depend **verinfo) +{ + + if (modname) + *modname = linker_reloc_ptr(lf, mp->md_cval); + if (verinfo) + *verinfo = linker_reloc_ptr(lf, mp->md_data); +} + +static void +linker_addmodules(linker_file_t lf, struct mod_metadata **start, + struct mod_metadata **stop, int preload) +{ + struct mod_metadata *mp, **mdp; + const char *modname; + int ver; + + for (mdp = start; mdp < stop; mdp++) { + if (preload) + mp = *mdp; + else + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + if (preload) { + modname = mp->md_cval; + ver = ((struct mod_version *)mp->md_data)->mv_version; + } else + linker_mdt_version(lf, mp, &modname, &ver); + if (modlist_lookup(modname, ver) != NULL) { + printf("module %s already present!\n", modname); + /* XXX what can we do? this is a build error. :-( */ + continue; + } + modlist_newmodule(modname, ver, lf); + } +} + +static void +linker_preload(void *arg) +{ + caddr_t modptr; + const char *modname, *nmodname; + char *modtype; + linker_file_t lf; + linker_class_t lc; + int error; + linker_file_list_t loaded_files; + linker_file_list_t depended_files; + struct mod_metadata *mp, *nmp; + struct mod_metadata **start, **stop, **mdp, **nmdp; + struct mod_depend *verinfo; + int nver; + int resolves; + modlist_t mod; + struct sysinit **si_start, **si_stop; + + TAILQ_INIT(&loaded_files); + TAILQ_INIT(&depended_files); + TAILQ_INIT(&found_modules); + error = 0; + + modptr = NULL; + while ((modptr = preload_search_next_name(modptr)) != NULL) { + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); + if (modname == NULL) { + printf("Preloaded module at %p does not have a" + " name!\n", modptr); + continue; + } + if (modtype == NULL) { + printf("Preloaded module at %p does not have a type!\n", + modptr); + continue; + } + printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, + modptr); + lf = NULL; + TAILQ_FOREACH(lc, &classes, link) { + error = LINKER_LINK_PRELOAD(lc, modname, &lf); + if (error) { + lf = NULL; + break; + } + } + if (lf) + TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); + } + + /* + * First get a list of stuff in the kernel. 
+ */ + if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, + &stop, NULL) == 0) + linker_addmodules(linker_kernel_file, start, stop, 1); + + /* + * this is a once-off kinky bubble sort resolve relocation dependency + * requirements + */ +restart: + TAILQ_FOREACH(lf, &loaded_files, loaded) { + error = linker_file_lookup_set(lf, MDT_SETNAME, &start, + &stop, NULL); + /* + * First, look to see if we would successfully link with this + * stuff. + */ + resolves = 1; /* unless we know otherwise */ + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + for (nmdp = start; nmdp < stop; nmdp++) { + nmp = linker_reloc_ptr(lf, *nmdp); + if (nmp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, nmp, &nmodname, + NULL); + nmodname = linker_reloc_ptr(lf, + nmp->md_cval); + if (strcmp(modname, nmodname) == 0) + break; + } + if (nmdp < stop) /* it's a self reference */ + continue; + + /* + * ok, the module isn't here yet, we + * are not finished + */ + if (modlist_lookup2(modname, verinfo) == NULL) + resolves = 0; + } + } + /* + * OK, if we found our modules, we can link. So, "provide" + * the modules inside and add it to the end of the link order + * list. + */ + if (resolves) { + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, mp, + &modname, &nver); + if (modlist_lookup(modname, + nver) != NULL) { + printf("module %s already" + " present!\n", modname); + linker_file_unload(lf); + TAILQ_REMOVE(&loaded_files, + lf, loaded); + /* we changed tailq next ptr */ + goto restart; + } + modlist_newmodule(modname, nver, lf); + } + } + TAILQ_REMOVE(&loaded_files, lf, loaded); + TAILQ_INSERT_TAIL(&depended_files, lf, loaded); + /* + * Since we provided modules, we need to restart the + * sort so that the previous files that depend on us + * have a chance. Also, we've busted the tailq next + * pointer with the REMOVE. + */ + goto restart; + } + } + + /* + * At this point, we check to see what could not be resolved.. + */ + TAILQ_FOREACH(lf, &loaded_files, loaded) { + printf("KLD file %s is missing dependencies\n", lf->filename); + linker_file_unload(lf); + TAILQ_REMOVE(&loaded_files, lf, loaded); + } + + /* + * We made it. Finish off the linking in the order we determined. 
+ */ + TAILQ_FOREACH(lf, &depended_files, loaded) { + if (linker_kernel_file) { + linker_kernel_file->refs++; + error = linker_file_add_dependency(lf, + linker_kernel_file); + if (error) + panic("cannot add dependency"); + } + lf->userrefs++; /* so we can (try to) kldunload it */ + error = linker_file_lookup_set(lf, MDT_SETNAME, &start, + &stop, NULL); + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + mod = modlist_lookup2(modname, verinfo); + mod->container->refs++; + error = linker_file_add_dependency(lf, + mod->container); + if (error) + panic("cannot add dependency"); + } + } + /* + * Now do relocation etc using the symbol search paths + * established by the dependencies + */ + error = LINKER_LINK_PRELOAD_FINISH(lf); + if (error) { + printf("KLD file %s - could not finalize loading\n", + lf->filename); + linker_file_unload(lf); + continue; + } + linker_file_register_modules(lf); + if (linker_file_lookup_set(lf, "sysinit_set", &si_start, + &si_stop, NULL) == 0) + sysinit_add(si_start, si_stop); + linker_file_register_sysctls(lf); + lf->flags |= LINKER_FILE_LINKED; + } + /* woohoo! we made it! */ +} + +SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0) + +/* + * Search for a not-loaded module by name. + * + * Modules may be found in the following locations: + * + * - preloaded (result is just the module name) - on disk (result is full path + * to module) + * + * If the module name is qualified in any way (contains path, etc.) the we + * simply return a copy of it. + * + * The search path can be manipulated via sysctl. Note that we use the ';' + * character as a separator to be consistent with the bootloader. + */ + +static char linker_hintfile[] = "linker.hints"; +static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules;/modules"; + +SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path, + sizeof(linker_path), "module load search path"); + +TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); + +static char *linker_ext_list[] = { + "", + ".ko", + NULL +}; + +/* + * Check if file actually exists either with or without extension listed in + * the linker_ext_list. (probably should be generic for the rest of the + * kernel) + */ +static char * +linker_lookup_file(const char *path, int pathlen, const char *name, + int namelen, struct vattr *vap) +{ + struct nameidata nd; + struct thread *td = curthread; /* XXX */ + char *result, **cpp, *sep; + int error, len, extlen, reclen, flags; + enum vtype type; + + extlen = 0; + for (cpp = linker_ext_list; *cpp; cpp++) { + len = strlen(*cpp); + if (len > extlen) + extlen = len; + } + extlen++; /* trailing '\0' */ + sep = (path[pathlen - 1] != '/') ? "/" : ""; + + reclen = pathlen + strlen(sep) + namelen + extlen + 1; + result = malloc(reclen, M_LINKER, M_WAITOK); + for (cpp = linker_ext_list; *cpp; cpp++) { + snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, + namelen, name, *cpp); + /* + * Attempt to open the file, and return the path if + * we succeed and it's a regular file. 
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + type = nd.ni_vp->v_type; + if (vap) + VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + if (type == VREG) + return (result); + } + } + free(result, M_LINKER); + return (NULL); +} + +#define INT_ALIGN(base, ptr) ptr = \ + (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1)) + +/* + * Lookup KLD which contains requested module in the "linker.hints" file. If + * version specification is available, then try to find the best KLD. + * Otherwise just find the latest one. + * + * XXX: Vnode locking here is hosed; lock should be held for calls to + * VOP_GETATTR() and vn_rdwr(). + */ +static char * +linker_hints_lookup(const char *path, int pathlen, const char *modname, + int modnamelen, struct mod_depend *verinfo) +{ + struct thread *td = curthread; /* XXX */ + struct ucred *cred = td ? td->td_ucred : NULL; + struct nameidata nd; + struct vattr vattr, mattr; + u_char *hints = NULL; + u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; + int error, ival, bestver, *intp, reclen, found, flags, clen, blen; + + result = NULL; + bestver = found = 0; + + sep = (path[pathlen - 1] != '/') ? "/" : ""; + reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + + strlen(sep) + 1; + pathbuf = malloc(reclen, M_LINKER, M_WAITOK); + snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, + linker_hintfile); + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type != VREG) + goto bad; + best = cp = NULL; + error = VOP_GETATTR(nd.ni_vp, &vattr, cred, td); + if (error) + goto bad; + /* + * XXX: we need to limit this number to some reasonable value + */ + if (vattr.va_size > 100 * 1024) { + printf("hints file too large %ld\n", (long)vattr.va_size); + goto bad; + } + hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); + if (hints == NULL) + goto bad; + error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, + UIO_SYSSPACE, IO_NODELOCKED, cred, &reclen, td); + if (error) + goto bad; + vn_close(nd.ni_vp, FREAD, cred, td); + nd.ni_vp = NULL; + if (reclen != 0) { + printf("can't read %d\n", reclen); + goto bad; + } + intp = (int *)hints; + ival = *intp++; + if (ival != LINKER_HINTS_VERSION) { + printf("hints file version mismatch %d\n", ival); + goto bad; + } + bufend = hints + vattr.va_size; + recptr = (u_char *)intp; + clen = blen = 0; + while (recptr < bufend && !found) { + intp = (int *)recptr; + reclen = *intp++; + ival = *intp++; + cp = (char *)intp; + switch (ival) { + case MDT_VERSION: + clen = *cp++; + if (clen != modnamelen || bcmp(cp, modname, clen) != 0) + break; + cp += clen; + INT_ALIGN(hints, cp); + ival = *(int *)cp; + cp += sizeof(int); + clen = *cp++; + if (verinfo == NULL || + ival == verinfo->md_ver_preferred) { + found = 1; + break; + } + if (ival >= verinfo->md_ver_minimum && + ival <= verinfo->md_ver_maximum && + ival > bestver) { + bestver = ival; + best = cp; + blen = clen; + } + break; + default: + break; + } + recptr += reclen + sizeof(int); + } + /* + * Finally check if KLD is in the place + */ + if (found) + result = linker_lookup_file(path, pathlen, cp, clen, &mattr); + else if (best) + result = linker_lookup_file(path, pathlen, best, blen, 
&mattr); + + /* + * KLD is newer than hints file. What we should do now? + */ + if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) + printf("warning: KLD '%s' is newer than the linker.hints" + " file\n", result); +bad: + if (hints) + free(hints, M_TEMP); + if (nd.ni_vp != NULL) + vn_close(nd.ni_vp, FREAD, cred, td); + /* + * If nothing found or hints is absent - fallback to the old + * way by using "kldname[.ko]" as module name. + */ + if (!found && !bestver && result == NULL) + result = linker_lookup_file(path, pathlen, modname, + modnamelen, NULL); + return (result); +} + +/* + * Lookup KLD which contains requested module in the all directories. + */ +static char * +linker_search_module(const char *modname, int modnamelen, + struct mod_depend *verinfo) +{ + char *cp, *ep, *result; + + /* + * traverse the linker path + */ + for (cp = linker_path; *cp; cp = ep + 1) { + /* find the end of this component */ + for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); + result = linker_hints_lookup(cp, ep - cp, modname, + modnamelen, verinfo); + if (result != NULL) + return (result); + if (*ep == 0) + break; + } + return (NULL); +} + +/* + * Search for module in all directories listed in the linker_path. + */ +static char * +linker_search_kld(const char *name) +{ + char *cp, *ep, *result, **cpp; + int extlen, len; + + /* qualified at all? */ + if (index(name, '/')) + return (linker_strdup(name)); + + extlen = 0; + for (cpp = linker_ext_list; *cpp; cpp++) { + len = strlen(*cpp); + if (len > extlen) + extlen = len; + } + extlen++; /* trailing '\0' */ + + /* traverse the linker path */ + len = strlen(name); + for (ep = linker_path; *ep; ep++) { + cp = ep; + /* find the end of this component */ + for (; *ep != 0 && *ep != ';'; ep++); + result = linker_lookup_file(cp, ep - cp, name, len, NULL); + if (result != NULL) + return (result); + } + return (NULL); +} + +static const char * +linker_basename(const char *path) +{ + const char *filename; + + filename = rindex(path, '/'); + if (filename == NULL) + return path; + if (filename[1]) + filename++; + return (filename); +} + +/* + * Find a file which contains given module and load it, if "parent" is not + * NULL, register a reference to it. + */ +static int +linker_load_module(const char *kldname, const char *modname, + struct linker_file *parent, struct mod_depend *verinfo, + struct linker_file **lfpp) +{ + linker_file_t lfdep; + const char *filename; + char *pathname; + int error; + + if (modname == NULL) { + /* + * We have to load KLD + */ + KASSERT(verinfo == NULL, ("linker_load_module: verinfo" + " is not NULL")); + pathname = linker_search_kld(kldname); + } else { + if (modlist_lookup2(modname, verinfo) != NULL) + return (EEXIST); + if (kldname != NULL) + pathname = linker_strdup(kldname); + else if (rootvnode == NULL) + pathname = NULL; + else + /* + * Need to find a KLD with required module + */ + pathname = linker_search_module(modname, + strlen(modname), verinfo); + } + if (pathname == NULL) + return (ENOENT); + + /* + * Can't load more than one file with the same basename XXX: + * Actually it should be possible to have multiple KLDs with + * the same basename but different path because they can + * provide different versions of the same modules. 
+ */ + filename = linker_basename(pathname); + if (linker_find_file_by_name(filename)) { + error = EEXIST; + goto out; + } + do { + error = linker_load_file(pathname, &lfdep); + if (error) + break; + if (modname && verinfo && + modlist_lookup2(modname, verinfo) == NULL) { + linker_file_unload(lfdep); + error = ENOENT; + break; + } + if (parent) { + error = linker_file_add_dependency(parent, lfdep); + if (error) + break; + } + if (lfpp) + *lfpp = lfdep; + } while (0); +out: + if (pathname) + free(pathname, M_LINKER); + return (error); +} + +/* + * This routine is responsible for finding dependencies of userland-initiated + * kldload(2)'s of files. + */ +int +linker_load_dependencies(linker_file_t lf) +{ + linker_file_t lfdep; + struct mod_metadata **start, **stop, **mdp, **nmdp; + struct mod_metadata *mp, *nmp; + struct mod_depend *verinfo; + modlist_t mod; + const char *modname, *nmodname; + int ver, error = 0, count; + + /* + * All files are dependent on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + error = linker_file_add_dependency(lf, linker_kernel_file); + if (error) + return (error); + } + if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, + &count) != 0) + return (0); + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, mp, &modname, &ver); + mod = modlist_lookup(modname, ver); + if (mod != NULL) { + printf("interface %s.%d already present in the KLD" + " '%s'!\n", modname, ver, + mod->container->filename); + return (EEXIST); + } + } + + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + nmodname = NULL; + for (nmdp = start; nmdp < stop; nmdp++) { + nmp = linker_reloc_ptr(lf, *nmdp); + if (nmp->md_type != MDT_VERSION) + continue; + nmodname = linker_reloc_ptr(lf, nmp->md_cval); + if (strcmp(modname, nmodname) == 0) + break; + } + if (nmdp < stop)/* early exit, it's a self reference */ + continue; + mod = modlist_lookup2(modname, verinfo); + if (mod) { /* woohoo, it's loaded already */ + lfdep = mod->container; + lfdep->refs++; + error = linker_file_add_dependency(lf, lfdep); + if (error) + break; + continue; + } + error = linker_load_module(NULL, modname, lf, verinfo, NULL); + if (error) { + printf("KLD %s: depends on %s - not available\n", + lf->filename, modname); + break; + } + } + + if (error) + return (error); + linker_addmodules(lf, start, stop, 0); + return (error); +} + +static int +sysctl_kern_function_list_iterate(const char *name, void *opaque) +{ + struct sysctl_req *req; + + req = opaque; + return (SYSCTL_OUT(req, name, strlen(name) + 1)); +} + +/* + * Export a nul-separated, double-nul-terminated list of all function names + * in the kernel.
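+ * + * As a usage sketch (not part of this file): from userland the list exported + * via the kern.function_list sysctl defined below could be read with, e.g., + * sysctlbyname("kern.function_list", buf, &buflen, NULL, 0), where buf and + * buflen are caller-supplied.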
+ */ +static int +sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) +{ + linker_file_t lf; + int error; + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + error = LINKER_EACH_FUNCTION_NAME(lf, + sysctl_kern_function_list_iterate, req); + if (error) { + mtx_unlock(&kld_mtx); + return (error); + } + } + mtx_unlock(&kld_mtx); + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLFLAG_RD, + NULL, 0, sysctl_kern_function_list, "", "kernel function list"); diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c new file mode 100644 index 0000000..5189bb7 --- /dev/null +++ b/sys/kern/kern_lock.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 1995 + * The Regents of the University of California. All rights reserved. + * + * Copyright (C) 1997 + * John S. Dyson. All rights reserved. + * + * This code contains ideas from software contributed to Berkeley by + * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating + * System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +/* + * Locking primitives implementation. + * Locks provide shared/exclusive synchronization. + */ + +#define LOCK_WAIT_TIME 100 +#define LOCK_SAMPLE_WAIT 7 + +#if defined(DIAGNOSTIC) +#define LOCK_INLINE +#else +#define LOCK_INLINE __inline +#endif + +#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \ + LK_SHARE_NONZERO | LK_WAIT_NONZERO) + +/* + * Mutex array variables.
Rather than each lockmgr lock having its own mutex, + * share a fixed (at boot time) number of mutexes across all lockmgr locks in + * order to keep sizeof(struct lock) down. + */ +int lock_mtx_valid; +static struct mtx lock_mtx; + +static int acquire(struct lock *lkp, int extflags, int wanted); +static int apause(struct lock *lkp, int flags); +static int acquiredrain(struct lock *lkp, int extflags) ; + +static void +lockmgr_init(void *dummy __unused) +{ + /* + * Initialize the lockmgr protection mutex if it hasn't already been + * done. Unless something changes about kernel startup order, VM + * initialization will always cause this mutex to already be + * initialized in a call to lockinit(). + */ + if (lock_mtx_valid == 0) { + mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF); + lock_mtx_valid = 1; + } +} +SYSINIT(lmgrinit, SI_SUB_LOCK, SI_ORDER_FIRST, lockmgr_init, NULL) + +static LOCK_INLINE void +sharelock(struct lock *lkp, int incr) { + lkp->lk_flags |= LK_SHARE_NONZERO; + lkp->lk_sharecount += incr; +} + +static LOCK_INLINE void +shareunlock(struct lock *lkp, int decr) { + + KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr")); + + if (lkp->lk_sharecount == decr) { + lkp->lk_flags &= ~LK_SHARE_NONZERO; + if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) { + wakeup(lkp); + } + lkp->lk_sharecount = 0; + } else { + lkp->lk_sharecount -= decr; + } +} + +/* + * This is the waitloop optimization. + */ +static int +apause(struct lock *lkp, int flags) +{ +#ifdef SMP + int i, lock_wait; +#endif + + if ((lkp->lk_flags & flags) == 0) + return 0; +#ifdef SMP + for (lock_wait = LOCK_WAIT_TIME; lock_wait > 0; lock_wait--) { + mtx_unlock(lkp->lk_interlock); + for (i = LOCK_SAMPLE_WAIT; i > 0; i--) + if ((lkp->lk_flags & flags) == 0) + break; + mtx_lock(lkp->lk_interlock); + if ((lkp->lk_flags & flags) == 0) + return 0; + } +#endif + return 1; +} + +static int +acquire(struct lock *lkp, int extflags, int wanted) { + int s, error; + + CTR3(KTR_LOCKMGR, + "acquire(): lkp == %p, extflags == 0x%x, wanted == 0x%x\n", + lkp, extflags, wanted); + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) { + return EBUSY; + } + + if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) { + error = apause(lkp, wanted); + if (error == 0) + return 0; + } + + s = splhigh(); + while ((lkp->lk_flags & wanted) != 0) { + lkp->lk_flags |= LK_WAIT_NONZERO; + lkp->lk_waitcount++; + error = msleep(lkp, lkp->lk_interlock, lkp->lk_prio, + lkp->lk_wmesg, + ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0)); + if (lkp->lk_waitcount == 1) { + lkp->lk_flags &= ~LK_WAIT_NONZERO; + lkp->lk_waitcount = 0; + } else { + lkp->lk_waitcount--; + } + if (error) { + splx(s); + return error; + } + if (extflags & LK_SLEEPFAIL) { + splx(s); + return ENOLCK; + } + } + splx(s); + return 0; +} + +/* + * Set, change, or release a lock. + * + * Shared requests increment the shared count. Exclusive requests set the + * LK_WANT_EXCL flag (preventing further shared locks), and wait for already + * accepted shared locks and shared-to-exclusive upgrades to go away. 
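+ * + * A minimal usage sketch (hypothetical lock and wait message, not taken from + * this file): + * struct lock lk; + * lockinit(&lk, PVFS, "examplk", 0, 0); + * lockmgr(&lk, LK_EXCLUSIVE, NULL, curthread); + * ... critical section ... + * lockmgr(&lk, LK_RELEASE, NULL, curthread);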
+ */ +int +#ifndef DEBUG_LOCKS +lockmgr(lkp, flags, interlkp, td) +#else +debuglockmgr(lkp, flags, interlkp, td, name, file, line) +#endif + struct lock *lkp; + u_int flags; + struct mtx *interlkp; + struct thread *td; +#ifdef DEBUG_LOCKS + const char *name; /* Name of lock function */ + const char *file; /* Name of file call is from */ + int line; /* Line number in file */ +#endif +{ + int error; + pid_t pid; + int extflags, lockflags; + + CTR5(KTR_LOCKMGR, + "lockmgr(): lkp == %p (lk_wmesg == \"%s\"), flags == 0x%x, " + "interlkp == %p, td == %p", lkp, lkp->lk_wmesg, flags, interlkp, td); + + error = 0; + if (td == NULL) + pid = LK_KERNPROC; + else + pid = td->td_proc->p_pid; + + mtx_lock(lkp->lk_interlock); + if (flags & LK_INTERLOCK) { + mtx_assert(interlkp, MA_OWNED | MA_NOTRECURSED); + mtx_unlock(interlkp); + } + + if (panicstr != NULL) { + mtx_unlock(lkp->lk_interlock); + return (0); + } + + extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK; + + switch (flags & LK_TYPE_MASK) { + + case LK_SHARED: + /* + * If we are not the exclusive lock holder, we have to block + * while there is an exclusive lock holder or while an + * exclusive lock request or upgrade request is in progress. + * + * However, if TDF_DEADLKTREAT is set, we override exclusive + * lock requests or upgrade requests ( but not the exclusive + * lock itself ). + */ + if (lkp->lk_lockholder != pid) { + lockflags = LK_HAVE_EXCL; + mtx_lock_spin(&sched_lock); + if (td != NULL && !(td->td_flags & TDF_DEADLKTREAT)) + lockflags |= LK_WANT_EXCL | LK_WANT_UPGRADE; + mtx_unlock_spin(&sched_lock); + error = acquire(lkp, extflags, lockflags); + if (error) + break; + sharelock(lkp, 1); +#if defined(DEBUG_LOCKS) + lkp->lk_slockholder = pid; + lkp->lk_sfilename = file; + lkp->lk_slineno = line; + lkp->lk_slockername = name; +#endif + break; + } + /* + * We hold an exclusive lock, so downgrade it to shared. + * An alternative would be to fail with EDEADLK. + */ + sharelock(lkp, 1); + /* fall into downgrade */ + + case LK_DOWNGRADE: + KASSERT(lkp->lk_lockholder == pid && lkp->lk_exclusivecount != 0, + ("lockmgr: not holding exclusive lock " + "(owner pid (%d) != pid (%d), exlcnt (%d) != 0", + lkp->lk_lockholder, pid, lkp->lk_exclusivecount)); + sharelock(lkp, lkp->lk_exclusivecount); + lkp->lk_exclusivecount = 0; + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + if (lkp->lk_waitcount) + wakeup((void *)lkp); + break; + + case LK_EXCLUPGRADE: + /* + * If another process is ahead of us to get an upgrade, + * then we want to fail rather than have an intervening + * exclusive access. + */ + if (lkp->lk_flags & LK_WANT_UPGRADE) { + shareunlock(lkp, 1); + error = EBUSY; + break; + } + /* fall into normal upgrade */ + + case LK_UPGRADE: + /* + * Upgrade a shared lock to an exclusive one. If another + * shared lock has already requested an upgrade to an + * exclusive lock, our shared lock is released and an + * exclusive lock is requested (which will be granted + * after the upgrade). If we return an error, the file + * will always be unlocked. + */ + if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0)) + panic("lockmgr: upgrade exclusive lock"); + shareunlock(lkp, 1); + /* + * If we are just polling, check to see if we will block. 
+ */ + if ((extflags & LK_NOWAIT) && + ((lkp->lk_flags & LK_WANT_UPGRADE) || + lkp->lk_sharecount > 1)) { + error = EBUSY; + break; + } + if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) { + /* + * We are first shared lock to request an upgrade, so + * request upgrade and wait for the shared count to + * drop to zero, then take exclusive lock. + */ + lkp->lk_flags |= LK_WANT_UPGRADE; + error = acquire(lkp, extflags, LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_UPGRADE; + + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + } + /* + * Someone else has requested upgrade. Release our shared + * lock, awaken upgrade requestor if we are the last shared + * lock, then request an exclusive lock. + */ + if ( (lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) == + LK_WAIT_NONZERO) + wakeup((void *)lkp); + /* fall into exclusive request */ + + case LK_EXCLUSIVE: + if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) { + /* + * Recursive lock. + */ + if ((extflags & (LK_NOWAIT | LK_CANRECURSE)) == 0) + panic("lockmgr: locking against myself"); + if ((extflags & LK_CANRECURSE) != 0) { + lkp->lk_exclusivecount++; + break; + } + } + /* + * If we are just polling, check to see if we will sleep. + */ + if ((extflags & LK_NOWAIT) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) { + error = EBUSY; + break; + } + /* + * Try to acquire the want_exclusive flag. + */ + error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL)); + if (error) + break; + lkp->lk_flags |= LK_WANT_EXCL; + /* + * Wait for shared locks and upgrades to finish. + */ + error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_EXCL; + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + + case LK_RELEASE: + if (lkp->lk_exclusivecount != 0) { + if (lkp->lk_lockholder != pid && + lkp->lk_lockholder != LK_KERNPROC) { + panic("lockmgr: pid %d, not %s %d unlocking", + pid, "exclusive lock holder", + lkp->lk_lockholder); + } + if (lkp->lk_exclusivecount == 1) { + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + lkp->lk_exclusivecount = 0; + } else { + lkp->lk_exclusivecount--; + } + } else if (lkp->lk_flags & LK_SHARE_NONZERO) + shareunlock(lkp, 1); + if (lkp->lk_flags & LK_WAIT_NONZERO) + wakeup((void *)lkp); + break; + + case LK_DRAIN: + /* + * Check that we do not already hold the lock, as it can + * never drain if we do. Unfortunately, we have no way to + * check for holding a shared lock, but at least we can + * check for an exclusive one. 
+ */ + if (lkp->lk_lockholder == pid) + panic("lockmgr: draining against myself"); + + error = acquiredrain(lkp, extflags); + if (error) + break; + lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + + default: + mtx_unlock(lkp->lk_interlock); + panic("lockmgr: unknown locktype request %d", + flags & LK_TYPE_MASK); + /* NOTREACHED */ + } + if ((lkp->lk_flags & LK_WAITDRAIN) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | + LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) { + lkp->lk_flags &= ~LK_WAITDRAIN; + wakeup((void *)&lkp->lk_flags); + } + mtx_unlock(lkp->lk_interlock); + return (error); +} + +static int +acquiredrain(struct lock *lkp, int extflags) { + int error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) { + return EBUSY; + } + + error = apause(lkp, LK_ALL); + if (error == 0) + return 0; + + while (lkp->lk_flags & LK_ALL) { + lkp->lk_flags |= LK_WAITDRAIN; + error = msleep(&lkp->lk_flags, lkp->lk_interlock, lkp->lk_prio, + lkp->lk_wmesg, + ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0)); + if (error) + return error; + if (extflags & LK_SLEEPFAIL) { + return ENOLCK; + } + } + return 0; +} + +/* + * Initialize a lock; required before use. + */ +void +lockinit(lkp, prio, wmesg, timo, flags) + struct lock *lkp; + int prio; + const char *wmesg; + int timo; + int flags; +{ + CTR5(KTR_LOCKMGR, "lockinit(): lkp == %p, prio == %d, wmesg == \"%s\", " + "timo == %d, flags = 0x%x\n", lkp, prio, wmesg, timo, flags); + + if (lock_mtx_valid == 0) { + mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF); + lock_mtx_valid = 1; + } + /* + * XXX cleanup - make sure mtxpool is always initialized before + * this is ever called. + */ + if (mtx_pool_valid) { + mtx_lock(&lock_mtx); + lkp->lk_interlock = mtx_pool_alloc(); + mtx_unlock(&lock_mtx); + } else { + lkp->lk_interlock = &lock_mtx; + } + lkp->lk_flags = (flags & LK_EXTFLG_MASK); + lkp->lk_sharecount = 0; + lkp->lk_waitcount = 0; + lkp->lk_exclusivecount = 0; + lkp->lk_prio = prio; + lkp->lk_wmesg = wmesg; + lkp->lk_timo = timo; + lkp->lk_lockholder = LK_NOPROC; +} + +/* + * Destroy a lock. + */ +void +lockdestroy(lkp) + struct lock *lkp; +{ + CTR2(KTR_LOCKMGR, "lockdestroy(): lkp == %p (lk_wmesg == \"%s\")", + lkp, lkp->lk_wmesg); +} + +/* + * Determine the status of a lock. + */ +int +lockstatus(lkp, td) + struct lock *lkp; + struct thread *td; +{ + int lock_type = 0; + + mtx_lock(lkp->lk_interlock); + if (lkp->lk_exclusivecount != 0) { + if (td == NULL || lkp->lk_lockholder == td->td_proc->p_pid) + lock_type = LK_EXCLUSIVE; + else + lock_type = LK_EXCLOTHER; + } else if (lkp->lk_sharecount != 0) + lock_type = LK_SHARED; + mtx_unlock(lkp->lk_interlock); + return (lock_type); +} + +/* + * Determine the number of holders of a lock. + */ +int +lockcount(lkp) + struct lock *lkp; +{ + int count; + + mtx_lock(lkp->lk_interlock); + count = lkp->lk_exclusivecount + lkp->lk_sharecount; + mtx_unlock(lkp->lk_interlock); + return (count); +} + +/* + * Print out information about state of a lock. Used by VOP_PRINT + * routines to display status about contained locks. 
+ */ +void +lockmgr_printinfo(lkp) + struct lock *lkp; +{ + + if (lkp->lk_sharecount) + printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg, + lkp->lk_sharecount); + else if (lkp->lk_flags & LK_HAVE_EXCL) + printf(" lock type %s: EXCL (count %d) by pid %d", + lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder); + if (lkp->lk_waitcount > 0) + printf(" with %d pending", lkp->lk_waitcount); +} diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..c1cadb1 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,846 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $FreeBSD$ + */ + +#include "opt_debug_lockf.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> +#include <sys/lockf.h> + +#include <machine/limits.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. 
+ */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + + +static int lockf_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock(struct lockf *); +static int lf_findoverlap(struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **); +static struct lockf * + lf_getblock(struct lockf *); +static int lf_getlock(struct lockf *, struct flock *); +static int lf_setlock(struct lockf *); +static void lf_split(struct lockf *, struct lockf *); +static void lf_wakelock(struct lockf *); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end, oadd; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + if (size > OFF_MAX || + (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) + return (EOVERFLOW); + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len < 0) { + if (start == 0) + return (EINVAL); + end = start - 1; + start += fl->l_len; + if (start < 0) + return (EINVAL); + } else if (fl->l_len == 0) + end = -1; + else { + oadd = fl->l_len - 1; + if (oadd > OFF_MAX - start) + return (EOVERFLOW); + end = start + oadd; + } + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; + /* + * XXX The problem is that VTOI is ufs specific, so it will + * break LOCKF_DEBUG for all other FS's other than UFS because + * it casts the vnode->data ptr to struct inode *. + */ +/* lock->lf_inode = VTOI(ap->a_vp); */ + lock->lf_inode = (struct inode *)0; + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. 
+ */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + struct thread *td; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + /* XXXKSE this is not complete under threads */ + wproc = (struct proc *)block->lf_id; + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(wproc, td) { + while (td->td_wchan && + (td->td_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)td->td_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + mtx_unlock_spin(&sched_lock); + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + mtx_unlock_spin(&sched_lock); + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + error = tsleep(lock, priority, lockstr, 0); + /* + * We may have been awakened by a signal and/or by a + * debugger continuing us (in which cases we must remove + * ourselves from the blocked list) and/or by another + * process releasing a lock (in which case we have + * already been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) { + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + lock->lf_next = NOLOCKF; + } + if (error) { + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. 
+ */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { + ltmp = TAILQ_FIRST(&overlap->lf_blkhd); + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + ltmp->lf_next = lock; + } + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
+ */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if splitting into only two pieces.
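+ * (For example, with illustrative byte ranges: carving lock2 covering bytes + * 40-59 out of lock1 covering bytes 0-99 shares neither endpoint, so three + * pieces result: 0-39, 40-59 and 60-99; if the two locks shared a start or + * end offset, only two pieces would be needed.)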
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy(lock1, splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { + wakelock = TAILQ_FIRST(&listhead->lf_blkhd); + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup(wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock %p for ", tag, (void *)lock); + if (lock->lf_flags & F_POSIX) + printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); + else + printf("id %p", (void *)lock->lf_id); + if (lock->lf_inode != (struct inode *)0) + /* XXX no %qd in kernel. Truncate. */ + printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld", + (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + else + printf(" %s, start %ld, end %ld", + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + if (!TAILQ_EMPTY(&lock->lf_blkhd)) + printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd)); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + if (lock->lf_inode == (struct inode *)0) + return; + + printf("%s: Lock list for ino %lu on dev <%d, %d>:\n", + tag, (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock %p for ",(void *)lf); + if (lf->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)lf->lf_id)->p_pid); + else + printf("id %p", (void *)lf->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lf->lf_start, (long)lf->lf_end); + TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { + printf("\n\t\tlock request %p for ", (void *)blk); + if (blk->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)blk->lf_id)->p_pid); + else + printf("id %p", (void *)blk->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? 
"unlock" : + "unknown", (long)blk->lf_start, + (long)blk->lf_end); + if (!TAILQ_EMPTY(&blk->lf_blkhd)) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 0000000..c7bec3e --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,618 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_vm.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/vmmeter.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <vm/uma_int.h> +#include <vm/uma_dbg.h> + +#if defined(INVARIANTS) && defined(__i386__) +#include <machine/cpu.h> +#endif + +/* + * When realloc() is called, if the new size is sufficiently smaller than + * the old size, realloc() will allocate a new, smaller block to avoid + * wasting memory. 'Sufficiently smaller' is defined as: newsize <= + * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. 
+ */ +#ifndef REALLOC_FRACTION +#define REALLOC_FRACTION 1 /* new block if <= half the size */ +#endif + +MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches"); +MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); +MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers"); + +MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); +MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); + +static void kmeminit(void *); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static MALLOC_DEFINE(M_FREE, "free", "should be on free list"); + +static struct malloc_type *kmemstatistics; +static char *kmembase; +static char *kmemlimit; + +#define KMEM_ZSHIFT 4 +#define KMEM_ZBASE 16 +#define KMEM_ZMASK (KMEM_ZBASE - 1) + +#define KMEM_ZMAX 65536 +#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT) +static u_int8_t kmemsize[KMEM_ZSIZE + 1]; + +/* These won't be powers of two for long */ +struct { + int kz_size; + char *kz_name; + uma_zone_t kz_zone; +} kmemzones[] = { + {16, "16", NULL}, + {32, "32", NULL}, + {64, "64", NULL}, + {128, "128", NULL}, + {256, "256", NULL}, + {512, "512", NULL}, + {1024, "1024", NULL}, + {2048, "2048", NULL}, + {4096, "4096", NULL}, + {8192, "8192", NULL}, + {16384, "16384", NULL}, + {32768, "32768", NULL}, + {65536, "65536", NULL}, + {0, NULL}, +}; + +u_int vm_kmem_size; + +/* + * The malloc_mtx protects the kmemstatistics linked list as well as the + * mallochash. + */ + +struct mtx malloc_mtx; + +#ifdef MALLOC_PROFILE +uint64_t krequests[KMEM_ZSIZE + 1]; + +static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS); +#endif + +static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS); + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + */ +void * +malloc(size, type, flags) + unsigned long size; + struct malloc_type *type; + int flags; +{ + int indx; + caddr_t va; + uma_zone_t zone; + register struct malloc_type *ksp = type; + +#if 0 + if (size == 0) + Debugger("zero size malloc"); +#endif + if (!(flags & M_NOWAIT)) + KASSERT(curthread->td_intr_nesting_level == 0, + ("malloc(M_WAITOK) in interrupt context")); + if (size <= KMEM_ZMAX) { + if (size & KMEM_ZMASK) + size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; + indx = kmemsize[size >> KMEM_ZSHIFT]; + zone = kmemzones[indx].kz_zone; +#ifdef MALLOC_PROFILE + krequests[size >> KMEM_ZSHIFT]++; +#endif + va = uma_zalloc(zone, flags); + mtx_lock(&ksp->ks_mtx); + if (va == NULL) + goto out; + + ksp->ks_size |= 1 << indx; + size = zone->uz_size; + } else { + size = roundup(size, PAGE_SIZE); + zone = NULL; + va = uma_large_malloc(size, flags); + mtx_lock(&ksp->ks_mtx); + if (va == NULL) + goto out; + } + ksp->ks_memuse += size; + ksp->ks_inuse++; +out: + ksp->ks_calls++; + if (ksp->ks_memuse > ksp->ks_maxused) + ksp->ks_maxused = ksp->ks_memuse; + + mtx_unlock(&ksp->ks_mtx); + return ((void *) va); +} + +/* + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. + */ +void +free(addr, type) + void *addr; + struct malloc_type *type; +{ + uma_slab_t slab; + void *mem; + u_long size; + register struct malloc_type *ksp = type; + + /* free(NULL, ...) 
does nothing */ + if (addr == NULL) + return; + + size = 0; + + mem = (void *)((u_long)addr & (~UMA_SLAB_MASK)); + mtx_lock(&malloc_mtx); + slab = hash_sfind(mallochash, mem); + mtx_unlock(&malloc_mtx); + + if (slab == NULL) + panic("free: address %p(%p) has not been allocated.\n", + addr, mem); + + if (!(slab->us_flags & UMA_SLAB_MALLOC)) { +#ifdef INVARIANTS + struct malloc_type **mtp = addr; +#endif + size = slab->us_zone->uz_size; +#ifdef INVARIANTS + /* + * Cache a pointer to the malloc_type that most recently freed + * this memory here. This way we know who is most likely to + * have stepped on it later. + * + * This code assumes that size is a multiple of 8 bytes for + * 64 bit machines + */ + mtp = (struct malloc_type **) + ((unsigned long)mtp & ~UMA_ALIGN_PTR); + mtp += (size - sizeof(struct malloc_type *)) / + sizeof(struct malloc_type *); + *mtp = type; +#endif + uma_zfree_arg(slab->us_zone, addr, slab); + } else { + size = slab->us_size; + uma_large_free(slab); + } + mtx_lock(&ksp->ks_mtx); + ksp->ks_memuse -= size; + ksp->ks_inuse--; + mtx_unlock(&ksp->ks_mtx); +} + +/* + * realloc: change the size of a memory block + */ +void * +realloc(addr, size, type, flags) + void *addr; + unsigned long size; + struct malloc_type *type; + int flags; +{ + uma_slab_t slab; + unsigned long alloc; + void *newaddr; + + /* realloc(NULL, ...) is equivalent to malloc(...) */ + if (addr == NULL) + return (malloc(size, type, flags)); + + mtx_lock(&malloc_mtx); + slab = hash_sfind(mallochash, + (void *)((u_long)addr & ~(UMA_SLAB_MASK))); + mtx_unlock(&malloc_mtx); + + /* Sanity check */ + KASSERT(slab != NULL, + ("realloc: address %p out of range", (void *)addr)); + + /* Get the size of the original block */ + if (slab->us_zone) + alloc = slab->us_zone->uz_size; + else + alloc = slab->us_size; + + /* Reuse the original block if appropriate */ + if (size <= alloc + && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE)) + return (addr); + + /* Allocate a new, bigger (or smaller) block */ + if ((newaddr = malloc(size, type, flags)) == NULL) + return (NULL); + + /* Copy over original contents */ + bcopy(addr, newaddr, min(size, alloc)); + free(addr, type); + return (newaddr); +} + +/* + * reallocf: same as realloc() but free memory on failure. + */ +void * +reallocf(addr, size, type, flags) + void *addr; + unsigned long size; + struct malloc_type *type; + int flags; +{ + void *mem; + + if ((mem = realloc(addr, size, type, flags)) == NULL) + free(addr, type); + return (mem); +} + +/* + * Initialize the kernel memory allocator + */ +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; +{ + u_int8_t indx; + u_long npg; + u_long mem_size; + void *hashmem; + u_long hashsize; + int highbit; + int bits; + int i; + + mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF); + + /* + * Try to auto-tune the kernel memory size, so that it is + * more applicable for a wider range of machine sizes. + * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while + * a VM_KMEM_SIZE of 12MB is a fair compromise. The + * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space + * available, and on an X86 with a total KVA space of 256MB, + * try to keep VM_KMEM_SIZE_MAX at 80MB or below. + * + * Note that the kmem_map is also used by the zone allocator, + * so make sure that there is enough space. 
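+ * + * As an illustrative note (an assumption, not stated in this file): since + * the final override below comes from TUNABLE_INT_FETCH("kern.vm.kmem.size", + * ...), an administrator could pin the size by setting a loader tunable such + * as kern.vm.kmem.size=67108864.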
+ */ + vm_kmem_size = VM_KMEM_SIZE; + mem_size = cnt.v_page_count * PAGE_SIZE; + +#if defined(VM_KMEM_SIZE_SCALE) + if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size) + vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE; +#endif + +#if defined(VM_KMEM_SIZE_MAX) + if (vm_kmem_size >= VM_KMEM_SIZE_MAX) + vm_kmem_size = VM_KMEM_SIZE_MAX; +#endif + + /* Allow final override from the kernel environment */ + TUNABLE_INT_FETCH("kern.vm.kmem.size", &vm_kmem_size); + + /* + * Limit kmem virtual size to twice the physical memory. + * This allows for kmem map sparseness, but limits the size + * to something sane. Be careful to not overflow the 32bit + * ints while doing the check. + */ + if ((vm_kmem_size / 2) > (cnt.v_page_count * PAGE_SIZE)) + vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE; + + /* + * In mbuf_init(), we set up submaps for mbufs and clusters, in which + * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES), + * respectively. Mathematically, this means that what we do here may + * amount to slightly more address space than we need for the submaps, + * but it never hurts to have an extra page in kmem_map. + */ + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt * + sizeof(u_int) + vm_kmem_size) / PAGE_SIZE; + + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + kmem_map->system_map = 1; + + hashsize = npg * sizeof(void *); + + highbit = 0; + bits = 0; + /* The hash size must be a power of two */ + for (i = 0; i < 8 * sizeof(hashsize); i++) + if (hashsize & (1 << i)) { + highbit = i; + bits++; + } + if (bits > 1) + hashsize = 1 << (highbit); + + hashmem = (void *)kmem_alloc(kernel_map, (vm_size_t)hashsize); + uma_startup2(hashmem, hashsize / sizeof(void *)); + + for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { + int size = kmemzones[indx].kz_size; + char *name = kmemzones[indx].kz_name; + + kmemzones[indx].kz_zone = uma_zcreate(name, size, +#ifdef INVARIANTS + mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_PTR, UMA_ZONE_MALLOC); + + for (;i <= size; i+= KMEM_ZBASE) + kmemsize[i >> KMEM_ZSHIFT] = indx; + + } +} + +void +malloc_init(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + + mtx_lock(&malloc_mtx); + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_init not allowed before vm init"); + + if (type->ks_next != NULL) + return; + + type->ks_next = kmemstatistics; + kmemstatistics = type; + mtx_init(&type->ks_mtx, type->ks_shortdesc, "Malloc Stats", MTX_DEF); + mtx_unlock(&malloc_mtx); +} + +void +malloc_uninit(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + struct malloc_type *t; + + mtx_lock(&malloc_mtx); + mtx_lock(&type->ks_mtx); + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_uninit not allowed before vm init"); + + if (type == kmemstatistics) + kmemstatistics = type->ks_next; + else { + for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) { + if (t->ks_next == type) { + t->ks_next = type->ks_next; + break; + } + } + } + type->ks_next = NULL; + mtx_destroy(&type->ks_mtx); + mtx_unlock(&malloc_mtx); +} + +static int +sysctl_kern_malloc(SYSCTL_HANDLER_ARGS) +{ + struct malloc_type *type; + int linesize = 128; + int curline; + int bufsize; + int first; + int error; + char *buf; + char *p; + int cnt; + int len; + int i; + + 
cnt = 0; + + mtx_lock(&malloc_mtx); + for (type = kmemstatistics; type != NULL; type = type->ks_next) + cnt++; + + mtx_unlock(&malloc_mtx); + bufsize = linesize * (cnt + 1); + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + mtx_lock(&malloc_mtx); + + len = snprintf(p, linesize, + "\n Type InUse MemUse HighUse Requests Size(s)\n"); + p += len; + + for (type = kmemstatistics; cnt != 0 && type != NULL; + type = type->ks_next, cnt--) { + if (type->ks_calls == 0) + continue; + + curline = linesize - 2; /* Leave room for the \n */ + len = snprintf(p, curline, "%13s%6lu%6luK%7luK%9llu", + type->ks_shortdesc, + type->ks_inuse, + (type->ks_memuse + 1023) / 1024, + (type->ks_maxused + 1023) / 1024, + (long long unsigned)type->ks_calls); + curline -= len; + p += len; + + first = 1; + for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1; + i++) { + if (type->ks_size & (1 << i)) { + if (first) + len = snprintf(p, curline, " "); + else + len = snprintf(p, curline, ","); + curline -= len; + p += len; + + len = snprintf(p, curline, + "%s", kmemzones[i].kz_name); + curline -= len; + p += len; + + first = 0; + } + } + + len = snprintf(p, 2, "\n"); + p += len; + } + + mtx_unlock(&malloc_mtx); + error = SYSCTL_OUT(req, buf, p - buf); + + free(buf, M_TEMP); + return (error); +} + +SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats"); + +#ifdef MALLOC_PROFILE + +static int +sysctl_kern_mprof(SYSCTL_HANDLER_ARGS) +{ + int linesize = 64; + uint64_t count; + uint64_t waste; + uint64_t mem; + int bufsize; + int error; + char *buf; + int rsize; + int size; + char *p; + int len; + int i; + + bufsize = linesize * (KMEM_ZSIZE + 1); + bufsize += 128; /* For the stats line */ + bufsize += 128; /* For the banner line */ + waste = 0; + mem = 0; + + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + len = snprintf(p, bufsize, + "\n Size Requests Real Size\n"); + bufsize -= len; + p += len; + + for (i = 0; i < KMEM_ZSIZE; i++) { + size = i << KMEM_ZSHIFT; + rsize = kmemzones[kmemsize[i]].kz_size; + count = (long long unsigned)krequests[i]; + + len = snprintf(p, bufsize, "%6d%28llu%11d\n", + size, (unsigned long long)count, rsize); + bufsize -= len; + p += len; + + if ((rsize * count) > (size * count)) + waste += (rsize * count) - (size * count); + mem += (rsize * count); + } + + len = snprintf(p, bufsize, + "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n", + (unsigned long long)mem, (unsigned long long)waste); + p += len; + + error = SYSCTL_OUT(req, buf, p - buf); + + free(buf, M_TEMP); + return (error); +} + +SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling"); +#endif /* MALLOC_PROFILE */ diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..ebcba94 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,336 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $FreeBSD$ + */ + +#include "opt_posix.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/jail.h> +#include <sys/smp.h> + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, + "Sizeof various things"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); +SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, + "p1003_1b, (see p1003_1b.h)"); + +SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0, + "Compatibility code"); +SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0, + "Security"); +#ifdef REGRESSION +SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0, + "Regression test MIB"); +#endif + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, + osrelease, 0, "Operating system release"); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, + 0, BSD, "Operating system revision"); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, + version, 0, "Kernel version"); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, + ostype, 0, "Operating system type"); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, + &osreldate, 0, "Operating system release date"); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, + &maxproc, 0, "Maximum number of processes"); + +SYSCTL_INT(_kern, 
KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, + &maxprocperuid, 0, "Maximum processes allowed per userid"); + +SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RD, + &maxusers, 0, "Hint for kernel tuning"); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, + 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, + 0, _KPOSIX_VERSION, "Version of POSIX attempting to comply to"); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, + 0, NGROUPS_MAX, "Maximum number of groups a user can belong to"); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, + 0, 1, "Whether job control is available"); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, + 0, 1, "Whether saved set-group/user ID is available"); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, + 0, 0, "Whether saved set-group/user ID is available"); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + +SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, + kernelname, sizeof kernelname, "Name of kernel file booted"); + +#ifdef SMP +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, + &mp_ncpus, 0, "Number of active CPUs"); +#else +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, + 0, 1, "Number of active CPUs"); +#endif + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, + 0, BYTE_ORDER, "System byte order"); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, + 0, PAGE_SIZE, "System memory page size"); + +static char machine_arch[] = MACHINE_ARCH; +SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, + machine_arch, 0, "System architecture"); + +char hostname[MAXHOSTNAMELEN]; + +static int +sysctl_hostname(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + char tmphostname[MAXHOSTNAMELEN]; + int error; + + pr = req->td->td_ucred->cr_prison; + if (pr != NULL) { + if (!jail_set_hostname_allowed && req->newptr) + return (EPERM); + /* + * Process is in jail, so make a local copy of jail + * hostname to get/set so we don't have to hold the jail + * mutex during the sysctl copyin/copyout activities. + */ + mtx_lock(&pr->pr_mtx); + bcopy(pr->pr_host, tmphostname, MAXHOSTNAMELEN); + mtx_unlock(&pr->pr_mtx); + + error = sysctl_handle_string(oidp, tmphostname, + sizeof pr->pr_host, req); + + if (req->newptr != NULL && error == 0) { + /* + * Copy the locally set hostname to the jail, if + * appropriate. + */ + mtx_lock(&pr->pr_mtx); + bcopy(tmphostname, pr->pr_host, MAXHOSTNAMELEN); + mtx_unlock(&pr->pr_mtx); + } + } else + error = sysctl_handle_string(oidp, + hostname, sizeof hostname, req); + return (error); +} + +SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, + CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, + 0, 0, sysctl_hostname, "A", "Hostname"); + +static int regression_securelevel_nonmonotonic = 0; + +#ifdef REGRESSION +SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, + ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); +#endif + +int securelevel = -1; +struct mtx securelevel_mtx; + +MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock", + MTX_DEF); + +static int +sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + int error, level; + + pr = req->td->td_ucred->cr_prison; + + /* + * If the process is in jail, return the maximum of the global and + * local levels; otherwise, return the global level. 
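The hostname handler above deliberately copies the jail's hostname into a local buffer under pr_mtx, does the slow copyin/copyout work unlocked, and only then publishes the result back under the lock. A small pthread sketch of that snapshot-and-publish pattern; names and the API here are hypothetical, not the sysctl interface:

/* Illustrative sketch of the copy-under-lock pattern; not FreeBSD code. */
#include <pthread.h>
#include <string.h>
#include <stdio.h>

#define HOSTLEN 256

static pthread_mutex_t host_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_host[HOSTLEN] = "old.example.org";

/* "Slow" operation that must not run with the lock held. */
static int user_exchange(char *buf, size_t len, const char *newname) {
    printf("current: %s\n", buf);
    if (newname != NULL)
        snprintf(buf, len, "%s", newname);
    return (newname != NULL);
}

static void get_set_hostname(const char *newname) {
    char local[HOSTLEN];
    int changed;

    pthread_mutex_lock(&host_lock);
    memcpy(local, shared_host, sizeof(local));          /* snapshot */
    pthread_mutex_unlock(&host_lock);

    changed = user_exchange(local, sizeof(local), newname);

    if (changed) {
        pthread_mutex_lock(&host_lock);
        memcpy(shared_host, local, sizeof(shared_host)); /* publish */
        pthread_mutex_unlock(&host_lock);
    }
}

int main(void) {
    get_set_hostname("new.example.org");
    get_set_hostname(NULL);
    return 0;
}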
+ */ + if (pr != NULL) { + mtx_lock(&pr->pr_mtx); + level = imax(securelevel, pr->pr_securelevel); + mtx_unlock(&pr->pr_mtx); + } else + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + /* + * Permit update only if the new securelevel exceeds the + * global level, and local level if any. + */ + if (pr != NULL) { + mtx_lock(&pr->pr_mtx); + if (!regression_securelevel_nonmonotonic && + (level < imax(securelevel, pr->pr_securelevel))) { + mtx_unlock(&pr->pr_mtx); + return (EPERM); + } + pr->pr_securelevel = level; + mtx_unlock(&pr->pr_mtx); + } else { + mtx_lock(&securelevel_mtx); + if (!regression_securelevel_nonmonotonic && + (level < securelevel)) { + mtx_unlock(&securelevel_mtx); + return (EPERM); + } + securelevel = level; + mtx_unlock(&securelevel_mtx); + } + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl, + "I", "Current secure level"); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); + +u_long hostid; +SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. + */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, + "", 0, "PATH that finds all the standard utilities"); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, + 0, 0, "Max ibase/obase values in bc(1)"); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, + 0, 0, "Max array size in bc(1)"); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, + 0, 0, "Max scale value in bc(1)"); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, + 0, 0, "Max string length in bc(1)"); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, + 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, + 0, 0, "Max length (bytes) of a text-processing utility's input line"); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, + 0, 0, "Maximum number of repeats of a regexp permitted"); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, + 0, 0, + "The version of POSIX 1003.2 with which the system attempts to comply"); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, + 0, 0, "Whether C development supports the C bindings option"); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, + 0, 0, "Whether system supports the C development utilities option"); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, + 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, + 0, 0, "Whether system supports FORTRAN development utilities"); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, + 0, 0, "Whether system supports FORTRAN runtime utilities"); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, + 0, 0, "Whether system supports creation of locales"); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, + 0, 0, "Whether system supports software development utilities"); +SYSCTL_INT(_user, 
USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, + 0, 0, "Whether system supports the user portability utilities"); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, + 0, 0, "Min Maximum number of streams a process may have open at one time"); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, + 0, 0, "Min Maximum number of types supported for timezone names"); + +#include <sys/vnode.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, + 0, sizeof(struct vnode), "sizeof(struct vnode)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, + 0, sizeof(struct proc), "sizeof(struct proc)"); + +#include <sys/conf.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, + 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); + +#include <sys/bio.h> +#include <sys/buf.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, + 0, sizeof(struct bio), "sizeof(struct bio)"); +SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, + 0, sizeof(struct buf), "sizeof(struct buf)"); + +#include <sys/user.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, + 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c new file mode 100644 index 0000000..74a0259 --- /dev/null +++ b/sys/kern/kern_module.c @@ -0,0 +1,394 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/eventhandler.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> +#include <sys/linker.h> + +static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); + +typedef TAILQ_HEAD(, module) modulelist_t; +struct module { + TAILQ_ENTRY(module) link; /* chain together all modules */ + TAILQ_ENTRY(module) flink; /* all modules in a file */ + struct linker_file *file; /* file which contains this module */ + int refs; /* reference count */ + int id; /* unique id number */ + char *name; /* module name */ + modeventhand_t handler; /* event handler */ + void *arg; /* argument for handler */ + modspecific_t data; /* module specific data */ +}; + +#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) + +static modulelist_t modules; +struct sx modules_sx; +static int nextid = 1; +static void module_shutdown(void *, int); + +static int +modevent_nop(module_t mod, int what, void *arg) +{ + return (0); +} + +static void +module_init(void *arg) +{ + + sx_init(&modules_sx, "module subsystem sx lock"); + TAILQ_INIT(&modules); + EVENTHANDLER_REGISTER(shutdown_post_sync, module_shutdown, NULL, + SHUTDOWN_PRI_DEFAULT); +} + +SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0) + +static void +module_shutdown(void *arg1, int arg2) +{ + module_t mod; + + MOD_SLOCK; + TAILQ_FOREACH(mod, &modules, link) + MOD_EVENT(mod, MOD_SHUTDOWN); + MOD_SUNLOCK; +} + +void +module_register_init(const void *arg) +{ + const moduledata_t *data = (const moduledata_t *)arg; + int error; + module_t mod; + + MOD_SLOCK; + mod = module_lookupbyname(data->name); + if (mod == NULL) + panic("module_register_init: module named %s not found\n", + data->name); + MOD_SUNLOCK; + error = MOD_EVENT(mod, MOD_LOAD); + if (error) { + MOD_EVENT(mod, MOD_UNLOAD); + MOD_XLOCK; + module_release(mod); + MOD_XUNLOCK; + printf("module_register_init: MOD_LOAD (%s, %p, %p) error" + " %d\n", data->name, (void *)data->evhand, data->priv, + error); + } +} + +int +module_register(const moduledata_t *data, linker_file_t container) +{ + size_t namelen; + module_t newmod; + + MOD_SLOCK; + newmod = module_lookupbyname(data->name); + if (newmod != NULL) { + MOD_SUNLOCK; + printf("module_register: module %s already exists!\n", + data->name); + return (EEXIST); + } + MOD_SUNLOCK; + namelen = strlen(data->name) + 1; + newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); + if (newmod == NULL) + return (ENOMEM); + MOD_XLOCK; + newmod->refs = 1; + newmod->id = nextid++; + newmod->name = (char *)(newmod + 1); + strcpy(newmod->name, data->name); + newmod->handler = data->evhand ? 
data->evhand : modevent_nop; + newmod->arg = data->priv; + bzero(&newmod->data, sizeof(newmod->data)); + TAILQ_INSERT_TAIL(&modules, newmod, link); + + if (container) + TAILQ_INSERT_TAIL(&container->modules, newmod, flink); + newmod->file = container; + MOD_XUNLOCK; + return (0); +} + +void +module_reference(module_t mod) +{ + + MOD_XLOCK_ASSERT; + + MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); + mod->refs++; +} + +void +module_release(module_t mod) +{ + + MOD_XLOCK_ASSERT; + + if (mod->refs <= 0) + panic("module_release: bad reference count"); + + MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); + + mod->refs--; + if (mod->refs == 0) { + TAILQ_REMOVE(&modules, mod, link); + if (mod->file) + TAILQ_REMOVE(&mod->file->modules, mod, flink); + MOD_XUNLOCK; + free(mod, M_MODULE); + MOD_XLOCK; + } +} + +module_t +module_lookupbyname(const char *name) +{ + module_t mod; + int err; + + MOD_LOCK_ASSERT; + + TAILQ_FOREACH(mod, &modules, link) { + err = strcmp(mod->name, name); + if (err == 0) + return (mod); + } + return (NULL); +} + +module_t +module_lookupbyid(int modid) +{ + module_t mod; + + MOD_LOCK_ASSERT; + + TAILQ_FOREACH(mod, &modules, link) + if (mod->id == modid) + return(mod); + return (NULL); +} + +int +module_unload(module_t mod) +{ + + return (MOD_EVENT(mod, MOD_UNLOAD)); +} + +int +module_getid(module_t mod) +{ + + MOD_LOCK_ASSERT; + return (mod->id); +} + +module_t +module_getfnext(module_t mod) +{ + + MOD_LOCK_ASSERT; + return (TAILQ_NEXT(mod, flink)); +} + +void +module_setspecific(module_t mod, modspecific_t *datap) +{ + + MOD_XLOCK_ASSERT; + mod->data = *datap; +} + +/* + * Syscalls. + */ +/* + * MPSAFE + */ +int +modnext(struct thread *td, struct modnext_args *uap) +{ + module_t mod; + int error = 0; + + td->td_retval[0] = -1; + + MOD_SLOCK; + if (SCARG(uap, modid) == 0) { + mod = TAILQ_FIRST(&modules); + if (mod) + td->td_retval[0] = mod->id; + else + error = ENOENT; + goto done2; + } + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + error = ENOENT; + goto done2; + } + if (TAILQ_NEXT(mod, link)) + td->td_retval[0] = TAILQ_NEXT(mod, link)->id; + else + td->td_retval[0] = 0; +done2: + MOD_SUNLOCK; + return (error); +} + +/* + * MPSAFE + */ +int +modfnext(struct thread *td, struct modfnext_args *uap) +{ + module_t mod; + int error; + + td->td_retval[0] = -1; + + MOD_SLOCK; + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + error = ENOENT; + } else { + error = 0; + if (TAILQ_NEXT(mod, flink)) + td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; + else + td->td_retval[0] = 0; + } + MOD_SUNLOCK; + return (error); +} + +struct module_stat_v1 { + int version; /* set to sizeof(struct module_stat) */ + char name[MAXMODNAME]; + int refs; + int id; +}; + +/* + * MPSAFE + */ +int +modstat(struct thread *td, struct modstat_args *uap) +{ + module_t mod; + modspecific_t data; + int error = 0; + int id, namelen, refs, version; + struct module_stat *stat; + char *name; + + MOD_SLOCK; + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + MOD_SUNLOCK; + return (ENOENT); + } + id = mod->id; + refs = mod->refs; + name = mod->name; + data = mod->data; + MOD_SUNLOCK; + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
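The comment above refers to the versioning idiom modstat() uses: userland stores sizeof(its struct) in the first field, and the handler accepts either the v1 or the current layout, filling only the fields that exist. A hedged userland sketch of the same idea, with made-up struct names:

/* Minimal sketch of the versioned-struct idiom; struct names are hypothetical. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct stat_v1 { int version; char name[32]; int refs; int id; };
struct stat_v2 { int version; char name[32]; int refs; int id; long data; };

static int fill_stat(void *out) {
    int version;

    memcpy(&version, out, sizeof(version));      /* like copyin(&stat->version) */
    if (version != sizeof(struct stat_v1) && version != sizeof(struct stat_v2))
        return (EINVAL);

    struct stat_v1 *s = out;
    snprintf(s->name, sizeof(s->name), "example_mod");
    s->refs = 1;
    s->id = 42;
    if (version == sizeof(struct stat_v2))       /* only newer callers get this */
        ((struct stat_v2 *)out)->data = 0x1234;
    return (0);
}

int main(void) {
    struct stat_v2 st = { .version = sizeof(st) };
    if (fill_stat(&st) == 0)
        printf("%s id=%d data=%ld\n", st.name, st.id, st.data);
    return 0;
}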
+ */ + if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) + return (error); + if (version != sizeof(struct module_stat_v1) + && version != sizeof(struct module_stat)) + return (EINVAL); + namelen = strlen(mod->name) + 1; + if (namelen > MAXMODNAME) + namelen = MAXMODNAME; + if ((error = copyout(name, &stat->name[0], namelen)) != 0) + return (error); + + if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) + return (error); + + /* + * >v1 stat includes module data. + */ + if (version == sizeof(struct module_stat)) + if ((error = copyout(&data, &stat->data, + sizeof(data))) != 0) + return (error); + td->td_retval[0] = 0; + return (error); +} + +/* + * MPSAFE + */ +int +modfind(struct thread *td, struct modfind_args *uap) +{ + int error = 0; + char name[MAXMODNAME]; + module_t mod; + + if ((error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) != 0) + return (error); + + MOD_SLOCK; + mod = module_lookupbyname(name); + if (mod == NULL) + error = ENOENT; + else + td->td_retval[0] = module_getid(mod); + MOD_SUNLOCK; + return (error); +} diff --git a/sys/kern/kern_mtxpool.c b/sys/kern/kern_mtxpool.c new file mode 100644 index 0000000..3d4aa1c --- /dev/null +++ b/sys/kern/kern_mtxpool.c @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. Copyright + * terms are as specified in the COPYRIGHT file at the base of the source + * tree. + * + * Mutex pool routines. These routines are designed to be used as short + * term leaf mutexes (e.g. the last mutex you might aquire other then + * calling msleep()). They operate using a shared pool. A mutex is chosen + * from the pool based on the supplied pointer (which may or may not be + * valid). + * + * Advantages: + * - no structural overhead. Mutexes can be associated with structures + * without adding bloat to the structures. + * - mutexes can be obtained for invalid pointers, useful when uses + * mutexes to interlock destructor ops. + * - no initialization/destructor overhead + * - can be used with msleep. + * + * Disadvantages: + * - should generally only be used as leaf mutexes + * - pool/pool dependancy ordering cannot be depended on. + * - possible L1 cache mastersip contention between cpus + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#ifndef MTX_POOL_SIZE +#define MTX_POOL_SIZE 128 +#endif +#define MTX_POOL_MASK (MTX_POOL_SIZE-1) + +static struct mtx mtx_pool_ary[MTX_POOL_SIZE]; + +int mtx_pool_valid = 0; + +/* + * Inline version of mtx_pool_find(), used to streamline our main API + * function calls. + */ +static __inline +struct mtx * +_mtx_pool_find(void *ptr) +{ + int p; + + p = (int)(uintptr_t)ptr; + return(&mtx_pool_ary[(p ^ (p >> 6)) & MTX_POOL_MASK]); +} + +static void +mtx_pool_setup(void *dummy __unused) +{ + int i; + + for (i = 0; i < MTX_POOL_SIZE; ++i) + mtx_init(&mtx_pool_ary[i], "pool mutex", NULL, MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + mtx_pool_valid = 1; +} + +/* + * Obtain a (shared) mutex from the pool. The returned mutex is a leaf + * level mutex, meaning that if you obtain it you cannot obtain any other + * mutexes until you release it. You can legally msleep() on the mutex. 
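The pool lookup above hashes an arbitrary pointer into a fixed array of mutexes, so an object gets short-term locking without embedding a mutex of its own. A userland sketch using pthread mutexes; the hash expression mirrors _mtx_pool_find(), everything else is illustrative:

/* Userland sketch of the pool-mutex idea; not the kernel implementation. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define POOL_SIZE 128                 /* must be a power of two */
#define POOL_MASK (POOL_SIZE - 1)

static pthread_mutex_t pool[POOL_SIZE];

static pthread_mutex_t *pool_find(void *ptr) {
    int p = (int)(uintptr_t)ptr;
    return (&pool[(p ^ (p >> 6)) & POOL_MASK]);
}

static void pool_setup(void) {
    for (int i = 0; i < POOL_SIZE; i++)
        pthread_mutex_init(&pool[i], NULL);
}

int main(void) {
    int some_object;

    pool_setup();
    pthread_mutex_lock(pool_find(&some_object));   /* like mtx_pool_lock(ptr) */
    some_object = 1;
    pthread_mutex_unlock(pool_find(&some_object)); /* like mtx_pool_unlock(ptr) */
    printf("object=%d, slot=%ld\n", some_object,
        (long)(pool_find(&some_object) - pool));
    return 0;
}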
+ */ +struct mtx * +mtx_pool_alloc(void) +{ + static int si; + return(&mtx_pool_ary[si++ & MTX_POOL_MASK]); +} + +/* + * Return the (shared) pool mutex associated with the specified address. + * The returned mutex is a leaf level mutex, meaning that if you obtain it + * you cannot obtain any other mutexes until you release it. You can + * legally msleep() on the mutex. + */ +struct mtx * +mtx_pool_find(void *ptr) +{ + return(_mtx_pool_find(ptr)); +} + +/* + * Combined find/lock operation. Lock the pool mutex associated with + * the specified address. + */ +void +mtx_pool_lock(void *ptr) +{ + mtx_lock(_mtx_pool_find(ptr)); +} + +/* + * Combined find/unlock operation. Unlock the pool mutex associated with + * the specified address. + */ +void +mtx_pool_unlock(void *ptr) +{ + mtx_unlock(_mtx_pool_find(ptr)); +} + +SYSINIT(mtxpooli, SI_SUB_MTX_POOL, SI_ORDER_FIRST, mtx_pool_setup, NULL) + diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c new file mode 100644 index 0000000..08bca8d --- /dev/null +++ b/sys/kern/kern_mutex.c @@ -0,0 +1,986 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Machine independent bits of mutex implementation. + */ + +#include "opt_adaptive_mutexes.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sbuf.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> + +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/clock.h> +#include <machine/cpu.h> + +#include <ddb/ddb.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +/* + * Internal utility macros. 
+ */ +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) + +#define mtx_owner(m) (mtx_unowned((m)) ? NULL \ + : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK)) + +/* XXXKSE This test will change. */ +#define thread_running(td) \ + ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU) + +/* + * Lock classes for sleep and spin mutexes. + */ +struct lock_class lock_class_mtx_sleep = { + "sleep mutex", + LC_SLEEPLOCK | LC_RECURSABLE +}; +struct lock_class lock_class_mtx_spin = { + "spin mutex", + LC_SPINLOCK | LC_RECURSABLE +}; + +/* + * System-wide mutexes + */ +struct mtx sched_lock; +struct mtx Giant; + +/* + * Prototypes for non-exported routines. + */ +static void propagate_priority(struct thread *); + +static void +propagate_priority(struct thread *td) +{ + int pri = td->td_priority; + struct mtx *m = td->td_blocked; + + mtx_assert(&sched_lock, MA_OWNED); + for (;;) { + struct thread *td1; + + td = mtx_owner(m); + + if (td == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of thread that + * next acquires the mutex. + */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + + MPASS(td->td_proc->p_magic == P_MAGIC); + KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + if (td->td_priority <= pri) /* lower is higher priority */ + return; + + /* + * Bump this thread's priority. + */ + td->td_priority = pri; + + /* + * If lock holder is actually running, just bump priority. + */ + if (thread_running(td)) { + MPASS(td->td_proc->p_stat == SRUN + || td->td_proc->p_stat == SZOMB + || td->td_proc->p_stat == SSTOP); + return; + } + +#ifndef SMP + /* + * For UP, we check to see if td is curthread (this shouldn't + * ever happen however as it would mean we are in a deadlock.) + */ + KASSERT(td != curthread, ("Deadlock detected")); +#endif + + /* + * If on run queue move to new run queue, and quit. + * XXXKSE this gets a lot more complicated under threads + * but try anyhow. + */ + if (td->td_proc->p_stat == SRUN) { + MPASS(td->td_blocked == NULL); + remrunqueue(td); + setrunqueue(td); + return; + } + + /* + * If we aren't blocked on a mutex, we should be. + */ + KASSERT(td->td_proc->p_stat == SMTX, ( + "process %d(%s):%d holds %s but isn't blocked on a mutex\n", + td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + m->mtx_object.lo_name)); + + /* + * Pick up the mutex that td is blocked on. + */ + m = td->td_blocked; + MPASS(m != NULL); + + /* + * Check if the thread needs to be moved up on + * the blocked chain + */ + if (td == TAILQ_FIRST(&m->mtx_blocked)) { + continue; + } + + td1 = TAILQ_PREV(td, threadqueue, td_blkq); + if (td1->td_priority <= pri) { + continue; + } + + /* + * Remove thread from blocked chain and determine where + * it should be moved up to. Since we know that td1 has + * a lower priority than td, we know that at least one + * thread in the chain has a lower priority and that + * td1 will thus not be NULL after the loop. 
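propagate_priority() above relies on the queue of blocked threads staying sorted by priority (numerically lower means more urgent), with a new waiter inserted before the first waiter that is less urgent. A simplified stand-in for that sorted insertion, using a plain linked list instead of the kernel's TAILQ of struct thread:

/* Sketch of priority-ordered wait-queue insertion; everything is illustrative. */
#include <stdio.h>

struct waiter {
    int priority;                 /* lower value = higher priority */
    struct waiter *next;
};

static void enqueue_sorted(struct waiter **head, struct waiter *w) {
    struct waiter **pp = head;

    while (*pp != NULL && (*pp)->priority <= w->priority)
        pp = &(*pp)->next;        /* FIFO among equal priorities */
    w->next = *pp;
    *pp = w;
}

int main(void) {
    struct waiter a = { 120, NULL }, b = { 80, NULL }, c = { 120, NULL };
    struct waiter *head = NULL, *w;

    enqueue_sorted(&head, &a);
    enqueue_sorted(&head, &b);
    enqueue_sorted(&head, &c);
    for (w = head; w != NULL; w = w->next)
        printf("pri %d\n", w->priority);          /* prints 80, 120, 120 */
    return 0;
}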
+ */ + TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq); + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) { + MPASS(td1->td_proc->p_magic == P_MAGIC); + if (td1->td_priority > pri) + break; + } + + MPASS(td1 != NULL); + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + CTR4(KTR_LOCK, + "propagate_priority: p %p moved before %p on [%p] %s", + td, td1, m, m->mtx_object.lo_name); + } +} + +#ifdef MUTEX_PROFILING +SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging"); +SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling"); +static int mutex_prof_enable = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW, + &mutex_prof_enable, 0, "Enable tracing of mutex holdtime"); + +struct mutex_prof { + const char *name; + const char *file; + int line; +#define MPROF_MAX 0 +#define MPROF_TOT 1 +#define MPROF_CNT 2 +#define MPROF_AVG 3 + uintmax_t counter[4]; + struct mutex_prof *next; +}; + +/* + * mprof_buf is a static pool of profiling records to avoid possible + * reentrance of the memory allocation functions. + * + * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE. + */ +#define NUM_MPROF_BUFFERS 1000 +static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS]; +static int first_free_mprof_buf; +#define MPROF_HASH_SIZE 1009 +static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE]; + +static int mutex_prof_acquisitions; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD, + &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded"); +static int mutex_prof_records; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD, + &mutex_prof_records, 0, "Number of profiling records"); +static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD, + &mutex_prof_maxrecords, 0, "Maximum number of profiling records"); +static int mutex_prof_rejected; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD, + &mutex_prof_rejected, 0, "Number of rejected profiling records"); +static int mutex_prof_hashsize = MPROF_HASH_SIZE; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD, + &mutex_prof_hashsize, 0, "Hash size"); +static int mutex_prof_collisions = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD, + &mutex_prof_collisions, 0, "Number of hash collisions"); + +/* + * mprof_mtx protects the profiling buffers and the hash. 
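The profiler above draws its records from a fixed static pool, so it never re-enters the allocator it may be measuring, and chains them into a hash table keyed on file and line. A compact userland sketch of that arrangement; the pool size and hash are illustrative:

/* Sketch of a static record pool plus chained hash; not the kernel's code. */
#include <stdio.h>
#include <string.h>

#define NRECORDS  1000
#define HASHSIZE  1009                /* prime, larger than NRECORDS */

struct rec {
    const char *file;
    int line;
    unsigned long long total, count;
    struct rec *next;
};

static struct rec pool[NRECORDS];
static int first_free;
static struct rec *hash[HASHSIZE];

static struct rec *lookup(const char *file, int line) {
    unsigned h = (unsigned)line;
    const char *q;
    struct rec *r;

    for (q = file; *q != '\0'; q++)
        h = (h * 2 + (unsigned char)*q) % HASHSIZE;
    for (r = hash[h]; r != NULL; r = r->next)
        if (r->line == line && strcmp(r->file, file) == 0)
            return (r);
    if (first_free >= NRECORDS)       /* pool exhausted: drop the sample */
        return (NULL);
    r = &pool[first_free++];
    r->file = file;
    r->line = line;
    r->next = hash[h];
    hash[h] = r;
    return (r);
}

int main(void) {
    struct rec *r = lookup("kern_example.c", 42);
    if (r != NULL) {
        r->total += 1234;
        r->count++;
        printf("%s:%d avg %llu\n", r->file, r->line, r->total / r->count);
    }
    return 0;
}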
+ */ +static struct mtx mprof_mtx; +MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET); + +static u_int64_t +nanoseconds(void) +{ + struct timespec tv; + + nanotime(&tv); + return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec); +} + +static int +dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int error, i; + + if (first_free_mprof_buf == 0) + return SYSCTL_OUT(req, "No locking recorded", + sizeof("No locking recorded")); + + sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND); + sbuf_printf(sb, "%12s %12s %12s %12s %s\n", + "max", "total", "count", "average", "name"); + mtx_lock_spin(&mprof_mtx); + for (i = 0; i < first_free_mprof_buf; ++i) + sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n", + mprof_buf[i].counter[MPROF_MAX] / 1000, + mprof_buf[i].counter[MPROF_TOT] / 1000, + mprof_buf[i].counter[MPROF_CNT], + mprof_buf[i].counter[MPROF_AVG] / 1000, + mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name); + mtx_unlock_spin(&mprof_mtx); + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return (error); +} +SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics"); +#endif + +/* + * Function versions of the inlined __mtx_* macros. These are used by + * modules and can also be called from assembly language if needed. + */ +void +_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + _get_sleep_lock(m, curthread, opts, file, line); + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +#ifdef MUTEX_PROFILING + /* don't reset the timer when/if recursing */ + if (m->acqtime == 0) { + m->file = file; + m->line = line; + m->acqtime = mutex_prof_enable ? 
nanoseconds() : 0; + ++mutex_prof_acquisitions; + } +#endif +} + +void +_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); +#ifdef MUTEX_PROFILING + if (m->acqtime != 0) { + static const char *unknown = "(unknown)"; + struct mutex_prof *mpp; + u_int64_t acqtime, now; + const char *p, *q; + volatile u_int hash; + + now = nanoseconds(); + acqtime = m->acqtime; + m->acqtime = 0; + if (now <= acqtime) + goto out; + for (p = file; strncmp(p, "../", 3) == 0; p += 3) + /* nothing */ ; + if (p == NULL || *p == '\0') + p = unknown; + for (hash = line, q = p; *q != '\0'; ++q) + hash = (hash * 2 + *q) % MPROF_HASH_SIZE; + mtx_lock_spin(&mprof_mtx); + for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next) + if (mpp->line == line && strcmp(mpp->file, p) == 0) + break; + if (mpp == NULL) { + /* Just exit if we cannot get a trace buffer */ + if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) { + ++mutex_prof_rejected; + goto unlock; + } + mpp = &mprof_buf[first_free_mprof_buf++]; + mpp->name = mtx_name(m); + mpp->file = p; + mpp->line = line; + mpp->next = mprof_hash[hash]; + if (mprof_hash[hash] != NULL) + ++mutex_prof_collisions; + mprof_hash[hash] = mpp; + ++mutex_prof_records; + } + /* + * Record if the mutex has been held longer now than ever + * before + */ + if ((now - acqtime) > mpp->counter[MPROF_MAX]) + mpp->counter[MPROF_MAX] = now - acqtime; + mpp->counter[MPROF_TOT] += now - acqtime; + mpp->counter[MPROF_CNT] += 1; + mpp->counter[MPROF_AVG] = + mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT]; +unlock: + mtx_unlock_spin(&mprof_mtx); + } +out: +#endif + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + _rel_sleep_lock(m, curthread, opts, file, line); +} + +void +_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); +#if defined(SMP) || LOCK_DEBUG > 0 + _get_spin_lock(m, curthread, opts, file, line); +#else + critical_enter(); +#endif + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +} + +void +_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); +#if defined(SMP) || LOCK_DEBUG > 0 + _rel_spin_lock(m); +#else + critical_exit(); +#endif +} + +/* + * The important part of mtx_trylock{,_flags}() + * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that + * if we're called, it's because we know we don't already own this lock. + */ +int +_mtx_trylock(struct mtx *m, int opts, const char *file, int line) +{ + int rval; + + MPASS(curthread != NULL); + + rval = _obtain_lock(m, curthread); + + LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line); + if (rval) { + /* + * We do not handle recursion in _mtx_trylock; see the + * note at the top of the routine. + */ + KASSERT(!mtx_recursed(m), + ("mtx_trylock() called on a recursed mutex")); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + } + + return (rval); +} + +/* + * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. + * + * We call this if the lock is either contested (i.e. 
we need to go to + * sleep waiting for it), or if we need to recurse on it. + */ +void +_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td = curthread; +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + struct thread *owner; +#endif + + if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) { + m->mtx_recurse++; + atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); + return; + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR4(KTR_LOCK, + "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", + m->mtx_object.lo_name, (void *)m->mtx_lock, file, line); + + while (!_obtain_lock(m, td)) { + uintptr_t v; + struct thread *td1; + + mtx_lock_spin(&sched_lock); + /* + * Check if the lock has been released while spinning for + * the sched_lock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + + /* + * The mutex was marked contested on release. This means that + * there are threads blocked on it. + */ + if (v == MTX_CONTESTED) { + td1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(td1 != NULL); + m->mtx_lock = (uintptr_t)td | MTX_CONTESTED; + + if (td1->td_priority < td->td_priority) + td->td_priority = td1->td_priority; + mtx_unlock_spin(&sched_lock); + return; + } + + /* + * If the mutex isn't already contested and a failure occurs + * setting the contested bit, the mutex was either released + * or the state of the MTX_RECURSED bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, + (void *)(v | MTX_CONTESTED))) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + /* + * If the current owner of the lock is executing on another + * CPU, spin instead of blocking. + */ + owner = (struct thread *)(v & MTX_FLAGMASK); + if (m != &Giant && thread_running(owner)) { + mtx_unlock_spin(&sched_lock); + while (mtx_owner(m) == owner && thread_running(owner)) { +#ifdef __i386__ + ia32_pause(); +#endif + } + continue; + } +#endif /* SMP && ADAPTIVE_MUTEXES */ + + /* + * We definitely must sleep for this lock. + */ + mtx_assert(m, MA_NOTOWNED); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM context, we + * must clean up before going to sleep. + */ + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_lock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* + * Put us on the list of threads blocked on this mutex. + */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + td1 = mtx_owner(m); + LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } else { + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) + if (td1->td_priority > td->td_priority) + break; + if (td1) + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } + + /* + * Save who we're blocked on. 
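The contested path above encodes everything in a single lock word: MTX_UNOWNED, or the owner with a contested flag in the low bits, set by a failed CAS so the releaser knows a waiter must be woken. A toy C11 sketch of that protocol, with sched_yield() standing in for actually sleeping; this illustrates the idea only and is not the kernel's _obtain_lock()/_mtx_lock_sleep():

/* Toy lock-word protocol; values and functions here are illustrative. */
#include <stdatomic.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>

#define LK_UNOWNED   0u
#define LK_CONTESTED 1u

static _Atomic uintptr_t lock_word = LK_UNOWNED;

static void lock_acquire(uintptr_t self) {
    for (;;) {
        uintptr_t v = LK_UNOWNED;
        if (atomic_compare_exchange_weak(&lock_word, &v, self))
            return;                           /* fast path: was unowned */
        v = atomic_load(&lock_word);
        if (v != LK_UNOWNED && (v & LK_CONTESTED) == 0)
            atomic_compare_exchange_weak(&lock_word, &v, v | LK_CONTESTED);
        sched_yield();                        /* stand-in for sleeping */
    }
}

static void lock_release(uintptr_t self) {
    uintptr_t v = self;
    if (atomic_compare_exchange_strong(&lock_word, &v, LK_UNOWNED))
        return;                               /* no waiters recorded */
    /* contested: a real implementation would pick a waiter to run here */
    atomic_store(&lock_word, LK_UNOWNED);
}

int main(void) {
    uintptr_t self = 2;                       /* even, so the low flag bit is free */
    lock_acquire(self);
    printf("owner=%#lx\n", (unsigned long)atomic_load(&lock_word));
    lock_release(self);
    return 0;
}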
+ */ + td->td_blocked = m; + td->td_mtxname = m->mtx_object.lo_name; + td->td_proc->p_stat = SMTX; + propagate_priority(td); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m, + m->mtx_object.lo_name); + + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p free from blocked on [%p] %s", + td, m, m->mtx_object.lo_name); + + mtx_unlock_spin(&sched_lock); + } + + return; +} + +/* + * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. + * + * This is only called if we need to actually spin for the lock. Recursion + * is handled inline. + */ +void +_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line) +{ + int i = 0; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); + + for (;;) { + if (_obtain_lock(m, curthread)) + break; + + /* Give interrupts a chance while we spin. */ + critical_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) { +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + if (i < 60000000) + DELAY(1); +#ifdef DDB + else if (!db_active) +#else + else +#endif + panic("spin lock %s held by %p for > 5 seconds", + m->mtx_object.lo_name, (void *)m->mtx_lock); +#ifdef __i386__ + ia32_pause(); +#endif + } + critical_enter(); + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); + + return; +} + +/* + * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. + * + * We are only called here if the lock is recursed or contested (i.e. we + * need to wake up a blocked thread). + */ +void +_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td, *td1; + struct mtx *m1; + int pri; + + td = curthread; + + if (mtx_recursed(m)) { + if (--(m->mtx_recurse) == 0) + atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); + return; + } + + mtx_lock_spin(&sched_lock); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); + + td1 = TAILQ_FIRST(&m->mtx_blocked); +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + if (td1 == NULL) { + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); + mtx_unlock_spin(&sched_lock); + return; + } +#endif + MPASS(td->td_proc->p_magic == P_MAGIC); + MPASS(td1->td_proc->p_magic == P_MAGIC); + + TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq); + + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); + } else + atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); + + pri = PRI_MAX; + LIST_FOREACH(m1, &td->td_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority; + if (cp < pri) + pri = cp; + } + + if (pri > td->td_base_pri) + pri = td->td_base_pri; + td->td_priority = pri; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", + m, td1); + + td1->td_blocked = NULL; + td1->td_proc->p_stat = SRUN; + setrunqueue(td1); + + if (td->td_critnest == 1 && td1->td_priority < pri) { +#ifdef notyet + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, 
opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + setrunqueue(td); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p switching out lock=%p", m, + (void *)m->mtx_lock); + + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", + m, (void *)m->mtx_lock); + } + + mtx_unlock_spin(&sched_lock); + + return; +} + +/* + * All the unlocking of MTX_SPIN locks is done inline. + * See the _rel_spin_lock() macro for the details. + */ + +/* + * The backing function for the INVARIANTS-enabled mtx_assert() + */ +#ifdef INVARIANT_SUPPORT +void +_mtx_assert(struct mtx *m, int what, const char *file, int line) +{ + + if (panicstr != NULL) + return; + switch (what) { + case MA_OWNED: + case MA_OWNED | MA_RECURSED: + case MA_OWNED | MA_NOTRECURSED: + if (!mtx_owned(m)) + panic("mutex %s not owned at %s:%d", + m->mtx_object.lo_name, file, line); + if (mtx_recursed(m)) { + if ((what & MA_NOTRECURSED) != 0) + panic("mutex %s recursed at %s:%d", + m->mtx_object.lo_name, file, line); + } else if ((what & MA_RECURSED) != 0) { + panic("mutex %s unrecursed at %s:%d", + m->mtx_object.lo_name, file, line); + } + break; + case MA_NOTOWNED: + if (mtx_owned(m)) + panic("mutex %s owned at %s:%d", + m->mtx_object.lo_name, file, line); + break; + default: + panic("unknown mtx_assert at %s:%d", file, line); + } +} +#endif + +/* + * The MUTEX_DEBUG-enabled mtx_validate() + * + * Most of these checks have been moved off into the LO_INITIALIZED flag + * maintained by the witness code. + */ +#ifdef MUTEX_DEBUG + +void mtx_validate(struct mtx *); + +void +mtx_validate(struct mtx *m) +{ + +/* + * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly + * we can re-enable the kernacc() checks. + */ +#ifndef __alpha__ + /* + * Can't call kernacc() from early init386(), especially when + * initializing Giant mutex, because some stuff in kernacc() + * requires Giant itself. + */ + if (!cold) + if (!kernacc((caddr_t)m, sizeof(m), + VM_PROT_READ | VM_PROT_WRITE)) + panic("Can't read and write to mutex %p", m); +#endif +} +#endif + +/* + * General init routine used by the MTX_SYSINIT() macro. + */ +void +mtx_sysinit(void *arg) +{ + struct mtx_args *margs = arg; + + mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); +} + +/* + * Mutex initialization routine; initialize lock `m' of type contained in + * `opts' with options contained in `opts' and name `name.' The optional + * lock type `type' is used as a general lock category name for use with + * witness. + */ +void +mtx_init(struct mtx *m, const char *name, const char *type, int opts) +{ + struct lock_object *lock; + + MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | + MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0); + +#ifdef MUTEX_DEBUG + /* Diagnostic and error correction */ + mtx_validate(m); +#endif + + lock = &m->mtx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("mutex %s %p already initialized", name, m)); + bzero(m, sizeof(*m)); + if (opts & MTX_SPIN) + lock->lo_class = &lock_class_mtx_spin; + else + lock->lo_class = &lock_class_mtx_sleep; + lock->lo_name = name; + lock->lo_type = type != NULL ? 
type : name; + if (opts & MTX_QUIET) + lock->lo_flags = LO_QUIET; + if (opts & MTX_RECURSE) + lock->lo_flags |= LO_RECURSABLE; + if (opts & MTX_SLEEPABLE) + lock->lo_flags |= LO_SLEEPABLE; + if ((opts & MTX_NOWITNESS) == 0) + lock->lo_flags |= LO_WITNESS; + if (opts & MTX_DUPOK) + lock->lo_flags |= LO_DUPOK; + + m->mtx_lock = MTX_UNOWNED; + TAILQ_INIT(&m->mtx_blocked); + + LOCK_LOG_INIT(lock, opts); + + WITNESS_INIT(lock); +} + +/* + * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be + * passed in as a flag here because if the corresponding mtx_init() was + * called with MTX_QUIET set, then it will already be set in the mutex's + * flags. + */ +void +mtx_destroy(struct mtx *m) +{ + + LOCK_LOG_DESTROY(&m->mtx_object, 0); + + if (!mtx_owned(m)) + MPASS(mtx_unowned(m)); + else { + MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); + + /* Tell witness this isn't locked to make it happy. */ + WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__, + __LINE__); + } + + WITNESS_DESTROY(&m->mtx_object); +} + +/* + * Intialize the mutex code and system mutexes. This is called from the MD + * startup code prior to mi_startup(). The per-CPU data space needs to be + * setup before this is called. + */ +void +mutex_init(void) +{ + + /* Setup thread0 so that mutexes work. */ + LIST_INIT(&thread0.td_contested); + + /* + * Initialize mutexes. + */ + mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_lock(&Giant); +} + +/* + * Encapsulated Giant mutex routines. These routines provide encapsulation + * control for the Giant mutex, allowing sysctls to be used to turn on and + * off Giant around certain subsystems. The default value for the sysctls + * are set to what developers believe is stable and working in regards to + * the Giant pushdown. Developers should not turn off Giant via these + * sysctls unless they know what they are doing. + * + * Callers of mtx_lock_giant() are expected to pass the return value to an + * accompanying mtx_unlock_giant() later on. If multiple subsystems are + * effected by a Giant wrap, all related sysctl variables must be zero for + * the subsystem call to operate without Giant (as determined by the caller). + */ + +SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation"); + +static int kern_giant_all = 0; +SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, ""); + +int kern_giant_proc = 1; /* Giant around PROC locks */ +int kern_giant_file = 1; /* Giant around struct file & filedesc */ +int kern_giant_ucred = 1; /* Giant around ucred */ +SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, ""); + +int +mtx_lock_giant(int sysctlvar) +{ + if (sysctlvar || kern_giant_all) { + mtx_lock(&Giant); + return(1); + } + return(0); +} + +void +mtx_unlock_giant(int s) +{ + if (s) + mtx_unlock(&Giant); +} + diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..cd2db73 --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,935 @@ +/*********************************************************************** + * * + * Copyright (c) David L. 
Mills 1993-2001 * + * * + * Permission to use, copy, modify, and distribute this software and * + * its documentation for any purpose and without fee is hereby * + * granted, provided that the above copyright notice appears in all * + * copies and that both the copyright notice and this permission * + * notice appear in supporting documentation, and that the name * + * University of Delaware not be used in advertising or publicity * + * pertaining to distribution of the software without specific, * + * written prior permission. The University of Delaware makes no * + * representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied * + * warranty. * + * * + **********************************************************************/ + +/* + * Adapted from the original sources for FreeBSD and timecounters by: + * Poul-Henning Kamp <phk@FreeBSD.org>. + * + * The 32bit version of the "LP" macros seems a bit past its "sell by" + * date so I have retained only the 64bit version and included it directly + * in this file. + * + * Only minor changes done to interface with the timecounters over in + * sys/kern/kern_clock.c. Some of the comments below may be (even more) + * confusing and/or plain wrong in that context. + * + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/time.h> +#include <sys/timex.h> +#include <sys/timetc.h> +#include <sys/timepps.h> +#include <sys/sysctl.h> + +/* + * Single-precision macros for 64-bit machines + */ +typedef long long l_fp; +#define L_ADD(v, u) ((v) += (u)) +#define L_SUB(v, u) ((v) -= (u)) +#define L_ADDHI(v, a) ((v) += (long long)(a) << 32) +#define L_NEG(v) ((v) = -(v)) +#define L_RSHIFT(v, n) \ + do { \ + if ((v) < 0) \ + (v) = -(-(v) >> (n)); \ + else \ + (v) = (v) >> (n); \ + } while (0) +#define L_MPY(v, a) ((v) *= (a)) +#define L_CLR(v) ((v) = 0) +#define L_ISNEG(v) ((v) < 0) +#define L_LINT(v, a) ((v) = (long long)(a) << 32) +#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) + +/* + * Generic NTP kernel interface + * + * These routines constitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by other routines in this module to adjust the + * phase and frequency of the clock discipline loop which controls the + * system clock. + * + * When the kernel time is reckoned directly in nanoseconds (NTP_NANO + * defined), the time at each tick interrupt is derived directly from + * the kernel time variable. When the kernel time is reckoned in + * microseconds, (NTP_NANO undefined), the time is derived from the + * kernel time variable together with a variable representing the + * leftover nanoseconds at the last tick interrupt. In either case, the + * current nanosecond time is reckoned from these values plus an + * interpolated value derived by the clock routines in another + * architecture-specific module. The interpolation can use either a + * dedicated counter or a processor cycle counter (PCC) implemented in + * some architectures. 
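The l_fp macros above implement 64-bit 32.32 fixed point. A quick userland check of the representation, reusing the macro definitions verbatim with arbitrary values:

/* Userland check of the 32.32 fixed-point macros; the inputs are arbitrary. */
#include <stdio.h>

typedef long long l_fp;
#define L_LINT(v, a)  ((v) = (long long)(a) << 32)
#define L_GINT(v)     ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
#define L_ADD(v, u)   ((v) += (u))
#define L_RSHIFT(v, n) \
    do { \
        if ((v) < 0) \
            (v) = -(-(v) >> (n)); \
        else \
            (v) = (v) >> (n); \
    } while (0)

int main(void) {
    l_fp offset, step;

    L_LINT(offset, -500000);      /* -500000 ns as 32.32 fixed point */
    L_LINT(step, 125);            /* +125 ns */
    L_ADD(offset, step);
    L_RSHIFT(offset, 2);          /* divide by 4, rounding toward zero */
    printf("integer part: %lld ns\n", L_GINT(offset));   /* prints -124968 */
    return 0;
}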
+ * + * Note that all routines must run at priority splclock or higher. + */ +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The nanosecond clock discipline uses two variable types, time + * variables and frequency variables. Both types are represented as 64- + * bit fixed-point quantities with the decimal point between two 32-bit + * halves. On a 32-bit machine, each half is represented as a single + * word and mathematical operations are done using multiple-precision + * arithmetic. On a 64-bit machine, ordinary computer arithmetic is + * used. + * + * A time variable is a signed 64-bit fixed-point number in ns and + * fraction. It represents the remaining time offset to be amortized + * over succeeding tick interrupts. The maximum time offset is about + * 0.5 s and the resolution is about 2.3e-10 ns. + * + * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |s s s| ns | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | fraction | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * A frequency variable is a signed 64-bit fixed-point number in ns/s + * and fraction. It represents the ns and fraction to be added to the + * kernel time variable at each second. The maximum frequency offset is + * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. + * + * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |s s s s s s s s s s s s s| ns/s | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | fraction | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. + */ +#define SHIFT_PLL 4 /* PLL loop gain (shift) */ +#define SHIFT_FLL 2 /* FLL loop gain (shift) */ + +static int time_state = TIME_OK; /* clock state */ +static int time_status = STA_UNSYNC; /* clock status bits */ +static long time_tai; /* TAI offset (s) */ +static long time_monitor; /* last time offset scaled (ns) */ +static long time_constant; /* poll interval (shift) (s) */ +static long time_precision = 1; /* clock precision (ns) */ +static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ +static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ +static long time_reftime; /* time at last adjustment (s) */ +static long time_tick; /* nanoseconds per tick (ns) */ +static l_fp time_offset; /* time offset (ns) */ +static l_fp time_freq; /* frequency offset (ns/s) */ +static l_fp time_adj; /* tick adjust (ns/s) */ + +static int64_t time_adjtime; /* correction from adjtime(2) (usec) */ + +#ifdef PPS_SYNC +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available and connected via a modem control lead. They establish + * the engineering parameters of the clock discipline loop when + * controlled by the PPS signal. 
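SHIFT_PLL and SHIFT_FLL above express loop gain as a right shift. As a toy illustration only (not the kernel's hardupdate()/second_overflow() logic), applying a 1/2^SHIFT_PLL fraction of the remaining offset each second makes the residual phase error decay geometrically:

/* Toy shift-based loop-gain illustration; constants are hypothetical. */
#include <stdio.h>

#define SHIFT_PLL 4    /* gain = 1/16 per second, expressed as a shift */

int main(void) {
    long offset_ns = 500000;       /* 0.5 ms initial phase error */
    for (int sec = 1; sec <= 5; sec++) {
        long adj = offset_ns >> SHIFT_PLL;   /* slew applied this second */
        offset_ns -= adj;
        printf("t=%ds adj=%ldns remaining=%ldns\n", sec, adj, offset_ns);
    }
    return 0;
}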
+ */ +#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ +#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ +#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ +#define PPS_PAVG 4 /* phase avg interval (s) (shift) */ +#define PPS_VALID 120 /* PPS signal watchdog max (s) */ +#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ +#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ + +static struct timespec pps_tf[3]; /* phase median filter */ +static l_fp pps_freq; /* scaled frequency offset (ns/s) */ +static long pps_fcount; /* frequency accumulator */ +static long pps_jitter; /* nominal jitter (ns) */ +static long pps_stabil; /* nominal stability (scaled ns/s) */ +static long pps_lastsec; /* time at last calibration (s) */ +static int pps_valid; /* signal watchdog counter */ +static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ +static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ +static int pps_intcnt; /* wander counter */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ +#endif /* PPS_SYNC */ +/* + * End of phase/frequency-lock loop (PLL/FLL) definitions + */ + +static void ntp_init(void); +static void hardupdate(long offset); + +/* + * ntp_gettime() - NTP user application interface + * + * See the timex.h header file for synopsis and API description. Note + * that the TAI offset is returned in the ntvtimeval.tai structure + * member. + */ +static int +ntp_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct ntptimeval ntv; /* temporary structure */ + struct timespec atv; /* nanosecond time */ + + nanotime(&atv); + ntv.time.tv_sec = atv.tv_sec; + ntv.time.tv_nsec = atv.tv_nsec; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.tai = time_tai; + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions occur, + * an error is returned, instead of the status word. Most + * applications will care only about the fact the system clock + * may not be trusted, not about the details. 
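The same information can be read from user space through the kern.ntp_pll.gettime sysctl registered just below; a minimal userland sketch, assuming the exported opaque object is the struct ntptimeval filled in above and keeping error handling to a bare minimum:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <stdio.h>

int
main(void)
{
        struct ntptimeval ntv;
        size_t len = sizeof(ntv);

        if (sysctlbyname("kern.ntp_pll.gettime", &ntv, &len, NULL, 0) == -1) {
                perror("sysctlbyname");
                return (1);
        }
        printf("state %d maxerror %ld esterror %ld tai %ld\n",
            ntv.time_state, ntv.maxerror, ntv.esterror, ntv.tai);
        return (0);
}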
+ * + * Hardware or software error + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + + /* + * PPS signal lost when either time or frequency synchronization + * requested + */ + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + + /* + * PPS jitter exceeded when time synchronization requested + */ + (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) || + + /* + * PPS wander exceeded or calibration error when frequency + * synchronization requested + */ + (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR))) + ntv.time_state = TIME_ERROR; + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, ""); +SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +#ifdef PPS_SYNC +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, ""); +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, ""); +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, ""); + +SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", ""); +SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", ""); +#endif +/* + * ntp_adjtime() - NTP daemon application interface + * + * See the timex.h header file for synopsis and API description. Note + * that the timex.constant structure member has a dual purpose to set + * the time constant and to set the TAI offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +/* + * MPSAFE + */ +int +ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap) +{ + struct timex ntv; /* temporary structure */ + long freq; /* frequency ns/s) */ + int modes; /* mode bits from structure */ + int s; /* caller priority */ + int error; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return(error); + + /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + * Note that either the time constant or TAI offset are loaded + * from the ntv.constant member, depending on the mode bits. If + * the STA_PLL bit in the status word is cleared, the state and + * status words are reset to the initial values at boot. + */ + mtx_lock(&Giant); + modes = ntv.modes; + if (modes) + error = suser(td); + if (error) + goto done2; + s = splclock(); + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + if (time_status & STA_PLL && !(ntv.status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; +#ifdef PPS_SYNC + pps_shift = PPS_FAVG; +#endif /* PPS_SYNC */ + } + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) { + if (ntv.constant < 0) + time_constant = 0; + else if (ntv.constant > MAXTC) + time_constant = MAXTC; + else + time_constant = ntv.constant; + } + if (modes & MOD_TAI) { + if (ntv.constant > 0) /* XXX zero & negative numbers ? 
*/ + time_tai = ntv.constant; + } +#ifdef PPS_SYNC + if (modes & MOD_PPSMAX) { + if (ntv.shift < PPS_FAVG) + pps_shiftmax = PPS_FAVG; + else if (ntv.shift > PPS_FAVGMAX) + pps_shiftmax = PPS_FAVGMAX; + else + pps_shiftmax = ntv.shift; + } +#endif /* PPS_SYNC */ + if (modes & MOD_NANO) + time_status |= STA_NANO; + if (modes & MOD_MICRO) + time_status &= ~STA_NANO; + if (modes & MOD_CLKB) + time_status |= STA_CLK; + if (modes & MOD_CLKA) + time_status &= ~STA_CLK; + if (modes & MOD_OFFSET) { + if (time_status & STA_NANO) + hardupdate(ntv.offset); + else + hardupdate(ntv.offset * 1000); + } + if (modes & MOD_FREQUENCY) { + freq = (ntv.freq * 1000LL) >> 16; + if (freq > MAXFREQ) + L_LINT(time_freq, MAXFREQ); + else if (freq < -MAXFREQ) + L_LINT(time_freq, -MAXFREQ); + else + L_LINT(time_freq, freq); +#ifdef PPS_SYNC + pps_freq = time_freq; +#endif /* PPS_SYNC */ + } + + /* + * Retrieve all clock variables. Note that the TAI offset is + * returned only by ntp_gettime(); + */ + if (time_status & STA_NANO) + ntv.offset = L_GINT(time_offset); + else + ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ + ntv.freq = L_GINT((time_freq / 1000LL) << 16); + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + if (time_status & STA_NANO) + ntv.precision = time_precision; + else + ntv.precision = time_precision / 1000; + ntv.tolerance = MAXFREQ * SCALE_PPM; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16); + if (time_status & STA_NANO) + ntv.jitter = pps_jitter; + else + ntv.jitter = pps_jitter / 1000; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (error) + goto done2; + + /* + * Status word error decode. See comments in + * ntp_gettime() routine. + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) || + (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR))) { + td->td_retval[0] = TIME_ERROR; + } else { + td->td_retval[0] = time_state; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * second_overflow() - called after ntp_tick_adjust() + * + * This routine is ordinarily called immediately following the above + * routine ntp_tick_adjust(). While these two routines are normally + * combined, they are separated here only for the purposes of + * simulation. + */ +void +ntp_update_second(int64_t *adjustment, time_t *newsec) +{ + int tickrate; + l_fp ftemp; /* 32/64-bit temporary */ + + /* + * On rollover of the second both the nanosecond and microsecond + * clocks are updated and the state machine cranked as + * necessary. The phase adjustment to be used for the next + * second is calculated and the maximum error is increased by + * the tolerance. + */ + time_maxerror += MAXFREQ / 1000; + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The nano_time() routine or + * external clock driver will insure that reported time + * is always monotonic. + */ + switch (time_state) { + + /* + * No warning. 
+ */ + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + /* + * Insert second 23:59:60 following second + * 23:59:59. + */ + case TIME_INS: + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if ((*newsec) % 86400 == 0) { + (*newsec)--; + time_state = TIME_OOP; + } + break; + + /* + * Delete second 23:59:59. + */ + case TIME_DEL: + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if (((*newsec) + 1) % 86400 == 0) { + (*newsec)++; + time_tai--; + time_state = TIME_WAIT; + } + break; + + /* + * Insert second in progress. + */ + case TIME_OOP: + time_tai++; + time_state = TIME_WAIT; + break; + + /* + * Wait for status bits to clear. + */ + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the total time adjustment for the next second + * in ns. The offset is reduced by a factor depending on + * whether the PPS signal is operating. Note that the + * value is in effect scaled by the clock frequency, + * since the adjustment is added at each tick interrupt. + */ + ftemp = time_offset; +#ifdef PPS_SYNC + /* XXX even if PPS signal dies we should finish adjustment ? */ + if (time_status & STA_PPSTIME && time_status & + STA_PPSSIGNAL) + L_RSHIFT(ftemp, pps_shift); + else + L_RSHIFT(ftemp, SHIFT_PLL + time_constant); +#else + L_RSHIFT(ftemp, SHIFT_PLL + time_constant); +#endif /* PPS_SYNC */ + time_adj = ftemp; + L_SUB(time_offset, ftemp); + L_ADD(time_adj, time_freq); + + /* + * Apply any correction from adjtime(2). If more than one second + * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM) + * until the last second is slewed the final < 500 usecs. + */ + if (time_adjtime != 0) { + if (time_adjtime > 1000000) + tickrate = 5000; + else if (time_adjtime < -1000000) + tickrate = -5000; + else if (time_adjtime > 500) + tickrate = 500; + else if (time_adjtime < -500) + tickrate = -500; + else if (time_adjtime != 0) + tickrate = time_adjtime; + else + tickrate = 0; /* GCC sucks! */ + time_adjtime -= tickrate; + L_LINT(ftemp, tickrate * 1000); + L_ADD(time_adj, ftemp); + } + *adjustment = time_adj; + +#ifdef PPS_SYNC + if (pps_valid > 0) + pps_valid--; + else + time_status &= ~STA_PPSSIGNAL; +#endif /* PPS_SYNC */ +} + +/* + * ntp_init() - initialize variables and structures + * + * This routine must be called after the kernel variables hz and tick + * are set or changed and before the next tick interrupt. In this + * particular implementation, these values are assumed set elsewhere in + * the kernel. The design allows the clock frequency and tick interval + * to be changed while the system is running. So, this routine should + * probably be integrated with the code that does that. + */ +static void +ntp_init() +{ + + /* + * The following variable must be initialized any time the + * kernel variable hz is changed. + */ + time_tick = NANOSECOND / hz; + + /* + * The following variables are initialized only at startup. Only + * those structures not cleared by the compiler need to be + * initialized, and these only in the simulator. In the actual + * kernel, any nonzero values here will quickly evaporate. 
+ */ + L_CLR(time_offset); + L_CLR(time_freq); +#ifdef PPS_SYNC + pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0; + pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0; + pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0; + pps_fcount = 0; + L_CLR(pps_freq); +#endif /* PPS_SYNC */ +} + +SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, ntp_init, NULL) + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. + * + * For uncompensated quartz crystal oscillators and nominal update + * intervals less than 256 s, operation should be in phase-lock mode, + * where the loop is disciplined to phase. For update intervals greater + * than 1024 s, operation should be in frequency-lock mode, where the + * loop is disciplined to frequency. Between 256 s and 1024 s, the mode + * is selected by the STA_MODE status bit. + */ +static void +hardupdate(offset) + long offset; /* clock offset (ns) */ +{ + long mtemp; + l_fp ftemp; + + /* + * Select how the phase is to be controlled and from which + * source. If the PPS signal is present and enabled to + * discipline the time, the PPS offset is used; otherwise, the + * argument offset is used. + */ + if (!(time_status & STA_PLL)) + return; + if (!(time_status & STA_PPSTIME && time_status & + STA_PPSSIGNAL)) { + if (offset > MAXPHASE) + time_monitor = MAXPHASE; + else if (offset < -MAXPHASE) + time_monitor = -MAXPHASE; + else + time_monitor = offset; + L_LINT(time_offset, time_monitor); + } + + /* + * Select how the frequency is to be controlled and in which + * mode (PLL or FLL). If the PPS signal is present and enabled + * to discipline the frequency, the PPS frequency is used; + * otherwise, the argument offset is used to compute it. + */ + if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { + time_reftime = time_second; + return; + } + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time_second; + mtemp = time_second - time_reftime; + L_LINT(ftemp, time_monitor); + L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); + L_MPY(ftemp, mtemp); + L_ADD(time_freq, ftemp); + time_status &= ~STA_MODE; + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > + MAXSEC)) { + L_LINT(ftemp, (time_monitor << 4) / mtemp); + L_RSHIFT(ftemp, SHIFT_FLL + 4); + L_ADD(time_freq, ftemp); + time_status |= STA_MODE; + } + time_reftime = time_second; + if (L_GINT(time_freq) > MAXFREQ) + L_LINT(time_freq, MAXFREQ); + else if (L_GINT(time_freq) < -MAXFREQ) + L_LINT(time_freq, -MAXFREQ); +} + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. There are two independent + * first-order feedback loops, one for the phase, the other for the + * frequency. 
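For context, hardpps() is intended to be driven by whatever code captures the on-time PPS edge (in FreeBSD that lives in the timecounter/PPS support, not in this file, and only when PPS_SYNC is configured). A minimal, purely illustrative sketch of such a caller, with the real capture details elided and the timestamp fraction standing in for the hardware counter value:

static void
pps_capture_example(void)
{
        struct timespec ts;

        /*
         * Hypothetical interrupt path: take the system time at the
         * PPS assert edge and hand it to hardpps().  The second
         * argument is the captured counter value in nanoseconds; the
         * fractional second of the timestamp is used here only to
         * keep the example short.
         */
        nanotime(&ts);
        hardpps(&ts, ts.tv_nsec);
}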
The phase loop measures and grooms the PPS phase offset + * and leaves it in a handy spot for the seconds overflow routine. The + * frequency loop averages successive PPS phase differences and + * calculates the PPS frequency offset, which is also processed by the + * seconds overflow routine. The code requires the caller to capture the + * time and architecture-dependent hardware counter values in + * nanoseconds at the on-time PPS signal transition. + * + * Note that, on some Unix systems this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for the actual time and frequency variables, which + * are determined by this routine and updated atomically. + */ +void +hardpps(tsp, nsec) + struct timespec *tsp; /* time at PPS */ + long nsec; /* hardware counter at PPS */ +{ + long u_sec, u_nsec, v_nsec; /* temps */ + l_fp ftemp; + + /* + * The signal is first processed by a range gate and frequency + * discriminator. The range gate rejects noise spikes outside + * the range +-500 us. The frequency discriminator rejects input + * signals with apparent frequency outside the range 1 +-500 + * PPM. If two hits occur in the same second, we ignore the + * later hit; if not and a hit occurs outside the range gate, + * keep the later hit for later comparison, but do not process + * it. + */ + time_status |= STA_PPSSIGNAL | STA_PPSJITTER; + time_status &= ~(STA_PPSWANDER | STA_PPSERROR); + pps_valid = PPS_VALID; + u_sec = tsp->tv_sec; + u_nsec = tsp->tv_nsec; + if (u_nsec >= (NANOSECOND >> 1)) { + u_nsec -= NANOSECOND; + u_sec++; + } + v_nsec = u_nsec - pps_tf[0].tv_nsec; + if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - + MAXFREQ) + return; + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0].tv_sec = u_sec; + pps_tf[0].tv_nsec = u_nsec; + + /* + * Compute the difference between the current and previous + * counter values. If the difference exceeds 0.5 s, assume it + * has wrapped around, so correct 1.0 s. If the result exceeds + * the tick interval, the sample point has crossed a tick + * boundary during the last second, so correct the tick. Very + * intricate. + */ + u_nsec = nsec; + if (u_nsec > (NANOSECOND >> 1)) + u_nsec -= NANOSECOND; + else if (u_nsec < -(NANOSECOND >> 1)) + u_nsec += NANOSECOND; + pps_fcount += u_nsec; + if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) + return; + time_status &= ~STA_PPSJITTER; + + /* + * A three-stage median filter is used to help denoise the PPS + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
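As a concrete instance of the filter just described: with samples of 80, 120 and 200 ns in pps_tf[], the median, 120 ns, becomes the offset estimate and the spread, 200 - 80 = 120 ns, becomes the jitter sample. The selection ladder below is equivalent to this small helper (the name is illustrative only):

static long
median3(long a, long b, long c)
{

        if (a > b) {
                if (b > c)
                        return (b);             /* a b c */
                return (a > c ? c : a);         /* a c b or c a b */
        }
        if (a > c)
                return (a);                     /* b a c */
        return (b > c ? c : b);                 /* b c a or c b a */
}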
+ */ + if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { + if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { + v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ + u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; + } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { + v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ + u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; + } else { + v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ + u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; + } + } else { + if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { + v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ + u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; + } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { + v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ + u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; + } else { + v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ + u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; + } + } + + /* + * Nominal jitter is due to PPS signal noise and interrupt + * latency. If it exceeds the popcorn threshold, the sample is + * discarded. otherwise, if so enabled, the time offset is + * updated. We can tolerate a modest loss of data here without + * much degrading time accuracy. + */ + if (u_nsec > (pps_jitter << PPS_POPCORN)) { + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + time_monitor = -v_nsec; + L_LINT(time_offset, time_monitor); + } + pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; + u_sec = pps_tf[0].tv_sec - pps_lastsec; + if (u_sec < (1 << pps_shift)) + return; + + /* + * At the end of the calibration interval the difference between + * the first and last counter values becomes the scaled + * frequency. It will later be divided by the length of the + * interval to determine the frequency update. If the frequency + * exceeds a sanity threshold, or if the actual calibration + * interval is not equal to the expected length, the data are + * discarded. We can tolerate a modest loss of data here without + * much degrading frequency accuracy. + */ + pps_calcnt++; + v_nsec = -pps_fcount; + pps_lastsec = pps_tf[0].tv_sec; + pps_fcount = 0; + u_nsec = MAXFREQ << pps_shift; + if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << + pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + return; + } + + /* + * Here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * for four consecutive averaging intervals, the interval is + * doubled; if it is greater than the threshold for four + * consecutive intervals, the interval is halved. The scaled + * frequency offset is converted to frequency offset. The + * stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring. 
+ */ + L_LINT(ftemp, v_nsec); + L_RSHIFT(ftemp, pps_shift); + L_SUB(ftemp, pps_freq); + u_nsec = L_GINT(ftemp); + if (u_nsec > PPS_MAXWANDER) { + L_LINT(ftemp, PPS_MAXWANDER); + pps_intcnt--; + time_status |= STA_PPSWANDER; + pps_stbcnt++; + } else if (u_nsec < -PPS_MAXWANDER) { + L_LINT(ftemp, -PPS_MAXWANDER); + pps_intcnt--; + time_status |= STA_PPSWANDER; + pps_stbcnt++; + } else { + pps_intcnt++; + } + if (pps_intcnt >= 4) { + pps_intcnt = 4; + if (pps_shift < pps_shiftmax) { + pps_shift++; + pps_intcnt = 0; + } + } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { + pps_intcnt = -4; + if (pps_shift > PPS_FAVG) { + pps_shift--; + pps_intcnt = 0; + } + } + if (u_nsec < 0) + u_nsec = -u_nsec; + pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; + + /* + * The PPS frequency is recalculated and clamped to the maximum + * MAXFREQ. If enabled, the system clock frequency is updated as + * well. + */ + L_ADD(pps_freq, ftemp); + u_nsec = L_GINT(pps_freq); + if (u_nsec > MAXFREQ) + L_LINT(pps_freq, MAXFREQ); + else if (u_nsec < -MAXFREQ) + L_LINT(pps_freq, -MAXFREQ); + if (time_status & STA_PPSFREQ) + time_freq = pps_freq; +} +#endif /* PPS_SYNC */ + +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +adjtime(struct thread *td, struct adjtime_args *uap) +{ + struct timeval atv; + int error; + + if ((error = suser(td))) + return (error); + + mtx_lock(&Giant); + if (uap->olddelta) { + atv.tv_sec = time_adjtime / 1000000; + atv.tv_usec = time_adjtime % 1000000; + if (atv.tv_usec < 0) { + atv.tv_usec += 1000000; + atv.tv_sec--; + } + error = copyout(&atv, uap->olddelta, sizeof(atv)); + if (error) + goto done2; + } + if (uap->delta) { + error = copyin(uap->delta, &atv, sizeof(atv)); + if (error) + goto done2; + time_adjtime = (int64_t)atv.tv_sec * 1000000 + atv.tv_usec; + } +done2: + mtx_unlock(&Giant); + return (error); +} + diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c new file mode 100644 index 0000000..11f3d0c --- /dev/null +++ b/sys/kern/kern_physio.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static void +physwakeup(struct buf *bp) +{ + wakeup((caddr_t) bp); +} + +int +physio(dev_t dev, struct uio *uio, int ioflag) +{ + int i; + int error; + int spl; + caddr_t sa; + u_int iolen; + struct buf *bp; + + /* Keep the process UPAGES from being swapped. XXX: why ? 
*/ + PHOLD(curproc); + + bp = getpbuf(NULL); + sa = bp->b_data; + error = bp->b_error = 0; + + /* XXX: sanity check */ + if(dev->si_iosize_max < PAGE_SIZE) { + printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", + devtoname(dev), dev->si_iosize_max); + dev->si_iosize_max = DFLTPHYS; + } + + for (i = 0; i < uio->uio_iovcnt; i++) { + while (uio->uio_iov[i].iov_len) { + bp->b_flags = B_PHYS; + if (uio->uio_rw == UIO_READ) + bp->b_iocmd = BIO_READ; + else + bp->b_iocmd = BIO_WRITE; + bp->b_dev = dev; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_offset = uio->uio_offset; + bp->b_saveaddr = sa; + + /* Don't exceed drivers iosize limit */ + if (bp->b_bcount > dev->si_iosize_max) + bp->b_bcount = dev->si_iosize_max; + + /* + * Make sure the pbuf can map the request + * XXX: The pbuf has kvasize = MAXPHYS so a request + * XXX: larger than MAXPHYS - PAGE_SIZE must be + * XXX: page aligned or it will be fragmented. + */ + iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK; + if ((bp->b_bcount + iolen) > bp->b_kvasize) { + bp->b_bcount = bp->b_kvasize; + if (iolen != 0) + bp->b_bcount -= PAGE_SIZE; + } + bp->b_bufsize = bp->b_bcount; + + bp->b_blkno = btodb(bp->b_offset); + + if (uio->uio_segflg == UIO_USERSPACE) { + if (!useracc(bp->b_data, bp->b_bufsize, + bp->b_iocmd == BIO_READ ? + VM_PROT_WRITE : VM_PROT_READ)) { + error = EFAULT; + goto doerror; + } + vmapbuf(bp); + } + + DEV_STRATEGY(bp, 0); + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + iolen = bp->b_bcount - bp->b_resid; + if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + if( bp->b_ioflags & BIO_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } +doerror: + relpbuf(bp, NULL); + PRELE(curproc); + return (error); +} diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c new file mode 100644 index 0000000..a197bc0 --- /dev/null +++ b/sys/kern/kern_poll.c @@ -0,0 +1,523 @@ +/*- + * Copyright (c) 2001-2002 Luigi Rizzo + * + * Supported by: the Xorp Project (www.xorp.org) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/socket.h> /* needed by net/if.h */ +#include <sys/sysctl.h> + +#include <net/if.h> /* for IFF_* flags */ +#include <net/netisr.h> /* for NETISR_POLL */ + +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/kthread.h> + +#ifdef SMP +#ifndef COMPILING_LINT +#error DEVICE_POLLING is not compatible with SMP +#endif +#endif + +static void netisr_poll(void); /* the two netisr handlers */ +void netisr_pollmore(void); + +void init_device_poll(void); /* init routine */ +void hardclock_device_poll(void); /* hook from hardclock */ +void ether_poll(int); /* polling while in trap */ + +/* + * Polling support for [network] device drivers. + * + * Drivers which support this feature try to register with the + * polling code. + * + * If registration is successful, the driver must disable interrupts, + * and further I/O is performed through the handler, which is invoked + * (at least once per clock tick) with 3 arguments: the "arg" passed at + * register time (a struct ifnet pointer), a command, and a "count" limit. + * + * The command can be one of the following: + * POLL_ONLY: quick move of "count" packets from input/output queues. + * POLL_AND_CHECK_STATUS: as above, plus check status registers or do + * other more expensive operations. This command is issued periodically + * but less frequently than POLL_ONLY. + * POLL_DEREGISTER: deregister and return to interrupt mode. + * + * The first two commands are only issued if the interface is marked as + * 'IFF_UP and IFF_RUNNING', the last one only if IFF_RUNNING is set. + * + * The count limit specifies how much work the handler can do during the + * call -- typically this is the number of packets to be received, or + * transmitted, etc. (drivers are free to interpret this number, as long + * as the max time spent in the function grows roughly linearly with the + * count). + * + * Deregistration can be requested by the driver itself (typically in the + * *_stop() routine), or by the polling code, by invoking the handler. + * + * Polling can be globally enabled or disabled with the sysctl variable + * kern.polling.enable (default is 0, disabled) + * + * A second variable controls the sharing of CPU between polling/kernel + * network processing, and other activities (typically userlevel tasks): + * kern.polling.user_frac (between 0 and 100, default 50) sets the share + * of CPU allocated to user tasks. CPU is allocated proportionally to the + * shares, by dynamically adjusting the "count" (poll_burst). + * + * Other parameters can should be left to their default values. 
+ * The following constraints hold + * + * 1 <= poll_each_burst <= poll_burst <= poll_burst_max + * 0 <= poll_in_trap <= poll_each_burst + * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX + */ + +#define MIN_POLL_BURST_MAX 10 +#define MAX_POLL_BURST_MAX 1000 + +SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0, + "Device polling parameters"); + +static u_int32_t poll_burst = 5; +SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RW, + &poll_burst, 0, "Current polling burst size"); + +static u_int32_t poll_each_burst = 5; +SYSCTL_UINT(_kern_polling, OID_AUTO, each_burst, CTLFLAG_RW, + &poll_each_burst, 0, "Max size of each burst"); + +static u_int32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */ +SYSCTL_UINT(_kern_polling, OID_AUTO, burst_max, CTLFLAG_RW, + &poll_burst_max, 0, "Max Polling burst size"); + +static u_int32_t poll_in_idle_loop=1; /* do we poll in idle loop ? */ +SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW, + &poll_in_idle_loop, 0, "Enable device polling in idle loop"); + +u_int32_t poll_in_trap; /* used in trap.c */ +SYSCTL_UINT(_kern_polling, OID_AUTO, poll_in_trap, CTLFLAG_RW, + &poll_in_trap, 0, "Poll burst size during a trap"); + +static u_int32_t user_frac = 50; +SYSCTL_UINT(_kern_polling, OID_AUTO, user_frac, CTLFLAG_RW, + &user_frac, 0, "Desired user fraction of cpu time"); + +static u_int32_t reg_frac = 20 ; +SYSCTL_UINT(_kern_polling, OID_AUTO, reg_frac, CTLFLAG_RW, + ®_frac, 0, "Every this many cycles poll register"); + +static u_int32_t short_ticks; +SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RW, + &short_ticks, 0, "Hardclock ticks shorter than they should be"); + +static u_int32_t lost_polls; +SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RW, + &lost_polls, 0, "How many times we would have lost a poll tick"); + +static u_int32_t pending_polls; +SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RW, + &pending_polls, 0, "Do we need to poll again"); + +static int residual_burst = 0; +SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RW, + &residual_burst, 0, "# of residual cycles in burst"); + +static u_int32_t poll_handlers; /* next free entry in pr[]. */ +SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, + &poll_handlers, 0, "Number of registered poll handlers"); + +static int polling = 0; /* global polling enable */ +SYSCTL_UINT(_kern_polling, OID_AUTO, enable, CTLFLAG_RW, + &polling, 0, "Polling enabled"); + +static u_int32_t phase; +SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RW, + &phase, 0, "Polling phase"); + +static u_int32_t suspect; +SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RW, + &suspect, 0, "suspect event"); + +static u_int32_t stalled; +SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RW, + &stalled, 0, "potential stalls"); + +static u_int32_t idlepoll_sleeping; /* idlepoll is sleeping */ +SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD, + &idlepoll_sleeping, 0, "idlepoll is sleeping"); + + +#define POLL_LIST_LEN 128 +struct pollrec { + poll_handler_t *handler; + struct ifnet *ifp; +}; + +static struct pollrec pr[POLL_LIST_LEN]; + +/* + * register relevant netisr. Called from kern_clock.c: + */ +void +init_device_poll(void) +{ + register_netisr(NETISR_POLL, netisr_poll); +} + +/* + * Hook from hardclock. Tries to schedule a netisr, but keeps track + * of lost ticks due to the previous handler taking too long. + * Normally, this should not happen, because polling handler should + * run for a short time. 
However, in some cases (e.g. when there are + * changes in link status etc.) the drivers take a very long time + * (even in the order of milliseconds) to reset and reconfigure the + * device, causing apparent lost polls. + * + * The first part of the code is just for debugging purposes, and tries + * to count how often hardclock ticks are shorter than they should, + * meaning either stray interrupts or delayed events. + */ +void +hardclock_device_poll(void) +{ + static struct timeval prev_t, t; + int delta; + + if (poll_handlers == 0) + return; + + microuptime(&t); + delta = (t.tv_usec - prev_t.tv_usec) + + (t.tv_sec - prev_t.tv_sec)*1000000; + if (delta * hz < 500000) + short_ticks++; + else + prev_t = t; + + if (pending_polls > 100) { + /* + * Too much, assume it has stalled (not always true + * see comment above). + */ + stalled++; + pending_polls = 0; + phase = 0; + } + + if (phase <= 2) { + if (phase != 0) + suspect++; + phase = 1; + schednetisr(NETISR_POLL); + phase = 2; + } + if (pending_polls++ > 0) + lost_polls++; +} + +/* + * ether_poll is called from the idle loop or from the trap handler. + */ +void +ether_poll(int count) +{ + int i; + + mtx_lock(&Giant); + + if (count > poll_each_burst) + count = poll_each_burst; + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].handler && (IFF_UP|IFF_RUNNING) == + (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) + pr[i].handler(pr[i].ifp, 0, count); /* quick check */ + mtx_unlock(&Giant); +} + +/* + * netisr_pollmore is called after other netisr's, possibly scheduling + * another NETISR_POLL call, or adapting the burst size for the next cycle. + * + * It is very bad to fetch large bursts of packets from a single card at once, + * because the burst could take a long time to be completely processed, or + * could saturate the intermediate queue (ipintrq or similar) leading to + * losses or unfairness. To reduce the problem, and also to account better for + * time spent in network-related processing, we split the burst in smaller + * chunks of fixed size, giving control to the other netisr's between chunks. + * This helps in improving the fairness, reducing livelock (because we + * emulate more closely the "process to completion" that we have with + * fastforwarding) and accounting for the work performed in low level + * handling and forwarding. + */ + +static struct timeval poll_start_t; + +void +netisr_pollmore() +{ + struct timeval t; + int kern_load; + /* XXX run at splhigh() or equivalent */ + + phase = 5; + if (residual_burst > 0) { + schednetisr(NETISR_POLL); + /* will run immediately on return, followed by netisrs */ + return ; + } + /* here we can account time spent in netisr's in this tick */ + microuptime(&t); + kern_load = (t.tv_usec - poll_start_t.tv_usec) + + (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ + kern_load = (kern_load * hz) / 10000; /* 0..100 */ + if (kern_load > (100 - user_frac)) { /* try decrease ticks */ + if (poll_burst > 1) + poll_burst--; + } else { + if (poll_burst < poll_burst_max) + poll_burst++; + } + + pending_polls--; + if (pending_polls == 0) /* we are done */ + phase = 0; + else { + /* + * Last cycle was long and caused us to miss one or more + * hardclock ticks. Restart processing again, but slightly + * reduce the burst size to prevent that this happens again. + */ + poll_burst -= (poll_burst / 8); + if (poll_burst < 1) + poll_burst = 1; + schednetisr(NETISR_POLL); + phase = 6; + } +} + +/* + * netisr_poll is scheduled by schednetisr when appropriate, typically once + * per tick. 
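To make the CPU-share arithmetic in netisr_pollmore() concrete: a tick is 1000000/hz microseconds, so the time spent in the netisrs is turned into a 0..100 percentage of a tick by multiplying by hz and dividing by 10000. A small worked example, with the values chosen only for the illustration:

static int
kern_load_example(void)
{
        int hz_assumed = 1000;          /* assumed tick rate */
        int us_spent = 300;             /* microseconds spent in netisrs */
        int kern_load;

        /* 300 us out of a 1000 us tick is 30 percent */
        kern_load = (us_spent * hz_assumed) / 10000;
        /*
         * With the default user_frac of 50, any value above
         * 100 - user_frac = 50 would shrink poll_burst.
         */
        return (kern_load);
}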
It is called at splnet() so first thing to do is to upgrade to + * splimp(), and call all registered handlers. + */ +static void +netisr_poll(void) +{ + static int reg_frac_count; + int i, cycles; + enum poll_cmd arg = POLL_ONLY; + mtx_lock(&Giant); + + phase = 3; + if (residual_burst == 0) { /* first call in this tick */ + microuptime(&poll_start_t); + /* + * Check that paremeters are consistent with runtime + * variables. Some of these tests could be done at sysctl + * time, but the savings would be very limited because we + * still have to check against reg_frac_count and + * poll_each_burst. So, instead of writing separate sysctl + * handlers, we do all here. + */ + + if (reg_frac > hz) + reg_frac = hz; + else if (reg_frac < 1) + reg_frac = 1; + if (reg_frac_count > reg_frac) + reg_frac_count = reg_frac - 1; + if (reg_frac_count-- == 0) { + arg = POLL_AND_CHECK_STATUS; + reg_frac_count = reg_frac - 1; + } + if (poll_burst_max < MIN_POLL_BURST_MAX) + poll_burst_max = MIN_POLL_BURST_MAX; + else if (poll_burst_max > MAX_POLL_BURST_MAX) + poll_burst_max = MAX_POLL_BURST_MAX; + + if (poll_each_burst < 1) + poll_each_burst = 1; + else if (poll_each_burst > poll_burst_max) + poll_each_burst = poll_burst_max; + + residual_burst = poll_burst; + } + cycles = (residual_burst < poll_each_burst) ? + residual_burst : poll_each_burst; + residual_burst -= cycles; + + if (polling) { + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].handler && (IFF_UP|IFF_RUNNING) == + (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) + pr[i].handler(pr[i].ifp, arg, cycles); + } else { /* unregister */ + for (i = 0 ; i < poll_handlers ; i++) { + if (pr[i].handler && + pr[i].ifp->if_flags & IFF_RUNNING) { + pr[i].ifp->if_ipending &= ~IFF_POLLING; + pr[i].handler(pr[i].ifp, POLL_DEREGISTER, 1); + } + pr[i].handler=NULL; + } + residual_burst = 0; + poll_handlers = 0; + } + /* on -stable, schednetisr(NETISR_POLLMORE); */ + phase = 4; + mtx_unlock(&Giant); +} + +/* + * Try to register routine for polling. Returns 1 if successful + * (and polling should be enabled), 0 otherwise. + * A device is not supposed to register itself multiple times. + * + * This is called from within the *_intr() functions, so we do not need + * further locking. + */ +int +ether_poll_register(poll_handler_t *h, struct ifnet *ifp) +{ + int s; + + if (polling == 0) /* polling disabled, cannot register */ + return 0; + if (h == NULL || ifp == NULL) /* bad arguments */ + return 0; + if ( !(ifp->if_flags & IFF_UP) ) /* must be up */ + return 0; + if (ifp->if_ipending & IFF_POLLING) /* already polling */ + return 0; + + s = splhigh(); + if (poll_handlers >= POLL_LIST_LEN) { + /* + * List full, cannot register more entries. + * This should never happen; if it does, it is probably a + * broken driver trying to register multiple times. Checking + * this at runtime is expensive, and won't solve the problem + * anyways, so just report a few times and then give up. + */ + static int verbose = 10 ; + splx(s); + if (verbose >0) { + printf("poll handlers list full, " + "maybe a broken driver ?\n"); + verbose--; + } + return 0; /* no polling for you */ + } + + pr[poll_handlers].handler = h; + pr[poll_handlers].ifp = ifp; + poll_handlers++; + ifp->if_ipending |= IFF_POLLING; + splx(s); + if (idlepoll_sleeping) + wakeup(&idlepoll_sleeping); + return 1; /* polling enabled in next call */ +} + +/* + * Remove interface from the polling list. Normally called by *_stop(). 
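A driver-side sketch of the protocol described earlier: the handler and interrupt routine below are hypothetical (names invented for the example), but they follow the registration rules stated above, with ether_poll_register() called from the *_intr() path and the three commands honoured; a real driver would move packets and touch hardware where the comments are.

static void
xx_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
{

        if (cmd == POLL_DEREGISTER) {
                /* re-enable the hardware interrupt and stop polling */
                return;
        }
        /* move at most "count" packets to/from the rings here */
        if (cmd == POLL_AND_CHECK_STATUS) {
                /* the less frequent, more expensive status check */
        }
}

static void
xx_intr(void *arg)
{
        struct ifnet *ifp = arg;

        if (ether_poll_register(xx_poll, ifp)) {
                /* disable hardware interrupts; xx_poll() takes over */
                return;
        }
        /* otherwise fall through to normal interrupt processing */
}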
+ * It is not an error to call it with IFF_POLLING clear, the call is + * sufficiently rare to be preferable to save the space for the extra + * test in each driver in exchange of one additional function call. + */ +int +ether_poll_deregister(struct ifnet *ifp) +{ + int i; + + mtx_lock(&Giant); + if ( !ifp || !(ifp->if_ipending & IFF_POLLING) ) { + mtx_unlock(&Giant); + return 0; + } + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].ifp == ifp) /* found it */ + break; + ifp->if_ipending &= ~IFF_POLLING; /* found or not... */ + if (i == poll_handlers) { + mtx_unlock(&Giant); + printf("ether_poll_deregister: ifp not found!!!\n"); + return 0; + } + poll_handlers--; + if (i < poll_handlers) { /* Last entry replaces this one. */ + pr[i].handler = pr[poll_handlers].handler; + pr[i].ifp = pr[poll_handlers].ifp; + } + mtx_unlock(&Giant); + return 1; +} + +static void +poll_idle(void) +{ + struct thread *td = curthread; + struct rtprio rtp; + int pri; + + rtp.prio = RTP_PRIO_MAX; /* lowest priority */ + rtp.type = RTP_PRIO_IDLE; + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, td->td_ksegrp); + pri = td->td_priority; + mtx_unlock_spin(&sched_lock); + + for (;;) { + if (poll_in_idle_loop && poll_handlers > 0) { + idlepoll_sleeping = 0; + mtx_lock(&Giant); + ether_poll(poll_each_burst); + mtx_unlock(&Giant); + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + setrunqueue(td); + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } else { + idlepoll_sleeping = 1; + tsleep(&idlepoll_sleeping, pri, "pollid", hz * 3); + } + } +} + +static struct proc *idlepoll; +static struct kproc_desc idlepoll_kp = { + "idlepoll", + poll_idle, + &idlepoll +}; +SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp) diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 0000000..a5378d9 --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,1072 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/sysctl.h> +#include <sys/filedesc.h> +#include <sys/tty.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/user.h> +#include <sys/jail.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <machine/critical.h> + +MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); +MALLOC_DEFINE(M_SESSION, "session", "session header"); +static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); +MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); + +static struct proc *dopfind(register pid_t); + +static void doenterpgrp(struct proc *, struct pgrp *); + +static void pgdelete(struct pgrp *); + +static void orphanpg(struct pgrp *pg); + +/* + * Other process lists + */ +struct pidhashhead *pidhashtbl; +u_long pidhash; +struct pgrphashhead *pgrphashtbl; +u_long pgrphash; +struct proclist allproc; +struct proclist zombproc; +struct sx allproc_lock; +struct sx proctree_lock; +struct mtx pargs_ref_lock; +uma_zone_t proc_zone; +uma_zone_t ithread_zone; + +CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); + +/* + * Initialize global process hashing structures. + */ +void +procinit() +{ + + sx_init(&allproc_lock, "allproc"); + sx_init(&proctree_lock, "proctree"); + mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF); + LIST_INIT(&allproc); + LIST_INIT(&zombproc); + pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); + pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); + proc_zone = uma_zcreate("PROC", sizeof (struct proc), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uihashinit(); +} + +/* + * Note that we do not link to the proc's ucred here + * The thread is linked as if running but no KSE assigned + */ +static void +thread_link(struct thread *td, struct ksegrp *kg) +{ + struct proc *p = kg->kg_proc; + + td->td_proc = p; + td->td_ksegrp = kg; + td->td_last_kse = &p->p_kse; + + TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); + TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); + td->td_critnest = 0; + td->td_kse = NULL; + cpu_thread_link(td); +} + +/* + * KSE is linked onto the idle queue. 
+ */ +static void +kse_link(struct kse *ke, struct ksegrp *kg) +{ + struct proc *p = kg->kg_proc; + + TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); + kg->kg_kses++; + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_proc = p; + ke->ke_ksegrp = kg; + ke->ke_thread = NULL; + ke->ke_oncpu = NOCPU; +} + +static void +ksegrp_link(struct ksegrp *kg, struct proc *p) +{ + + TAILQ_INIT(&kg->kg_threads); + TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ + TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ + TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ + TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */ + kg->kg_proc = p; +/* the following counters are in the -zero- section and may not need clearing */ + kg->kg_runnable = 0; + kg->kg_kses = 0; + kg->kg_runq_kses = 0; /* XXXKSE change name */ +/* link it in now that it's consitant */ + TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); +} + +/* + * for a newly created process, + * link up a the structure and its initial threads etc. + */ +void +proc_linkup(struct proc *p, struct ksegrp *kg, + struct kse *ke, struct thread *td) +{ + + TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ + TAILQ_INIT(&p->p_threads); /* all threads in proc */ + + ksegrp_link(kg, p); + kse_link(ke, kg); + thread_link(td, kg); + /* link them together for 1:1 */ + td->td_kse = ke; + ke->ke_thread = td; +} + +/* temporary version is ultra simple while we are in 1:1 mode */ +struct thread * +thread_get(struct proc *p) +{ + struct thread *td = &p->p_xxthread; + + return (td); +} + + +/********************* +* STUB KSE syscalls +*********************/ + +/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */ +int +thread_wakeup(struct thread *td, struct thread_wakeup_args *uap) +{ + + return(ENOSYS); +} + +int +kse_exit(struct thread *td, struct kse_exit_args *uap) +{ + + return(ENOSYS); +} + +int +kse_yield(struct thread *td, struct kse_yield_args *uap) +{ + + return(ENOSYS); +} + +int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) +{ + + return(ENOSYS); +} + + +int +kse_new(struct thread *td, struct kse_new_args *uap) +/* struct kse_new_args { + struct kse_mailbox *mbx; + int new_grp_flag; +}; */ +{ + + return (ENOSYS); +} + +/* + * Is p an inferior of the current process? + */ +int +inferior(p) + register struct proc *p; +{ + + sx_assert(&proctree_lock, SX_LOCKED); + for (; p != curproc; p = p->p_pptr) + if (p->p_pid == 0) + return (0); + return (1); +} + +/* + * Locate a process by number + */ +struct proc * +pfind(pid) + register pid_t pid; +{ + register struct proc *p; + + sx_slock(&allproc_lock); + p = dopfind(pid); + sx_sunlock(&allproc_lock); + return (p); +} + +static struct proc * +dopfind(pid) + register pid_t pid; +{ + register struct proc *p; + + sx_assert(&allproc_lock, SX_LOCKED); + + LIST_FOREACH(p, PIDHASH(pid), p_hash) + if (p->p_pid == pid) { + PROC_LOCK(p); + break; + } + return (p); +} + +/* + * Locate a process group by number. + * The caller must hold proctree_lock. + */ +struct pgrp * +pgfind(pgid) + register pid_t pgid; +{ + register struct pgrp *pgrp; + + sx_assert(&proctree_lock, SX_LOCKED); + + LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { + if (pgrp->pg_id == pgid) { + PGRP_LOCK(pgrp); + return (pgrp); + } + } + return (NULL); +} + +/* + * Create a new process group. + * pgid must be equal to the pid of p. + * Begin a new session if required. 
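This function is called from setsid() and setpgid(); a compressed sketch of the new-session calling pattern, with the leader checks and error unwinding that the real callers perform omitted for brevity:

static int
new_session_example(struct proc *p)
{
        struct pgrp *newpgrp;
        struct session *newsess;
        int error;

        /* Both structures are allocated before proctree_lock is taken. */
        MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
            M_WAITOK | M_ZERO);
        MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
            M_WAITOK | M_ZERO);

        sx_xlock(&proctree_lock);
        /* The real callers verify p is not already a group/session leader. */
        error = enterpgrp(p, p->p_pid, newpgrp, newsess);
        sx_xunlock(&proctree_lock);
        return (error);
}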
+ */ +int +enterpgrp(p, pgid, pgrp, sess) + register struct proc *p; + pid_t pgid; + struct pgrp *pgrp; + struct session *sess; +{ + struct pgrp *pgrp2; + + sx_assert(&proctree_lock, SX_XLOCKED); + + KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); + KASSERT(p->p_pid == pgid, + ("enterpgrp: new pgrp and pid != pgid")); + + pgrp2 = pgfind(pgid); + + KASSERT(pgrp2 == NULL, + ("enterpgrp: pgrp with pgid exists")); + KASSERT(!SESS_LEADER(p), + ("enterpgrp: session leader attempted setpgrp")); + + mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); + + if (sess != NULL) { + /* + * new session + */ + mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); + PROC_LOCK(p); + p->p_flag &= ~P_CONTROLT; + PROC_UNLOCK(p); + PGRP_LOCK(pgrp); + sess->s_leader = p; + sess->s_sid = p->p_pid; + sess->s_count = 1; + sess->s_ttyvp = NULL; + sess->s_ttyp = NULL; + bcopy(p->p_session->s_login, sess->s_login, + sizeof(sess->s_login)); + pgrp->pg_session = sess; + KASSERT(p == curproc, + ("enterpgrp: mksession and p != curproc")); + } else { + pgrp->pg_session = p->p_session; + SESS_LOCK(pgrp->pg_session); + pgrp->pg_session->s_count++; + SESS_UNLOCK(pgrp->pg_session); + PGRP_LOCK(pgrp); + } + pgrp->pg_id = pgid; + LIST_INIT(&pgrp->pg_members); + + /* + * As we have an exclusive lock of proctree_lock, + * this should not deadlock. + */ + LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); + pgrp->pg_jobc = 0; + SLIST_INIT(&pgrp->pg_sigiolst); + PGRP_UNLOCK(pgrp); + + doenterpgrp(p, pgrp); + + return (0); +} + +/* + * Move p to an existing process group + */ +int +enterthispgrp(p, pgrp) + register struct proc *p; + struct pgrp *pgrp; +{ + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); + KASSERT(pgrp->pg_session == p->p_session, + ("%s: pgrp's session %p, p->p_session %p.\n", + __func__, + pgrp->pg_session, + p->p_session)); + KASSERT(pgrp != p->p_pgrp, + ("%s: p belongs to pgrp.", __func__)); + + doenterpgrp(p, pgrp); + + return (0); +} + +/* + * Move p to a process group + */ +static void +doenterpgrp(p, pgrp) + struct proc *p; + struct pgrp *pgrp; +{ + struct pgrp *savepgrp; + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); + + savepgrp = p->p_pgrp; + + /* + * Adjust eligibility of affected pgrps to participate in job control. + * Increment eligibility counts before decrementing, otherwise we + * could reach 0 spuriously during the first call. 
+ */ + fixjobc(p, pgrp, 1); + fixjobc(p, p->p_pgrp, 0); + + PGRP_LOCK(pgrp); + PGRP_LOCK(savepgrp); + PROC_LOCK(p); + LIST_REMOVE(p, p_pglist); + p->p_pgrp = pgrp; + PROC_UNLOCK(p); + LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); + PGRP_UNLOCK(savepgrp); + PGRP_UNLOCK(pgrp); + if (LIST_EMPTY(&savepgrp->pg_members)) + pgdelete(savepgrp); +} + +/* + * remove process from process group + */ +int +leavepgrp(p) + register struct proc *p; +{ + struct pgrp *savepgrp; + + sx_assert(&proctree_lock, SX_XLOCKED); + savepgrp = p->p_pgrp; + PGRP_LOCK(savepgrp); + PROC_LOCK(p); + LIST_REMOVE(p, p_pglist); + p->p_pgrp = NULL; + PROC_UNLOCK(p); + PGRP_UNLOCK(savepgrp); + if (LIST_EMPTY(&savepgrp->pg_members)) + pgdelete(savepgrp); + return (0); +} + +/* + * delete a process group + */ +static void +pgdelete(pgrp) + register struct pgrp *pgrp; +{ + struct session *savesess; + + sx_assert(&proctree_lock, SX_XLOCKED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pgid. + */ + funsetownlst(&pgrp->pg_sigiolst); + + PGRP_LOCK(pgrp); + if (pgrp->pg_session->s_ttyp != NULL && + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp = NULL; + LIST_REMOVE(pgrp, pg_hash); + savesess = pgrp->pg_session; + SESS_LOCK(savesess); + savesess->s_count--; + SESS_UNLOCK(savesess); + PGRP_UNLOCK(pgrp); + if (savesess->s_count == 0) { + mtx_destroy(&savesess->s_mtx); + FREE(pgrp->pg_session, M_SESSION); + } + mtx_destroy(&pgrp->pg_mtx); + FREE(pgrp, M_PGRP); +} + +/* + * Adjust pgrp jobc counters when specified process changes process group. + * We count the number of processes in each process group that "qualify" + * the group for terminal job control (those with a parent in a different + * process group of the same session). If that count reaches zero, the + * process group becomes orphaned. Check both the specified process' + * process group and that of its children. + * entering == 0 => p is leaving specified group. + * entering == 1 => p is entering specified group. + */ +void +fixjobc(p, pgrp, entering) + register struct proc *p; + register struct pgrp *pgrp; + int entering; +{ + register struct pgrp *hispgrp; + register struct session *mysession; + + sx_assert(&proctree_lock, SX_LOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); + + /* + * Check p's parent to see whether p qualifies its own process + * group; if so, adjust count for p's process group. + */ + mysession = pgrp->pg_session; + if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && + hispgrp->pg_session == mysession) { + PGRP_LOCK(pgrp); + if (entering) + pgrp->pg_jobc++; + else { + --pgrp->pg_jobc; + if (pgrp->pg_jobc == 0) + orphanpg(pgrp); + } + PGRP_UNLOCK(pgrp); + } + + /* + * Check this process' children to see whether they qualify + * their process groups; if so, adjust counts for children's + * process groups. + */ + LIST_FOREACH(p, &p->p_children, p_sibling) { + if ((hispgrp = p->p_pgrp) != pgrp && + hispgrp->pg_session == mysession && + p->p_stat != SZOMB) { + PGRP_LOCK(hispgrp); + if (entering) + hispgrp->pg_jobc++; + else { + --hispgrp->pg_jobc; + if (hispgrp->pg_jobc == 0) + orphanpg(hispgrp); + } + PGRP_UNLOCK(hispgrp); + } + } +} + +/* + * A process group has become orphaned; + * if there are any stopped processes in the group, + * hang-up all process in that group. 
+ */ +static void +orphanpg(pg) + struct pgrp *pg; +{ + register struct proc *p; + + PGRP_LOCK_ASSERT(pg, MA_OWNED); + + mtx_lock_spin(&sched_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + if (p->p_stat == SSTOP) { + mtx_unlock_spin(&sched_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + psignal(p, SIGHUP); + psignal(p, SIGCONT); + PROC_UNLOCK(p); + } + return; + } + } + mtx_unlock_spin(&sched_lock); +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) +{ + register struct pgrp *pgrp; + register struct proc *p; + register int i; + + for (i = 0; i <= pgrphash; i++) { + if (!LIST_EMPTY(&pgrphashtbl[i])) { + printf("\tindx %d\n", i); + LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { + printf( + "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", + (void *)pgrp, (long)pgrp->pg_id, + (void *)pgrp->pg_session, + pgrp->pg_session->s_count, + (void *)LIST_FIRST(&pgrp->pg_members)); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + printf("\t\tpid %ld addr %p pgrp %p\n", + (long)p->p_pid, (void *)p, + (void *)p->p_pgrp); + } + } + } + } +} +#endif /* DDB */ + +/* + * Fill in an kinfo_proc structure for the specified process. + * Must be called with the target process locked. + */ +void +fill_kinfo_proc(p, kp) + struct proc *p; + struct kinfo_proc *kp; +{ + struct thread *td; + struct tty *tp; + struct session *sp; + struct timeval tv; + + bzero(kp, sizeof(*kp)); + + kp->ki_structsize = sizeof(*kp); + kp->ki_paddr = p; + PROC_LOCK_ASSERT(p, MA_OWNED); + kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */ + kp->ki_args = p->p_args; + kp->ki_textvp = p->p_textvp; +#ifdef KTRACE + kp->ki_tracep = p->p_tracep; + mtx_lock(&ktrace_mtx); + kp->ki_traceflag = p->p_traceflag; + mtx_unlock(&ktrace_mtx); +#endif + kp->ki_fd = p->p_fd; + kp->ki_vmspace = p->p_vmspace; + if (p->p_ucred) { + kp->ki_uid = p->p_ucred->cr_uid; + kp->ki_ruid = p->p_ucred->cr_ruid; + kp->ki_svuid = p->p_ucred->cr_svuid; + /* XXX bde doesn't like KI_NGROUPS */ + kp->ki_ngroups = min(p->p_ucred->cr_ngroups, KI_NGROUPS); + bcopy(p->p_ucred->cr_groups, kp->ki_groups, + kp->ki_ngroups * sizeof(gid_t)); + kp->ki_rgid = p->p_ucred->cr_rgid; + kp->ki_svgid = p->p_ucred->cr_svgid; + } + if (p->p_procsig) { + kp->ki_sigignore = p->p_procsig->ps_sigignore; + kp->ki_sigcatch = p->p_procsig->ps_sigcatch; + } + mtx_lock_spin(&sched_lock); + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + struct vmspace *vm = p->p_vmspace; + + kp->ki_size = vm->vm_map.size; + kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ + if (p->p_sflag & PS_INMEM) + kp->ki_rssize += UAREA_PAGES; + FOREACH_THREAD_IN_PROC(p, td) /* XXXKSE: thread swapout check */ + kp->ki_rssize += KSTACK_PAGES; + kp->ki_swrss = vm->vm_swrss; + kp->ki_tsize = vm->vm_tsize; + kp->ki_dsize = vm->vm_dsize; + kp->ki_ssize = vm->vm_ssize; + } + if ((p->p_sflag & PS_INMEM) && p->p_stats) { + kp->ki_start = p->p_stats->p_start; + kp->ki_rusage = p->p_stats->p_ru; + kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec + + p->p_stats->p_cru.ru_stime.tv_sec; + kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec + + p->p_stats->p_cru.ru_stime.tv_usec; + } + td = FIRST_THREAD_IN_PROC(p); + if (td->td_wmesg != NULL) + strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1); + if (p->p_stat == SMTX) { + kp->ki_kiflag |= KI_MTXBLOCK; + strncpy(kp->ki_mtxname, td->td_mtxname, + sizeof(kp->ki_mtxname) - 1); + } + kp->ki_stat = p->p_stat; + kp->ki_sflag = p->p_sflag; + kp->ki_swtime = 
p->p_swtime; + kp->ki_pid = p->p_pid; + /* vvv XXXKSE */ + bintime2timeval(&p->p_runtime, &tv); + kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; + kp->ki_pctcpu = p->p_kse.ke_pctcpu; + kp->ki_estcpu = td->td_ksegrp->kg_estcpu; + kp->ki_slptime = td->td_ksegrp->kg_slptime; + kp->ki_wchan = td->td_wchan; + kp->ki_pri.pri_level = td->td_priority; + kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri; + kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class; + kp->ki_pri.pri_native = td->td_base_pri; + kp->ki_nice = td->td_ksegrp->kg_nice; + kp->ki_rqindex = p->p_kse.ke_rqindex; + kp->ki_oncpu = p->p_kse.ke_oncpu; + kp->ki_lastcpu = td->td_lastcpu; + kp->ki_tdflags = td->td_flags; + kp->ki_pcb = td->td_pcb; + kp->ki_kstack = (void *)td->td_kstack; + /* ^^^ XXXKSE */ + mtx_unlock_spin(&sched_lock); + sp = NULL; + tp = NULL; + if (p->p_pgrp) { + kp->ki_pgid = p->p_pgrp->pg_id; + kp->ki_jobc = p->p_pgrp->pg_jobc; + sp = p->p_pgrp->pg_session; + + if (sp != NULL) { + kp->ki_sid = sp->s_sid; + SESS_LOCK(sp); + strncpy(kp->ki_login, sp->s_login, + sizeof(kp->ki_login) - 1); + if (sp->s_ttyvp) + kp->ki_kiflag |= KI_CTTY; + if (SESS_LEADER(p)) + kp->ki_kiflag |= KI_SLEADER; + tp = sp->s_ttyp; + SESS_UNLOCK(sp); + } + } + if ((p->p_flag & P_CONTROLT) && tp != NULL) { + kp->ki_tdev = dev2udev(tp->t_dev); + kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + if (tp->t_session) + kp->ki_tsid = tp->t_session->s_sid; + } else + kp->ki_tdev = NOUDEV; + if (p->p_comm[0] != '\0') { + strncpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm) - 1); + strncpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm) - 1); + } + kp->ki_siglist = p->p_siglist; + kp->ki_sigmask = p->p_sigmask; + kp->ki_xstat = p->p_xstat; + kp->ki_acflag = p->p_acflag; + kp->ki_flag = p->p_flag; + /* If jailed(p->p_ucred), emulate the old P_JAILED flag. */ + if (jailed(p->p_ucred)) + kp->ki_flag |= P_JAILED; + kp->ki_lock = p->p_lock; + if (p->p_pptr) + kp->ki_ppid = p->p_pptr->p_pid; +} + +/* + * Locate a zombie process by number + */ +struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + sx_slock(&allproc_lock); + LIST_FOREACH(p, &zombproc, p_list) + if (p->p_pid == pid) { + PROC_LOCK(p); + break; + } + sx_sunlock(&allproc_lock); + return (p); +} + + +/* + * Must be called with the process locked and will return with it unlocked. 
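+ * (The proc lock is dropped across the SYSCTL_OUT() copyout, so the
+ * pid is looked up again afterwards; EAGAIN is returned if the
+ * process has exited, become a zombie, or had its pid reused in the
+ * meantime.)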
+ */ +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct kinfo_proc kinfo_proc; + int error; + struct proc *np; + pid_t pid = p->p_pid; + + PROC_LOCK_ASSERT(p, MA_OWNED); + fill_kinfo_proc(p, &kinfo_proc); + PROC_UNLOCK(p); + error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); + if (error) + return (error); + if (doingzomb) + np = zpfind(pid); + else { + if (pid == 0) + return (0); + np = pfind(pid); + } + if (np == NULL) + return EAGAIN; + if (np != p) { + PROC_UNLOCK(np); + return EAGAIN; + } + PROC_UNLOCK(np); + return (0); +} + +static int +sysctl_kern_proc(SYSCTL_HANDLER_ARGS) +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + if (p_cansee(curthread, p)) { + PROC_UNLOCK(p); + return (0); + } + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + sx_slock(&allproc_lock); + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = LIST_FIRST(&allproc); + else + p = LIST_FIRST(&zombproc); + for (; p != 0; p = LIST_NEXT(p, p_list)) { + PROC_LOCK(p); + /* + * Show a user only appropriate processes. + */ + if (p_cansee(curthread, p)) { + PROC_UNLOCK(p); + continue; + } + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) { + PROC_UNLOCK(p); + continue; + } + /* + * TODO - make more efficient (see notes below). + * do by session. 
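+ * (Every variant of this sysctl currently walks all of allproc and
+ * zombproc and filters one process at a time; KERN_PROC_PGRP could
+ * instead walk the group's pg_members list, and the tty case could
+ * be driven from the session.)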
+ */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL) { + PROC_UNLOCK(p); + continue; + } + SESS_LOCK(p->p_session); + if (p->p_session->s_ttyp == NULL || + dev2udev(p->p_session->s_ttyp->t_dev) != + (udev_t)name[0]) { + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + continue; + } + SESS_UNLOCK(p->p_session); + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_ucred->cr_ruid != (uid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) { + sx_sunlock(&allproc_lock); + return (error); + } + } + } + sx_sunlock(&allproc_lock); + return (0); +} + +struct pargs * +pargs_alloc(int len) +{ + struct pargs *pa; + + MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS, + M_WAITOK); + pa->ar_ref = 1; + pa->ar_length = len; + return (pa); +} + +void +pargs_free(struct pargs *pa) +{ + + FREE(pa, M_PARGS); +} + +void +pargs_hold(struct pargs *pa) +{ + + if (pa == NULL) + return; + PARGS_LOCK(pa); + pa->ar_ref++; + PARGS_UNLOCK(pa); +} + +void +pargs_drop(struct pargs *pa) +{ + + if (pa == NULL) + return; + PARGS_LOCK(pa); + if (--pa->ar_ref == 0) { + PARGS_UNLOCK(pa); + pargs_free(pa); + } else + PARGS_UNLOCK(pa); +} + +/* + * This sysctl allows a process to retrieve the argument list or process + * title for another process without groping around in the address space + * of the other process. It also allow a process to set its own "process + * title to a string of its own choice. 
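+ * (The MIB is { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid }.  A
+ * minimal userland sketch, for illustration only and assuming
+ * <sys/sysctl.h> and a pid_t pid:
+ *
+ *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid };
+ *	char buf[4096];
+ *	size_t len = sizeof(buf);
+ *
+ *	if (sysctl(mib, 4, buf, &len, NULL, 0) == 0)
+ *		... buf now holds len bytes of NUL-separated argv strings ...
+ *
+ * Reading is subject to the ps_argsopen/p_cansee policy below;
+ * writing through the same MIB is accepted only from the process
+ * itself, which is how a process publishes a new title.)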
+ */ +static int +sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + struct pargs *pa; + int error = 0; + + if (namelen != 1) + return (EINVAL); + + p = pfind((pid_t)name[0]); + if (!p) + return (0); + + if ((!ps_argsopen) && p_cansee(curthread, p)) { + PROC_UNLOCK(p); + return (0); + } + PROC_UNLOCK(p); + + if (req->newptr && curproc != p) + return (EPERM); + + PROC_LOCK(p); + pa = p->p_args; + pargs_hold(pa); + PROC_UNLOCK(p); + if (req->oldptr && pa != NULL) { + error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); + } + pargs_drop(pa); + if (req->newptr == NULL) + return (error); + + PROC_LOCK(p); + pa = p->p_args; + p->p_args = NULL; + PROC_UNLOCK(p); + pargs_drop(pa); + + if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) + return (error); + + pa = pargs_alloc(req->newlen); + error = SYSCTL_IN(req, pa->ar_args, req->newlen); + if (!error) { + PROC_LOCK(p); + p->p_args = pa; + PROC_UNLOCK(p); + } else + pargs_free(pa); + return (error); +} + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, + sysctl_kern_proc_args, "Process argument list"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c new file mode 100644 index 0000000..a3e4bea --- /dev/null +++ b/sys/kern/kern_prot.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * Copyright (c) 2000-2001 Robert N. M. Watson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +/* + * System calls related to processes and protection + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/jail.h> +#include <sys/pioctl.h> +#include <sys/resourcevar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +static MALLOC_DEFINE(M_CRED, "cred", "credentials"); + +SYSCTL_DECL(_security); +SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, + "BSD security policy"); + +#ifndef _SYS_SYSPROTO_H_ +struct getpid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getpid(struct thread *td, struct getpid_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + td->td_retval[0] = p->p_pid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + PROC_LOCK(p); + td->td_retval[1] = p->p_pptr->p_pid; + PROC_UNLOCK(p); +#endif + mtx_unlock_giant(s); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getppid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getppid(struct thread *td, struct getppid_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + PROC_LOCK(p); + td->td_retval[0] = p->p_pptr->p_pid; + PROC_UNLOCK(p); + mtx_unlock_giant(s); + return (0); +} + +/* + * Get process group ID; note that POSIX getpgrp takes no parameter. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgrp_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +int +getpgrp(struct thread *td, struct getpgrp_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + PROC_LOCK(p); + td->td_retval[0] = p->p_pgrp->pg_id; + PROC_UNLOCK(p); + mtx_unlock_giant(s); + return (0); +} + +/* Get an arbitary pid's process group id */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgid_args { + pid_t pid; +}; +#endif +/* + * MPSAFE + */ +int +getpgid(struct thread *td, struct getpgid_args *uap) +{ + struct proc *p = td->td_proc; + struct proc *pt; + int error; + + mtx_lock(&Giant); + error = 0; + if (uap->pid == 0) { + PROC_LOCK(p); + td->td_retval[0] = p->p_pgrp->pg_id; + PROC_UNLOCK(p); + } else if ((pt = pfind(uap->pid)) == NULL) + error = ESRCH; + else { + error = p_cansee(td, pt); + if (error == 0) + td->td_retval[0] = pt->p_pgrp->pg_id; + PROC_UNLOCK(pt); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Get an arbitary pid's session id. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getsid_args { + pid_t pid; +}; +#endif +/* + * MPSAFE + */ +int +getsid(struct thread *td, struct getsid_args *uap) +{ + struct proc *p = td->td_proc; + struct proc *pt; + int error; + + mtx_lock(&Giant); + error = 0; + if (uap->pid == 0) { + PROC_LOCK(p); + td->td_retval[0] = p->p_session->s_sid; + PROC_UNLOCK(p); + } else if ((pt = pfind(uap->pid)) == NULL) + error = ESRCH; + else { + error = p_cansee(td, pt); + if (error == 0) + td->td_retval[0] = pt->p_session->s_sid; + PROC_UNLOCK(pt); + } + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getuid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getuid(struct thread *td, struct getuid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_ruid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_retval[1] = td->td_ucred->cr_uid; +#endif + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct geteuid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +geteuid(struct thread *td, struct geteuid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_uid; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getgid(struct thread *td, struct getgid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_rgid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_retval[1] = td->td_ucred->cr_groups[0]; +#endif + return (0); +} + +/* + * Get effective group ID. The "egid" is groups[0], and could be obtained + * via getgroups. This syscall exists because it is somewhat painful to do + * correctly in a library function. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getegid(struct thread *td, struct getegid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_groups[0]; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* + * MPSAFE + */ +int +getgroups(struct thread *td, register struct getgroups_args *uap) +{ + struct ucred *cred; + u_int ngrp; + int error; + + cred = td->td_ucred; + if ((ngrp = uap->gidsetsize) == 0) { + td->td_retval[0] = cred->cr_ngroups; + return (0); + } + if (ngrp < cred->cr_ngroups) + return (EINVAL); + ngrp = cred->cr_ngroups; + error = copyout((caddr_t)cred->cr_groups, (caddr_t)uap->gidset, + ngrp * sizeof(gid_t)); + if (error == 0) + td->td_retval[0] = ngrp; + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setsid(register struct thread *td, struct setsid_args *uap) +{ + struct pgrp *pgrp; + int error; + struct proc *p = td->td_proc; + struct pgrp *newpgrp; + struct session *newsess; + + error = 0; + pgrp = NULL; + + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); + MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + + if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + error = EPERM; + } else { + (void)enterpgrp(p, p->p_pid, newpgrp, newsess); + td->td_retval[0] = p->p_pid; + newpgrp = NULL; + newsess = NULL; + } + + sx_xunlock(&proctree_lock); + + if (newpgrp != NULL) + FREE(newpgrp, M_PGRP); + if (newsess != NULL) + FREE(newsess, M_SESSION); + + return (error); +} + +/* + * set process group (setpgid/old setpgrp) + * + * caller does 
setpgid(targpid, targpgid) + * + * pid must be caller or child of caller (ESRCH) + * if a child + * pid must be in same session (EPERM) + * pid can't have done an exec (EACCES) + * if pgid != pid + * there must exist some pid in same session having pgid (EPERM) + * pid must not be session leader (EPERM) + */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp id */ +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setpgid(struct thread *td, register struct setpgid_args *uap) +{ + struct proc *curp = td->td_proc; + register struct proc *targp; /* target process */ + register struct pgrp *pgrp; /* target pgrp */ + int error; + struct pgrp *newpgrp; + + if (uap->pgid < 0) + return (EINVAL); + + error = 0; + + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == NULL) { + if (targp) + PROC_UNLOCK(targp); + error = ESRCH; + goto done; + } + if (!inferior(targp)) { + PROC_UNLOCK(targp); + error = ESRCH; + goto done; + } + if ((error = p_cansee(curthread, targp))) { + PROC_UNLOCK(targp); + goto done; + } + if (targp->p_pgrp == NULL || + targp->p_session != curp->p_session) { + PROC_UNLOCK(targp); + error = EPERM; + goto done; + } + if (targp->p_flag & P_EXEC) { + PROC_UNLOCK(targp); + error = EACCES; + goto done; + } + PROC_UNLOCK(targp); + } else + targp = curp; + if (SESS_LEADER(targp)) { + error = EPERM; + goto done; + } + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + if (uap->pgid == targp->p_pid) { + if (targp->p_pgid == uap->pgid) + goto done; + error = enterpgrp(targp, uap->pgid, newpgrp, NULL); + if (error == 0) + newpgrp = NULL; + } else { + if ((pgrp = pgfind(uap->pgid)) == NULL || + pgrp->pg_session != curp->p_session) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + error = EPERM; + goto done; + } + if (pgrp == targp->p_pgrp) { + PGRP_UNLOCK(pgrp); + goto done; + } + PGRP_UNLOCK(pgrp); + error = enterthispgrp(targp, pgrp); + } +done: + sx_xunlock(&proctree_lock); + KASSERT((error == 0) || (newpgrp != NULL), + ("setpgid failed and newpgrp is NULL")); + if (newpgrp != NULL) + FREE(newpgrp, M_PGRP); + return (error); +} + +/* + * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD + * compatible. It says that setting the uid/gid to euid/egid is a special + * case of "appropriate privilege". Once the rules are expanded out, this + * basically means that setuid(nnn) sets all three id's, in all permitted + * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) + * does not set the saved id - this is dangerous for traditional BSD + * programs. For this reason, we *really* do not want to set + * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. + */ +#define POSIX_APPENDIX_B_4_2_2 + +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setuid(struct thread *td, struct setuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t uid; + struct uidinfo *uip; + int error; + + mtx_lock(&Giant); + uid = uap->uid; + newcred = crget(); + uip = uifind(uid); + PROC_LOCK(p); + oldcred = p->p_ucred; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setuid(geteuid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatible with traditional BSD + * semantics. 
Basically, it means that "setuid(xx)" sets all + * three id's (assuming you have privs). + * + * Notes on the logic. We do things in three steps. + * 1: We determine if the euid is going to change, and do EPERM + * right away. We unconditionally change the euid later if this + * test is satisfied, simplifying that part of the logic. + * 2: We determine if the real and/or saved uids are going to + * change. Determined by compile options. + * 3: Change euid last. (after tests in #2 for "appropriate privs") + */ + if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ +#ifdef _POSIX_SAVED_IDS + uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ +#endif + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(uip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + /* + * Copy credentials so other references do not see our changes. + */ + crcopy(newcred, oldcred); +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or uid == euid) + * If so, we are changing the real uid and/or saved uid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ + uid == oldcred->cr_uid || +#endif + suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */ +#endif + { + /* + * Set the real uid and transfer proc count to new user. + */ + if (uid != oldcred->cr_ruid) { + change_ruid(newcred, uip); + setsugid(p); + } + /* + * Set saved uid + * + * XXX always set saved uid even if not _POSIX_SAVED_IDS, as + * the security of seteuid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (uid != oldcred->cr_svuid) { + change_svuid(newcred, uid); + setsugid(p); + } + } + + /* + * In all permitted cases, we are changing the euid. + */ + if (uid != oldcred->cr_uid) { + change_euid(newcred, uip); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(uip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +seteuid(struct thread *td, struct seteuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid; + struct uidinfo *euip; + int error; + + euid = uap->euid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ + euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + /* + * Everything's okay, do it. Copy credentials so other references do + * not see our changes. 
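+ * (The change is made copy-on-write style: the new values go into the
+ * ucred obtained from crget() above, p_ucred is pointed at it while
+ * the proc lock is held, and the old credential is released with
+ * crfree() afterwards, so holders of other references never see a
+ * partially updated credential.)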
+ */ + crcopy(newcred, oldcred); + if (oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setgid(struct thread *td, struct setgid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t gid; + int error; + + gid = uap->gid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setgid(getegid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatible with traditional BSD + * semantics. Basically, it means that "setgid(xx)" sets all + * three id's (assuming you have privs). + * + * For notes on the logic here, see setuid() above. + */ + if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ +#ifdef _POSIX_SAVED_IDS + gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ +#endif + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or gid == egid) + * If so, we are changing the real uid and saved gid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ + gid == oldcred->cr_groups[0] || +#endif + suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */ +#endif + { + /* + * Set real gid + */ + if (oldcred->cr_rgid != gid) { + change_rgid(newcred, gid); + setsugid(p); + } + /* + * Set saved gid + * + * XXX always set saved gid even if not _POSIX_SAVED_IDS, as + * the security of setegid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (oldcred->cr_svgid != gid) { + change_svgid(newcred, gid); + setsugid(p); + } + } + /* + * In all cases permitted cases, we are changing the egid. + * Copy credentials so other references do not see our changes. 
+ */ + if (oldcred->cr_groups[0] != gid) { + change_egid(newcred, gid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setegid(struct thread *td, struct setegid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid; + int error; + + egid = uap->egid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ + egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + crcopy(newcred, oldcred); + if (oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setgroups(struct thread *td, struct setgroups_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *tempcred, *oldcred; + u_int ngrp; + int error; + + ngrp = uap->gidsetsize; + if (ngrp > NGROUPS) + return (EINVAL); + mtx_lock(&Giant); + tempcred = crget(); + error = copyin((caddr_t)uap->gidset, (caddr_t)tempcred->cr_groups, + ngrp * sizeof(gid_t)); + if (error != 0) { + crfree(tempcred); + mtx_unlock(&Giant); + return (error); + } + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + error = suser_cred(oldcred, PRISON_ROOT); + if (error) { + PROC_UNLOCK(p); + crfree(newcred); + crfree(tempcred); + mtx_unlock(&Giant); + return (error); + } + + /* + * XXX A little bit lazy here. We could test if anything has + * changed before crcopy() and setting P_SUGID. + */ + crcopy(newcred, oldcred); + if (ngrp < 1) { + /* + * setgroups(0, NULL) is a legitimate way of clearing the + * groups vector on non-BSD systems (which generally do not + * have the egid in the groups[0]). We risk security holes + * when running non-BSD software if we do not do the same. 
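+ * (cr_groups[0] is the effective gid on BSD, so cr_ngroups can never
+ * drop below 1; clearing the vector keeps the egid in place and only
+ * discards the supplementary groups.)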
+ */ + newcred->cr_ngroups = 1; + } else { + bcopy(tempcred->cr_groups, newcred->cr_groups, + ngrp * sizeof(gid_t)); + newcred->cr_ngroups = ngrp; + } + setsugid(p); + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(tempcred); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setreuid(register struct thread *td, struct setreuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid, ruid; + struct uidinfo *euip, *ruip; + int error; + + euid = uap->euid; + ruid = uap->ruid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + ruip = uifind(ruid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && + ruid != oldcred->cr_svuid) || + (euid != (uid_t)-1 && euid != oldcred->cr_uid && + euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + crcopy(newcred, oldcred); + if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { + change_ruid(newcred, ruip); + setsugid(p); + } + if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && + newcred->cr_svuid != newcred->cr_uid) { + change_svuid(newcred, newcred->cr_uid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setregid(register struct thread *td, struct setregid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid, rgid; + int error; + + egid = uap->egid; + rgid = uap->rgid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && + rgid != oldcred->cr_svgid) || + (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && + egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { + change_rgid(newcred, rgid); + setsugid(p); + } + if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && + newcred->cr_svgid != newcred->cr_groups[0]) { + change_svgid(newcred, newcred->cr_groups[0]); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +/* + * setresuid(ruid, euid, suid) is like setreuid except control over the + * saved uid is explicit. 
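+ * (Making the saved uid explicit turns a permanent privilege drop
+ * into one easily audited call, e.g. setresuid(uid, uid, uid) from a
+ * process that started with euid 0; setresgid() below does the same
+ * for the group ids.)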
+ */ + +#ifndef _SYS_SYSPROTO_H_ +struct setresuid_args { + uid_t ruid; + uid_t euid; + uid_t suid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setresuid(register struct thread *td, struct setresuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid, ruid, suid; + struct uidinfo *euip, *ruip; + int error; + + euid = uap->euid; + ruid = uap->ruid; + suid = uap->suid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + ruip = uifind(ruid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && + ruid != oldcred->cr_svuid && + ruid != oldcred->cr_uid) || + (euid != (uid_t)-1 && euid != oldcred->cr_ruid && + euid != oldcred->cr_svuid && + euid != oldcred->cr_uid) || + (suid != (uid_t)-1 && suid != oldcred->cr_ruid && + suid != oldcred->cr_svuid && + suid != oldcred->cr_uid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { + change_ruid(newcred, ruip); + setsugid(p); + } + if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { + change_svuid(newcred, suid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +/* + * setresgid(rgid, egid, sgid) is like setregid except control over the + * saved gid is explicit. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct setresgid_args { + gid_t rgid; + gid_t egid; + gid_t sgid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setresgid(register struct thread *td, struct setresgid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid, rgid, sgid; + int error; + + egid = uap->egid; + rgid = uap->rgid; + sgid = uap->sgid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && + rgid != oldcred->cr_svgid && + rgid != oldcred->cr_groups[0]) || + (egid != (gid_t)-1 && egid != oldcred->cr_rgid && + egid != oldcred->cr_svgid && + egid != oldcred->cr_groups[0]) || + (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && + sgid != oldcred->cr_svgid && + sgid != oldcred->cr_groups[0])) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { + change_rgid(newcred, rgid); + setsugid(p); + } + if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { + change_svgid(newcred, sgid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getresuid_args { + uid_t *ruid; + uid_t *euid; + uid_t *suid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getresuid(register struct thread *td, struct getresuid_args *uap) +{ + struct ucred *cred; + int error1 = 0, error2 = 0, error3 = 0; + + cred = td->td_ucred; + if (uap->ruid) + error1 = copyout((caddr_t)&cred->cr_ruid, + (caddr_t)uap->ruid, sizeof(cred->cr_ruid)); + if (uap->euid) + error2 = copyout((caddr_t)&cred->cr_uid, + 
(caddr_t)uap->euid, sizeof(cred->cr_uid)); + if (uap->suid) + error3 = copyout((caddr_t)&cred->cr_svuid, + (caddr_t)uap->suid, sizeof(cred->cr_svuid)); + return (error1 ? error1 : error2 ? error2 : error3); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getresgid_args { + gid_t *rgid; + gid_t *egid; + gid_t *sgid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getresgid(register struct thread *td, struct getresgid_args *uap) +{ + struct ucred *cred; + int error1 = 0, error2 = 0, error3 = 0; + + cred = td->td_ucred; + if (uap->rgid) + error1 = copyout((caddr_t)&cred->cr_rgid, + (caddr_t)uap->rgid, sizeof(cred->cr_rgid)); + if (uap->egid) + error2 = copyout((caddr_t)&cred->cr_groups[0], + (caddr_t)uap->egid, sizeof(cred->cr_groups[0])); + if (uap->sgid) + error3 = copyout((caddr_t)&cred->cr_svgid, + (caddr_t)uap->sgid, sizeof(cred->cr_svgid)); + return (error1 ? error1 : error2 ? error2 : error3); +} + +#ifndef _SYS_SYSPROTO_H_ +struct issetugid_args { + int dummy; +}; +#endif +/* + * NOT MPSAFE? + */ +/* ARGSUSED */ +int +issetugid(register struct thread *td, struct issetugid_args *uap) +{ + struct proc *p = td->td_proc; + + /* + * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, + * we use P_SUGID because we consider changing the owners as + * "tainting" as well. + * This is significant for procs that start as root and "become" + * a user without an exec - programs cannot know *everything* + * that libc *might* have put in their data segment. + */ + PROC_LOCK(p); + td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; + PROC_UNLOCK(p); + return (0); +} + +/* + * MPSAFE + */ +int +__setugid(struct thread *td, struct __setugid_args *uap) +{ +#ifdef REGRESSION + struct proc *p; + + p = td->td_proc; + switch (uap->flag) { + case 0: + mtx_lock(&Giant); + PROC_LOCK(p); + p->p_flag &= ~P_SUGID; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); + case 1: + mtx_lock(&Giant); + PROC_LOCK(p); + p->p_flag |= P_SUGID; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); + default: + return (EINVAL); + } +#else /* !REGRESSION */ + + return (ENOSYS); +#endif /* REGRESSION */ +} + +/* + * Check if gid is a member of the group set. + * + * MPSAFE (cred must be held) + */ +int +groupmember(gid_t gid, struct ucred *cred) +{ + register gid_t *gp; + gid_t *egp; + + egp = &(cred->cr_groups[cred->cr_ngroups]); + for (gp = cred->cr_groups; gp < egp; gp++) + if (*gp == gid) + return (1); + return (0); +} + +/* + * `suser_enabled' (which can be set by the security.suser_enabled + * sysctl) determines whether the system 'super-user' policy is in effect. + * If it is nonzero, an effective uid of 0 connotes special privilege, + * overriding many mandatory and discretionary protections. If it is zero, + * uid 0 is offered no special privilege in the kernel security policy. + * Setting it to zero may seriously impact the functionality of many + * existing userland programs, and should not be done without careful + * consideration of the consequences. + */ +int suser_enabled = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW, + &suser_enabled, 0, "processes with uid 0 have privilege"); +TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled); + +/* + * Test whether the specified credentials imply "super-user" privilege. + * Return 0 or EPERM. The flag argument is currently used only to + * specify jail interaction. 
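+ * (PRISON_ROOT in the flag means "uid 0 inside a jail is acceptable
+ * for this particular check"; without it a jailed root credential is
+ * refused.  suser() below is the common wrapper that passes flag 0
+ * for the current thread's credential.)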
+ */ +int +suser_cred(struct ucred *cred, int flag) +{ + + if (!suser_enabled) + return (EPERM); + if (cred->cr_uid != 0) + return (EPERM); + if (jailed(cred) && !(flag & PRISON_ROOT)) + return (EPERM); + return (0); +} + +/* + * Shortcut to hide contents of struct td and struct proc from the + * caller, promoting binary compatibility. + */ +int +suser(struct thread *td) +{ + + return (suser_cred(td->td_ucred, 0)); +} + +/* + * Test the active securelevel against a given level. securelevel_gt() + * implements (securelevel > level). securelevel_ge() implements + * (securelevel >= level). Note that the logic is inverted -- these + * functions return EPERM on "success" and 0 on "failure". + * + * MPSAFE + */ +int +securelevel_gt(struct ucred *cr, int level) +{ + int active_securelevel; + + active_securelevel = securelevel; + KASSERT(cr != NULL, ("securelevel_gt: null cr")); + if (cr->cr_prison != NULL) { + mtx_lock(&cr->cr_prison->pr_mtx); + active_securelevel = imax(cr->cr_prison->pr_securelevel, + active_securelevel); + mtx_unlock(&cr->cr_prison->pr_mtx); + } + return (active_securelevel > level ? EPERM : 0); +} + +int +securelevel_ge(struct ucred *cr, int level) +{ + int active_securelevel; + + active_securelevel = securelevel; + KASSERT(cr != NULL, ("securelevel_ge: null cr")); + if (cr->cr_prison != NULL) { + mtx_lock(&cr->cr_prison->pr_mtx); + active_securelevel = imax(cr->cr_prison->pr_securelevel, + active_securelevel); + mtx_unlock(&cr->cr_prison->pr_mtx); + } + return (active_securelevel >= level ? EPERM : 0); +} + +/* + * 'see_other_uids' determines whether or not visibility of processes + * and sockets with credentials holding different real uids is possible + * using a variety of system MIBs. + * XXX: data declarations should be together near the beginning of the file. + */ +static int see_other_uids = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, + &see_other_uids, 0, + "Unprivileged processes may see subjects/objects with different real uid"); + +/*- + * Determine if u1 "can see" the subject specified by u2, according to the + * 'see_other_uids' policy. + * Returns: 0 for permitted, ESRCH otherwise + * Locks: none + * References: *u1 and *u2 must not change during the call + * u1 may equal u2, in which case only one reference is required + */ +static int +cr_seeotheruids(struct ucred *u1, struct ucred *u2) +{ + + if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { + if (suser_cred(u1, PRISON_ROOT) != 0) + return (ESRCH); + } + return (0); +} + +/*- + * Determine if u1 "can see" the subject specified by u2. + * Returns: 0 for permitted, an errno value otherwise + * Locks: none + * References: *u1 and *u2 must not change during the call + * u1 may equal u2, in which case only one reference is required + */ +int +cr_cansee(struct ucred *u1, struct ucred *u2) +{ + int error; + + if ((error = prison_check(u1, u2))) + return (error); + if ((error = cr_seeotheruids(u1, u2))) + return (error); + return (0); +} + +/*- + * Determine if td "can see" the subject specified by p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect p->p_ucred must be held. td really + * should be curthread. + * References: td and p must be valid for the lifetime of the call + */ +int +p_cansee(struct thread *td, struct proc *p) +{ + + /* Wrap cr_cansee() for all functionality. 
*/ + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + return (cr_cansee(td->td_ucred, p->p_ucred)); +} + +/*- + * Determine whether cred may deliver the specified signal to proc. + * Returns: 0 for permitted, an errno value otherwise. + * Locks: A lock must be held for proc. + * References: cred and proc must be valid for the lifetime of the call. + */ +int +cr_cansignal(struct ucred *cred, struct proc *proc, int signum) +{ + int error; + + PROC_LOCK_ASSERT(proc, MA_OWNED); + /* + * Jail semantics limit the scope of signalling to proc in the + * same jail as cred, if cred is in jail. + */ + error = prison_check(cred, proc->p_ucred); + if (error) + return (error); + error = cr_seeotheruids(cred, proc->p_ucred); + if (error) + return (error); + + /* + * UNIX signal semantics depend on the status of the P_SUGID + * bit on the target process. If the bit is set, then additional + * restrictions are placed on the set of available signals. + */ + if (proc->p_flag & P_SUGID) { + switch (signum) { + case 0: + case SIGKILL: + case SIGINT: + case SIGTERM: + case SIGSTOP: + case SIGTTIN: + case SIGTTOU: + case SIGTSTP: + case SIGHUP: + case SIGUSR1: + case SIGUSR2: + /* + * Generally, permit job and terminal control + * signals. + */ + break; + default: + /* Not permitted without privilege. */ + error = suser_cred(cred, PRISON_ROOT); + if (error) + return (error); + } + } + + /* + * Generally, the target credential's ruid or svuid must match the + * subject credential's ruid or euid. + */ + if (cred->cr_ruid != proc->p_ucred->cr_ruid && + cred->cr_ruid != proc->p_ucred->cr_svuid && + cred->cr_uid != proc->p_ucred->cr_ruid && + cred->cr_uid != proc->p_ucred->cr_svuid) { + /* Not permitted without privilege. */ + error = suser_cred(cred, PRISON_ROOT); + if (error) + return (error); + } + + return (0); +} + + +/*- + * Determine whether td may deliver the specified signal to p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must be + * held for p. + * References: td and p must be valid for the lifetime of the call + */ +int +p_cansignal(struct thread *td, struct proc *p, int signum) +{ + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (td->td_proc == p) + return (0); + + /* + * UNIX signalling semantics require that processes in the same + * session always be able to deliver SIGCONT to one another, + * overriding the remaining protections. + */ + /* XXX: This will require an additional lock of some sort. */ + if (signum == SIGCONT && td->td_proc->p_session == p->p_session) + return (0); + + return (cr_cansignal(td->td_ucred, p, signum)); +} + +/*- + * Determine whether td may reschedule p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must + * be held for p. 
+ * References: td and p must be valid for the lifetime of the call + */ +int +p_cansched(struct thread *td, struct proc *p) +{ + int error; + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (td->td_proc == p) + return (0); + if ((error = prison_check(td->td_ucred, p->p_ucred))) + return (error); + if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) + return (error); + if (td->td_ucred->cr_ruid == p->p_ucred->cr_ruid) + return (0); + if (td->td_ucred->cr_uid == p->p_ucred->cr_ruid) + return (0); + if (suser_cred(td->td_ucred, PRISON_ROOT) == 0) + return (0); + +#ifdef CAPABILITIES + if (!cap_check(NULL, td, CAP_SYS_NICE, PRISON_ROOT)) + return (0); +#endif + + return (EPERM); +} + +/* + * The 'unprivileged_proc_debug' flag may be used to disable a variety of + * unprivileged inter-process debugging services, including some procfs + * functionality, ptrace(), and ktrace(). In the past, inter-process + * debugging has been involved in a variety of security problems, and sites + * not requiring the service might choose to disable it when hardening + * systems. + * + * XXX: Should modifying and reading this variable require locking? + * XXX: data declarations should be together near the beginning of the file. + */ +static int unprivileged_proc_debug = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, + &unprivileged_proc_debug, 0, + "Unprivileged processes may use process debugging facilities"); + +/*- + * Determine whether td may debug p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must + * be held for p. + * References: td and p must be valid for the lifetime of the call + */ +int +p_candebug(struct thread *td, struct proc *p) +{ + int credentialchanged, error, grpsubset, i, uidsubset; + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (!unprivileged_proc_debug) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + if (td->td_proc == p) + return (0); + if ((error = prison_check(td->td_ucred, p->p_ucred))) + return (error); + if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) + return (error); + + /* + * Is p's group set a subset of td's effective group set? This + * includes p's egid, group access list, rgid, and svgid. + */ + grpsubset = 1; + for (i = 0; i < p->p_ucred->cr_ngroups; i++) { + if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { + grpsubset = 0; + break; + } + } + grpsubset = grpsubset && + groupmember(p->p_ucred->cr_rgid, td->td_ucred) && + groupmember(p->p_ucred->cr_svgid, td->td_ucred); + + /* + * Are the uids present in p's credential equal to td's + * effective uid? This includes p's euid, svuid, and ruid. + */ + uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && + td->td_ucred->cr_uid == p->p_ucred->cr_svuid && + td->td_ucred->cr_uid == p->p_ucred->cr_ruid); + + /* + * Has the credential of the process changed since the last exec()? + */ + credentialchanged = (p->p_flag & P_SUGID); + + /* + * If p's gids aren't a subset, or the uids aren't a subset, + * or the credential has changed, require appropriate privilege + * for td to debug p. For POSIX.1e capabilities, this will + * require CAP_SYS_PTRACE. 
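+ * (Concretely: all three of the target's uids must equal the
+ * debugger's effective uid, every gid the target holds must be in the
+ * debugger's group set, and the target must not have P_SUGID set from
+ * a credential change since its last exec; otherwise superuser
+ * privilege is required.)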
+ */ + if (!grpsubset || !uidsubset || credentialchanged) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + + /* Can't trace init when securelevel > 0. */ + if (p == initproc) { + error = securelevel_gt(td->td_ucred, 0); + if (error) + return (error); + } + + /* + * Can't trace a process that's currently exec'ing. + * XXX: Note, this is not a security policy decision, it's a + * basic correctness/functionality decision. Therefore, this check + * should be moved to the caller's of p_candebug(). + */ + if ((p->p_flag & P_INEXEC) != 0) + return (EAGAIN); + + return (0); +} + +/*- + * Determine whether the subject represented by cred can "see" a socket. + * Returns: 0 for permitted, ENOENT otherwise. + */ +int +cr_canseesocket(struct ucred *cred, struct socket *so) +{ + int error; + + error = prison_check(cred, so->so_cred); + if (error) + return (ENOENT); + if (cr_seeotheruids(cred, so->so_cred)) + return (ENOENT); +#ifdef MAC + /* XXX: error = mac_cred_check_seesocket() here. */ +#endif + + return (0); +} + +/* + * Allocate a zeroed cred structure. + */ +struct ucred * +crget(void) +{ + register struct ucred *cr; + + MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); + cr->cr_ref = 1; + cr->cr_mtxp = mtx_pool_find(cr); + return (cr); +} + +/* + * Claim another reference to a ucred structure. + */ +struct ucred * +crhold(struct ucred *cr) +{ + + mtx_lock(cr->cr_mtxp); + cr->cr_ref++; + mtx_unlock(cr->cr_mtxp); + return (cr); +} + +/* + * Free a cred structure. + * Throws away space when ref count gets to 0. + */ +void +crfree(struct ucred *cr) +{ + struct mtx *mtxp = cr->cr_mtxp; + + mtx_lock(mtxp); + KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); + if (--cr->cr_ref == 0) { + /* + * Some callers of crget(), such as nfs_statfs(), + * allocate a temporary credential, but don't + * allocate a uidinfo structure. + */ + mtx_unlock(mtxp); + mtx_lock(&Giant); + if (cr->cr_uidinfo != NULL) + uifree(cr->cr_uidinfo); + if (cr->cr_ruidinfo != NULL) + uifree(cr->cr_ruidinfo); + /* + * Free a prison, if any. + */ + if (jailed(cr)) + prison_free(cr->cr_prison); + FREE((caddr_t)cr, M_CRED); + mtx_unlock(&Giant); + } else { + mtx_unlock(mtxp); + } +} + +/* + * Check to see if this ucred is shared. + */ +int +crshared(struct ucred *cr) +{ + int shared; + + mtx_lock(cr->cr_mtxp); + shared = (cr->cr_ref > 1); + mtx_unlock(cr->cr_mtxp); + return (shared); +} + +/* + * Copy a ucred's contents from a template. Does not block. + */ +void +crcopy(struct ucred *dest, struct ucred *src) +{ + + KASSERT(crshared(dest) == 0, ("crcopy of shared ucred")); + bcopy(&src->cr_startcopy, &dest->cr_startcopy, + (unsigned)((caddr_t)&src->cr_endcopy - + (caddr_t)&src->cr_startcopy)); + uihold(dest->cr_uidinfo); + uihold(dest->cr_ruidinfo); + if (jailed(dest)) + prison_hold(dest->cr_prison); +} + +/* + * Dup cred struct to a new held one. + */ +struct ucred * +crdup(struct ucred *cr) +{ + struct ucred *newcr; + + newcr = crget(); + crcopy(newcr, cr); + return (newcr); +} + +/* + * Fill in a struct xucred based on a struct ucred. + */ +void +cru2x(struct ucred *cr, struct xucred *xcr) +{ + + bzero(xcr, sizeof(*xcr)); + xcr->cr_version = XUCRED_VERSION; + xcr->cr_uid = cr->cr_uid; + xcr->cr_ngroups = cr->cr_ngroups; + bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups)); +} + +/* + * small routine to swap a thread's current ucred for the correct one + * taken from the process. 
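+ * (td_ucred is a per-thread reference intended to be refreshed on
+ * kernel entry, so that most credential checks can use it without
+ * taking the proc lock; this brings it back in sync with p_ucred
+ * after the latter may have been replaced.)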
+ */ +void +cred_update_thread(struct thread *td) +{ + struct proc *p; + struct ucred *cred; + + p = td->td_proc; + cred = td->td_ucred; + mtx_lock(&Giant); + PROC_LOCK(p); + td->td_ucred = crhold(p->p_ucred); + PROC_UNLOCK(p); + if (cred != NULL) + crfree(cred); + mtx_unlock(&Giant); +} + +/* + * Get login name, if available. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getlogin(struct thread *td, struct getlogin_args *uap) +{ + int error; + char login[MAXLOGNAME]; + struct proc *p = td->td_proc; + + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; + PROC_LOCK(p); + SESS_LOCK(p->p_session); + bcopy(p->p_session->s_login, login, uap->namelen); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + error = copyout((caddr_t) login, (caddr_t) uap->namebuf, uap->namelen); + return(error); +} + +/* + * Set login name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setlogin(struct thread *td, struct setlogin_args *uap) +{ + struct proc *p = td->td_proc; + int error; + char logintmp[MAXLOGNAME]; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (size_t *)0); + if (error == ENAMETOOLONG) + error = EINVAL; + else if (!error) { + PROC_LOCK(p); + SESS_LOCK(p->p_session); + (void) memcpy(p->p_session->s_login, logintmp, + sizeof(logintmp)); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + } + return (error); +} + +void +setsugid(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + p->p_flag |= P_SUGID; + if (!(p->p_pfsflags & PF_ISUGID)) + p->p_stops = 0; +} + +/*- + * Change a process's effective uid. + * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_euid(struct ucred *newcred, struct uidinfo *euip) +{ + + newcred->cr_uid = euip->ui_uid; + uihold(euip); + uifree(newcred->cr_uidinfo); + newcred->cr_uidinfo = euip; +} + +/*- + * Change a process's effective gid. + * Side effects: newcred->cr_gid will be modified. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_egid(struct ucred *newcred, gid_t egid) +{ + + newcred->cr_groups[0] = egid; +} + +/*- + * Change a process's real uid. + * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo + * will be updated, and the old and new cr_ruidinfo proc + * counts will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_ruid(struct ucred *newcred, struct uidinfo *ruip) +{ + + (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); + newcred->cr_ruid = ruip->ui_uid; + uihold(ruip); + uifree(newcred->cr_ruidinfo); + newcred->cr_ruidinfo = ruip; + (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); +} + +/*- + * Change a process's real gid. + * Side effects: newcred->cr_rgid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_rgid(struct ucred *newcred, gid_t rgid) +{ + + newcred->cr_rgid = rgid; +} + +/*- + * Change a process's saved uid. + * Side effects: newcred->cr_svuid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. 
+ */ +void +change_svuid(struct ucred *newcred, uid_t svuid) +{ + + newcred->cr_svuid = svuid; +} + +/*- + * Change a process's saved gid. + * Side effects: newcred->cr_svgid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_svgid(struct ucred *newcred, gid_t svgid) +{ + + newcred->cr_svgid = svgid; +} diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c new file mode 100644 index 0000000..d467c1a --- /dev/null +++ b/sys/kern/kern_resource.c @@ -0,0 +1,1020 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/file.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sx.h> +#include <sys/time.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int donice(struct thread *td, struct proc *chgp, int n); + +static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); +#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) +static struct mtx uihashtbl_mtx; +static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; +static u_long uihash; /* size of hash table - 1 */ + +static struct uidinfo *uilookup(uid_t uid); + +/* + * Resource controls and accounting. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif +/* + * MPSAFE + */ +int +getpriority(td, uap) + struct thread *td; + register struct getpriority_args *uap; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + int error = 0; + + mtx_lock(&Giant); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) + low = td->td_ksegrp->kg_nice; + else { + p = pfind(uap->who); + if (p == NULL) + break; + if (p_cansee(td, p) == 0) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + sx_slock(&proctree_lock); + if (uap->who == 0) { + pg = td->td_proc->p_pgrp; + PGRP_LOCK(pg); + } else { + pg = pgfind(uap->who); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + break; + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + if (!p_cansee(td, p) && p->p_ksegrp.kg_nice /* XXXKSE */ < low) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pg); + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = td->td_ucred->cr_uid; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (!p_cansee(td, p) && + p->p_ucred->cr_uid == uap->who && + p->p_ksegrp.kg_nice /* XXXKSE */ < low) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + break; + + default: + error = EINVAL; + break; + } + if (low == PRIO_MAX + 1 && error == 0) + error = ESRCH; + td->td_retval[0] = low; + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setpriority(td, uap) + struct thread *td; + register struct setpriority_args *uap; +{ + struct proc *curp = td->td_proc; + register struct proc *p; + int found = 0, error = 0; + + mtx_lock(&Giant); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) { + PROC_LOCK(curp); + error = donice(td, curp, uap->prio); + PROC_UNLOCK(curp); + } else { + p = pfind(uap->who); + if (p == 0) + break; + if (p_cansee(td, p) == 0) + error = donice(td, p, uap->prio); + PROC_UNLOCK(p); + } + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + sx_slock(&proctree_lock); + if (uap->who == 0) { + pg = curp->p_pgrp; + PGRP_LOCK(pg); + } else { + pg = pgfind(uap->who); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + break; + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + if (!p_cansee(td, p)) { + error = donice(td, p, 
uap->prio); + found++; + } + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pg); + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = td->td_ucred->cr_uid; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_ucred->cr_uid == uap->who && + !p_cansee(td, p)) { + error = donice(td, p, uap->prio); + found++; + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + break; + + default: + error = EINVAL; + break; + } + if (found == 0 && error == 0) + error = ESRCH; + mtx_unlock(&Giant); + return (error); +} + +static int +donice(td, chgp, n) + struct thread *td; + register struct proc *chgp; + register int n; +{ + int error; + + PROC_LOCK_ASSERT(chgp, MA_OWNED); + if ((error = p_cansched(td, chgp))) + return (error); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_ksegrp.kg_nice /* XXXKSE */ && suser(td)) + return (EACCES); + chgp->p_ksegrp.kg_nice /* XXXKSE */ = n; + (void)resetpriority(&chgp->p_ksegrp); /* XXXKSE */ + return (0); +} + +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +rtprio(td, uap) + struct thread *td; + register struct rtprio_args *uap; +{ + struct proc *curp = td->td_proc; + register struct proc *p; + struct rtprio rtp; + int error, cierror = 0; + + /* Perform copyin before acquiring locks if needed. */ + if (uap->function == RTP_SET) + cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + + if (uap->pid == 0) { + p = curp; + PROC_LOCK(p); + } else { + p = pfind(uap->pid); + if (p == NULL) + return (ESRCH); + } + + switch (uap->function) { + case RTP_LOOKUP: + if ((error = p_cansee(td, p))) + break; + mtx_lock_spin(&sched_lock); + pri_to_rtp(&p->p_ksegrp /* XXXKSE */ , &rtp); + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if ((error = p_cansched(td, p)) || (error = cierror)) + break; + /* disallow setting rtprio in most cases if not superuser */ + if (suser(td) != 0) { + /* can't set someone else's */ + if (uap->pid) { + error = EPERM; + break; + } + /* can't set realtime priority */ +/* + * Realtime priority has to be restricted for reasons which should be + * obvious. However, for idle priority, there is a potential for + * system deadlock if an idleprio process gains a lock on a resource + * that other processes need (and the idleprio process can't run + * due to a CPU-bound normal process). Fix me! 
XXX + */ +#if 0 + if (RTP_PRIO_IS_REALTIME(rtp.type)) +#endif + if (rtp.type != RTP_PRIO_NORMAL) { + error = EPERM; + break; + } + } + mtx_lock_spin(&sched_lock); + error = rtp_to_pri(&rtp, &p->p_ksegrp); + mtx_unlock_spin(&sched_lock); + break; + default: + error = EINVAL; + break; + } + PROC_UNLOCK(p); + return (error); +} + +int +rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg) +{ + + if (rtp->prio > RTP_PRIO_MAX) + return (EINVAL); + switch (RTP_PRIO_BASE(rtp->type)) { + case RTP_PRIO_REALTIME: + kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio; + break; + case RTP_PRIO_NORMAL: + kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio; + break; + case RTP_PRIO_IDLE: + kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio; + break; + default: + return (EINVAL); + } + kg->kg_pri_class = rtp->type; + if (curthread->td_ksegrp == kg) { + curthread->td_base_pri = kg->kg_user_pri; + curthread->td_priority = kg->kg_user_pri; /* XXX dubious */ + } + return (0); +} + +void +pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp) +{ + + switch (PRI_BASE(kg->kg_pri_class)) { + case PRI_REALTIME: + rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME; + break; + case PRI_TIMESHARE: + rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE; + break; + case PRI_IDLE: + rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE; + break; + default: + break; + } + rtp->type = kg->kg_pri_class; +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osetrlimit(td, uap) + struct thread *td; + register struct osetrlimit_args *uap; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + mtx_lock(&Giant); + error = dosetrlimit(td, uap->which, &lim); + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogetrlimit(td, uap) + struct thread *td; + register struct ogetrlimit_args *uap; +{ + struct proc *p = td->td_proc; + struct orlimit olim; + int error; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + mtx_lock(&Giant); + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + error = copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim)); + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setrlimit(td, uap) + struct thread *td; + register struct __setrlimit_args *uap; +{ + struct rlimit alim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) + return (error); + mtx_lock(&Giant); + error = dosetrlimit(td, uap->which, &alim); + mtx_unlock(&Giant); + return (error); +} + +int +dosetrlimit(td, which, limp) + struct thread *td; + u_int which; + struct rlimit *limp; +{ + struct proc *p = td->td_proc; + register struct rlimit *alimp; + int error; + + GIANT_REQUIRED; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + + /* + * Preserve historical bugs by treating negative limits as unsigned. 
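+	 * A limit value that arrives negative from userland is therefore
+	 * taken to mean RLIM_INFINITY rather than being rejected with EINVAL.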
+ */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if ((error = suser_cred(td->td_ucred, PRISON_ROOT))) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_CPU: + if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) + p->p_limit->p_cpulimit = RLIM_INFINITY; + else + p->p_limit->p_cpulimit = + (rlim_t)1000000 * limp->rlim_cur; + break; + case RLIMIT_DATA: + if (limp->rlim_cur > maxdsiz) + limp->rlim_cur = maxdsiz; + if (limp->rlim_max > maxdsiz) + limp->rlim_max = maxdsiz; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > maxssiz) + limp->rlim_cur = maxssiz; + if (limp->rlim_max > maxssiz) + limp->rlim_max = maxssiz; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. If stack limit is going + * up make more accessible, if going down make inaccessible. + */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; + if (limp->rlim_cur < 1) + limp->rlim_cur = 1; + if (limp->rlim_max < 1) + limp->rlim_max = 1; + break; + } + *alimp = *limp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getrlimit(td, uap) + struct thread *td; + register struct __getrlimit_args *uap; +{ + int error; + struct proc *p = td->td_proc; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + mtx_lock(&Giant); + error = copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit)); + mtx_unlock(&Giant); + return(error); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. + */ +void +calcru(p, up, sp, ip) + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; +{ + /* {user, system, interrupt, total} {ticks, usec}; previous tu: */ + u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu; + u_int64_t uut = 0, sut = 0, iut = 0; + int s; + struct timeval tv; + struct bintime bt; + struct kse *ke; + struct ksegrp *kg; + + mtx_assert(&sched_lock, MA_OWNED); + /* XXX: why spl-protect ? 
worst case is an off-by-one report */ + + FOREACH_KSEGRP_IN_PROC(p, kg) { + /* we could accumulate per ksegrp and per process here*/ + FOREACH_KSE_IN_GROUP(kg, ke) { + s = splstatclock(); + ut = ke->ke_uticks; + st = ke->ke_sticks; + it = ke->ke_iticks; + splx(s); + + tt = ut + st + it; + if (tt == 0) { + st = 1; + tt = 1; + } + + if (ke == curthread->td_kse) { + /* + * Adjust for the current time slice. This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. + * XXXKSE use a different test due to threads on other + * processors also being 'current'. + */ + + binuptime(&bt); + bintime_sub(&bt, PCPU_PTR(switchtime)); + bintime_add(&bt, &p->p_runtime); + } else { + bt = p->p_runtime; + } + bintime2timeval(&bt, &tv); + tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec; + ptu = ke->ke_uu + ke->ke_su + ke->ke_iu; + if (tu < ptu || (int64_t)tu < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time of %ld usec for pid %d (%s)\n", + (long)tu, p->p_pid, p->p_comm); + tu = ptu; + } + + /* Subdivide tu. */ + uu = (tu * ut) / tt; + su = (tu * st) / tt; + iu = tu - uu - su; + + /* Enforce monotonicity. */ + if (uu < ke->ke_uu || su < ke->ke_su || iu < ke->ke_iu) { + if (uu < ke->ke_uu) + uu = ke->ke_uu; + else if (uu + ke->ke_su + ke->ke_iu > tu) + uu = tu - ke->ke_su - ke->ke_iu; + if (st == 0) + su = ke->ke_su; + else { + su = ((tu - uu) * st) / (st + it); + if (su < ke->ke_su) + su = ke->ke_su; + else if (uu + su + ke->ke_iu > tu) + su = tu - uu - ke->ke_iu; + } + KASSERT(uu + su + ke->ke_iu <= tu, + ("calcru: monotonisation botch 1")); + iu = tu - uu - su; + KASSERT(iu >= ke->ke_iu, + ("calcru: monotonisation botch 2")); + } + ke->ke_uu = uu; + ke->ke_su = su; + ke->ke_iu = iu; + uut += uu; + sut += su; + iut += iu; + + } /* end kse loop */ + } /* end kseg loop */ + up->tv_sec = uut / 1000000; + up->tv_usec = uut % 1000000; + sp->tv_sec = sut / 1000000; + sp->tv_usec = sut % 1000000; + if (ip != NULL) { + ip->tv_sec = iut / 1000000; + ip->tv_usec = iut % 1000000; + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getrusage(td, uap) + register struct thread *td; + register struct getrusage_args *uap; +{ + struct proc *p = td->td_proc; + register struct rusage *rup; + int error = 0; + + mtx_lock(&Giant); + + switch (uap->who) { + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + mtx_lock_spin(&sched_lock); + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + mtx_unlock_spin(&sched_lock); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + rup = NULL; + error = EINVAL; + break; + } + mtx_unlock(&Giant); + if (error == 0) { + error = copyout((caddr_t)rup, (caddr_t)uap->rusage, + sizeof (struct rusage)); + } + return(error); +} + +void +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss < ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. 
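+ * The copy-on-write break is visible in dosetrlimit() above: when
+ * p_refcnt is greater than one and PL_SHAREMOD is clear, the caller drops
+ * its reference and substitutes the private copy returned here.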
+ */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit)); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} + +/* + * Find the uidinfo structure for a uid. This structure is used to + * track the total resource consumption (process count, socket buffer + * size, etc.) for the uid and impose limits. + */ +void +uihashinit() +{ + + uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); + mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF); +} + +/* + * lookup a uidinfo struct for the parameter uid. + * uihashtbl_mtx must be locked. + */ +static struct uidinfo * +uilookup(uid) + uid_t uid; +{ + struct uihashhead *uipp; + struct uidinfo *uip; + + mtx_assert(&uihashtbl_mtx, MA_OWNED); + uipp = UIHASH(uid); + LIST_FOREACH(uip, uipp, ui_hash) + if (uip->ui_uid == uid) + break; + + return (uip); +} + +/* + * Find or allocate a struct uidinfo for a particular uid. + * Increase refcount on uidinfo struct returned. + * uifree() should be called on a struct uidinfo when released. + */ +struct uidinfo * +uifind(uid) + uid_t uid; +{ + struct uidinfo *uip; + + mtx_lock(&uihashtbl_mtx); + uip = uilookup(uid); + if (uip == NULL) { + struct uidinfo *old_uip; + + mtx_unlock(&uihashtbl_mtx); + uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); + mtx_lock(&uihashtbl_mtx); + /* + * There's a chance someone created our uidinfo while we + * were in malloc and not holding the lock, so we have to + * make sure we don't insert a duplicate uidinfo + */ + if ((old_uip = uilookup(uid)) != NULL) { + /* someone else beat us to it */ + free(uip, M_UIDINFO); + uip = old_uip; + } else { + uip->ui_mtxp = mtx_pool_alloc(); + uip->ui_uid = uid; + LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); + } + } + uihold(uip); + mtx_unlock(&uihashtbl_mtx); + return (uip); +} + +/* + * Place another refcount on a uidinfo struct. + */ +void +uihold(uip) + struct uidinfo *uip; +{ + + UIDINFO_LOCK(uip); + uip->ui_ref++; + UIDINFO_UNLOCK(uip); +} + +/*- + * Since uidinfo structs have a long lifetime, we use an + * opportunistic refcounting scheme to avoid locking the lookup hash + * for each release. + * + * If the refcount hits 0, we need to free the structure, + * which means we need to lock the hash. + * Optimal case: + * After locking the struct and lowering the refcount, if we find + * that we don't need to free, simply unlock and return. + * Suboptimal case: + * If refcount lowering results in need to free, bump the count + * back up, loose the lock and aquire the locks in the proper + * order to try again. + */ +void +uifree(uip) + struct uidinfo *uip; +{ + + /* Prepare for optimal case. */ + UIDINFO_LOCK(uip); + + if (--uip->ui_ref != 0) { + UIDINFO_UNLOCK(uip); + return; + } + + /* Prepare for suboptimal case. */ + uip->ui_ref++; + UIDINFO_UNLOCK(uip); + mtx_lock(&uihashtbl_mtx); + UIDINFO_LOCK(uip); + + /* + * We must subtract one from the count again because we backed out + * our initial subtraction before dropping the lock. + * Since another thread may have added a reference after we dropped the + * initial lock we have to test for zero again. + */ + if (--uip->ui_ref == 0) { + LIST_REMOVE(uip, ui_hash); + mtx_unlock(&uihashtbl_mtx); + if (uip->ui_sbsize != 0) + /* XXX no %qd in kernel. Truncate. 
*/ + printf("freeing uidinfo: uid = %d, sbsize = %ld\n", + uip->ui_uid, (long)uip->ui_sbsize); + if (uip->ui_proccnt != 0) + printf("freeing uidinfo: uid = %d, proccnt = %ld\n", + uip->ui_uid, uip->ui_proccnt); + UIDINFO_UNLOCK(uip); + FREE(uip, M_UIDINFO); + return; + } + + mtx_unlock(&uihashtbl_mtx); + UIDINFO_UNLOCK(uip); +} + +/* + * Change the count associated with number of processes + * a given user is using. When 'max' is 0, don't enforce a limit + */ +int +chgproccnt(uip, diff, max) + struct uidinfo *uip; + int diff; + int max; +{ + + UIDINFO_LOCK(uip); + /* don't allow them to exceed max, but allow subtraction */ + if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { + UIDINFO_UNLOCK(uip); + return (0); + } + uip->ui_proccnt += diff; + if (uip->ui_proccnt < 0) + printf("negative proccnt for uid = %d\n", uip->ui_uid); + UIDINFO_UNLOCK(uip); + return (1); +} + +/* + * Change the total socket buffer size a user has used. + */ +int +chgsbsize(uip, hiwat, to, max) + struct uidinfo *uip; + u_long *hiwat; + u_long to; + rlim_t max; +{ + rlim_t new; + int s; + + s = splnet(); + UIDINFO_LOCK(uip); + new = uip->ui_sbsize + to - *hiwat; + /* don't allow them to exceed max, but allow subtraction */ + if (to > *hiwat && new > max) { + splx(s); + UIDINFO_UNLOCK(uip); + return (0); + } + uip->ui_sbsize = new; + *hiwat = to; + if (uip->ui_sbsize < 0) + printf("negative sbsize for uid = %d\n", uip->ui_uid); + splx(s); + UIDINFO_UNLOCK(uip); + return (1); +} diff --git a/sys/kern/kern_sema.c b/sys/kern/kern_sema.c new file mode 100644 index 0000000..61435bd --- /dev/null +++ b/sys/kern/kern_sema.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Counting semaphores. + * + * Priority propagation will not generally raise the priority of semaphore + * "owners" (a misnomer in the context of semaphores), so should not be relied + * upon in combination with semaphores. 
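+ *
+ * A minimal usage sketch (hypothetical names; assumes the sema_post()/
+ * sema_wait() wrapper macros from <sys/sema.h> that supply the file/line
+ * arguments to the underscore-prefixed functions below):
+ *
+ *	struct sema items;
+ *
+ *	sema_init(&items, 0, "items");
+ *	sema_post(&items);	producer: one more item is available
+ *	sema_wait(&items);	consumer: sleep until the count is nonzero
+ *	sema_destroy(&items);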
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sema.h> + +void +sema_init(struct sema *sema, int value, const char *description) +{ + + KASSERT((value >= 0), ("%s(): negative value\n", __func__)); + + bzero(sema, sizeof(*sema)); + mtx_init(&sema->sema_mtx, description, "sema backing lock", + MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + cv_init(&sema->sema_cv, description); + sema->sema_value = value; + + CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description); +} + +void +sema_destroy(struct sema *sema) +{ + + CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema, + cv_wmesg(&sema->sema_cv)); + + KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__)); + + mtx_destroy(&sema->sema_mtx); + cv_destroy(&sema->sema_cv); +} + +void +_sema_post(struct sema *sema, const char *file, int line) +{ + + mtx_lock(&sema->sema_mtx); + sema->sema_value++; + if (sema->sema_waiters && sema->sema_value > 0) + cv_signal(&sema->sema_cv); + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + + mtx_unlock(&sema->sema_mtx); +} + +void +_sema_wait(struct sema *sema, const char *file, int line) +{ + + mtx_lock(&sema->sema_mtx); + while (sema->sema_value == 0) { + sema->sema_waiters++; + cv_wait(&sema->sema_cv, &sema->sema_mtx); + sema->sema_waiters--; + } + sema->sema_value--; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + + mtx_unlock(&sema->sema_mtx); +} + +int +_sema_timedwait(struct sema *sema, int timo, const char *file, int line) +{ + int ret, timed_out; + + mtx_lock(&sema->sema_mtx); + + /* + * A spurious wakeup will cause the timeout interval to start over. + * This isn't a big deal as long as spurious wakeups don't occur + * continuously, since the timeout period is merely a lower bound on how + * long to wait. + */ + for (timed_out = 0; sema->sema_value == 0 && timed_out == 0;) { + sema->sema_waiters++; + timed_out = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo); + sema->sema_waiters--; + } + if (sema->sema_value > 0) { + /* Success. */ + sema->sema_value--; + ret = 1; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + } else { + ret = 0; + + CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), file, line); + } + + mtx_unlock(&sema->sema_mtx); + return (ret); +} + +int +_sema_trywait(struct sema *sema, const char *file, int line) +{ + int ret; + + mtx_lock(&sema->sema_mtx); + + if (sema->sema_value > 0) { + /* Success. */ + sema->sema_value--; + ret = 1; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + } else { + ret = 0; + + CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), file, line); + } + + mtx_unlock(&sema->sema_mtx); + return (ret); +} + +int +sema_value(struct sema *sema) +{ + int ret; + + mtx_lock(&sema->sema_mtx); + ret = sema->sema_value; + mtx_unlock(&sema->sema_mtx); + return (ret); +} diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..d2cb69d --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,564 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ddb.h" +#include "opt_hw_wdog.h" +#include "opt_panic.h" +#include "opt_show_busybufs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/disklabel.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> /* smp_active */ +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/vnode.h> + +#include <machine/pcb.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <sys/signalvar.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. 
+ */ +#include <machine/stdarg.h> + +#ifdef DDB +#ifdef DDB_UNATTENDED +int debugger_on_panic = 0; +#else +int debugger_on_panic = 1; +#endif +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, "Run debugger on kernel panic"); +#endif + +int sync_on_panic = 1; +SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW, + &sync_on_panic, 0, "Do a sync before rebooting from a panic"); + +SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); + +#ifdef HW_WDOG +/* + * If there is a hardware watchdog, point this at the function needed to + * hold it off. + * It's needed when the kernel needs to do some lengthy operations. + * e.g. in wd.c when dumping core.. It's most annoying to have + * your precious core-dump only half written because the wdog kicked in. + */ +watchdog_tickle_fn wdog_tickler = NULL; +#endif /* HW_WDOG */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +int dumping; /* system is dumping */ +static struct dumperinfo dumper; /* our selected dumper */ +static struct pcb dumppcb; /* "You Are Here" sign for dump-debuggers */ + +static void boot(int) __dead2; +static void poweroff_wait(void *, int); +static void shutdown_halt(void *junk, int howto); +static void shutdown_panic(void *junk, int howto); +static void shutdown_reset(void *junk, int howto); + +/* register various local shutdown events */ +static void +shutdown_conf(void *unused) +{ + EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); +} + +SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL) + +/* + * The system call that results in a reboot + * + * MPSAFE + */ +/* ARGSUSED */ +int +reboot(struct thread *td, struct reboot_args *uap) +{ + int error; + + mtx_lock(&Giant); + if ((error = suser(td)) == 0) + boot(uap->opt); + mtx_unlock(&Giant); + return (error); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +static int shutdown_howto = 0; + +void +shutdown_nice(int howto) +{ + shutdown_howto = howto; + + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + PROC_LOCK(initproc); + psignal(initproc, SIGINT); + PROC_UNLOCK(initproc); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; + +static void +print_uptime(void) +{ + int f; + struct timespec ts; + + getnanouptime(&ts); + printf("Uptime: "); + f = 0; + if (ts.tv_sec >= 86400) { + printf("%ldd", (long)ts.tv_sec / 86400); + ts.tv_sec %= 86400; + f = 1; + } + if (f || ts.tv_sec >= 3600) { + printf("%ldh", (long)ts.tv_sec / 3600); + ts.tv_sec %= 3600; + f = 1; + } + if (f || ts.tv_sec >= 60) { + printf("%ldm", (long)ts.tv_sec / 60); + ts.tv_sec %= 60; + f = 1; + } + printf("%lds\n", (long)ts.tv_sec); +} + +static void +doadump(void) +{ + savectx(&dumppcb); + dumping++; + dumpsys(&dumper); +} + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. 
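+ * The sequence below is: run the shutdown_pre_sync handlers, sync and
+ * wait for dirty buffers unless RB_NOSYNC is set, unmount filesystems if
+ * everything flushed and we are not panicking, run the shutdown_post_sync
+ * handlers, take a crash dump if RB_DUMP (without RB_HALT) was requested
+ * and a dumper is configured, and finally run the shutdown_final handlers
+ * (halt, panic report or reset).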
+ */ +static void +boot(int howto) +{ + + /* collect extra flags that shutdown_nice might have set */ + howto |= shutdown_howto; + +#ifdef DDB + /* We are out of the debugger now. */ + db_active = 0; +#endif + +#ifdef SMP + if (smp_active) + printf("boot() called on cpu#%d\n", PCPU_GET(cpuid)); +#endif + /* + * Do any callouts that should be done BEFORE syncing the filesystems. + */ + EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); + + /* + * Now sync filesystems + */ + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy, pbusy; + int subiter; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&thread0, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = pbusy = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & B_INVAL) == 0 && + BUF_REFCNT(bp) > 0) { + nbusy++; + } else if ((bp->b_flags & (B_DELWRI | B_INVAL)) + == B_DELWRI) { + /* bawrite(bp);*/ + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + if (nbusy < pbusy) + iter = 0; + pbusy = nbusy; + sync(&thread0, NULL); + if (curthread != NULL) { + DROP_GIANT(); + for (subiter = 0; subiter < 50 * iter; subiter++) { + mtx_lock_spin(&sched_lock); + setrunqueue(curthread); + curthread->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); /* Allow interrupt threads to run */ + mtx_unlock_spin(&sched_lock); + DELAY(1000); + } + PICKUP_GIANT(); + } else + DELAY(50000 * iter); + } + printf("\n"); + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (((bp->b_flags&B_INVAL) == 0 && BUF_REFCNT(bp)) || + ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI)) { + if (bp->b_dev == NODEV) { + TAILQ_REMOVE(&mountlist, + bp->b_vp->v_mount, mnt_list); + continue; + } + nbusy++; +#if defined(SHOW_BUSYBUFS) || defined(DIAGNOSTIC) + printf( + "%d: dev:%s, flags:%08lx, blkno:%ld, lblkno:%ld\n", + nbusy, devtoname(bp->b_dev), + bp->b_flags, (long)bp->b_blkno, + (long)bp->b_lblkno); +#endif + } + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("giving up on %d buffers\n", nbusy); + DELAY(5000000); /* 5 seconds */ + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + + print_uptime(); + + /* + * Ok, now do things that assume all filesystem activity has + * been completed. + */ + EVENTHANDLER_INVOKE(shutdown_post_sync, howto); + splhigh(); + if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && + !cold && dumper.dumper != NULL && !dumping) + doadump(); + + /* Now that we're going to really halt the system... */ + EVENTHANDLER_INVOKE(shutdown_final, howto); + + for(;;) ; /* safety against shutdown_reset not working */ + /* NOTREACHED */ +} + +/* + * If the shutdown was a clean halt, behave accordingly. 
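+ * That is: announce the halt, wait for a key press and then return so
+ * that the reset handler registered after this one can reboot the
+ * machine; if there is no console to read from, just call cpu_halt().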
+ */ +static void +shutdown_halt(void *junk, int howto) +{ + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + howto &= ~RB_HALT; + break; + } + } +} + +/* + * Check to see if the system paniced, pause and then reboot + * according to the specified delay. + */ +static void +shutdown_panic(void *junk, int howto) +{ + int loop; + + if (howto & RB_DUMP) { + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + printf("Automatic reboot in %d seconds - " + "press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; + loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? */ + if (cncheckc() != -1) + break; + } + if (!loop) + return; + } + } else { /* zero time specified - reboot NOW */ + return; + } + printf("--> Press a key on the console to reboot,\n"); + printf("--> or switch off the system now.\n"); + cngetc(); + } +} + +/* + * Everything done, now reset + */ +static void +shutdown_reset(void *junk, int howto) +{ + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + /* NOTREACHED */ /* assuming reset worked */ +} + +#ifdef SMP +static u_int panic_cpu = NOCPU; +#endif + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + * + * MPSAFE + */ +void +panic(const char *fmt, ...) +{ + int bootopt; + va_list ap; + static char buf[256]; + +#ifdef SMP + /* + * We don't want multiple CPU's to panic at the same time, so we + * use panic_cpu as a simple spinlock. We have to keep checking + * panic_cpu if we are spinning in case the panic on the first + * CPU is canceled. + */ + if (panic_cpu != PCPU_GET(cpuid)) + while (atomic_cmpset_int(&panic_cpu, NOCPU, + PCPU_GET(cpuid)) == 0) + while (panic_cpu != NOCPU) + ; /* nothing */ +#endif + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + va_start(ap, fmt); + (void)vsnprintf(buf, sizeof(buf), fmt, ap); + if (panicstr == fmt) + panicstr = buf; + va_end(ap); + printf("panic: %s\n", buf); +#ifdef SMP + /* two separate prints in case of an unmapped page and trap */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); +#ifdef APIC_IO + printf("lapic.id = %08x\n", lapic.id); +#endif +#endif + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#ifdef RESTARTABLE_PANICS + /* See if the user aborted the panic, in which case we continue. */ + if (panicstr == NULL) { +#ifdef SMP + atomic_store_rel_int(&panic_cpu, NOCPU); +#endif + return; + } +#endif +#endif + if (!sync_on_panic) + bootopt |= RB_NOSYNC; + boot(bootopt); +} + +/* + * Support for poweroff delay. + */ +#ifndef POWEROFF_DELAY +# define POWEROFF_DELAY 5000 +#endif +static int poweroff_delay = POWEROFF_DELAY; + +SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, + &poweroff_delay, 0, ""); + +static void +poweroff_wait(void *junk, int howto) +{ + if(!(howto & RB_POWEROFF) || poweroff_delay <= 0) + return; + DELAY(poweroff_delay * 1000); +} + +/* + * Some system processes (e.g. 
syncer) need to be stopped at appropriate + * points in their main loops prior to a system shutdown, so that they + * won't interfere with the shutdown process (e.g. by holding a disk buf + * to cause sync to fail). For each of these system processes, register + * shutdown_kproc() as a handler for one of shutdown events. + */ +static int kproc_shutdown_wait = 60; +SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, + &kproc_shutdown_wait, 0, ""); + +void +kproc_shutdown(void *arg, int howto) +{ + struct proc *p; + int error; + + if (panicstr) + return; + + p = (struct proc *)arg; + printf("Waiting (max %d seconds) for system process `%s' to stop...", + kproc_shutdown_wait, p->p_comm); + error = kthread_suspend(p, kproc_shutdown_wait * hz); + + if (error == EWOULDBLOCK) + printf("timed out\n"); + else + printf("stopped\n"); +} + +/* Registration of dumpers */ +int +set_dumper(struct dumperinfo *di) +{ + if (di == NULL) { + bzero(&dumper, sizeof dumper); + return (0); + } + if (dumper.dumper != NULL) + return (EBUSY); + dumper = *di; + return (0); +} + +#if defined(__powerpc__) || defined(__sparc64__) +void +dumpsys(struct dumperinfo *di __unused) +{ + + printf("Kernel dumps not implemented on this architecture\n"); +} +#endif diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 0000000..8af0280 --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,2153 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/event.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/acct.h> +#include <sys/fcntl.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/wait.h> +#include <sys/ktr.h> +#include <sys/ktrace.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/syslog.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/unistd.h> + +#include <machine/cpu.h> + +#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ + +static int coredump(struct thread *); +static int do_sigaction(struct proc *p, int sig, struct sigaction *act, + struct sigaction *oact, int old); +static int do_sigprocmask(struct proc *p, int how, sigset_t *set, + sigset_t *oset, int old); +static char *expand_name(const char *, uid_t, pid_t); +static int killpg1(struct thread *td, int sig, int pgid, int all); +static int sig_ffs(sigset_t *set); +static int sigprop(int sig); +static void stop(struct proc *); + +static int filt_sigattach(struct knote *kn); +static void filt_sigdetach(struct knote *kn); +static int filt_signal(struct knote *kn, long hint); + +struct filterops sig_filtops = + { 0, filt_sigattach, filt_sigdetach, filt_signal }; + +static int kern_logsigexit = 1; +SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, + &kern_logsigexit, 0, + "Log processes quitting on abnormal signals to syslog(3)"); + +/* + * Policy -- Can ucred cr1 send SIGIO to process cr2? + * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG + * in the right situations. + */ +#define CANSIGIO(cr1, cr2) \ + ((cr1)->cr_uid == 0 || \ + (cr1)->cr_ruid == (cr2)->cr_ruid || \ + (cr1)->cr_uid == (cr2)->cr_ruid || \ + (cr1)->cr_ruid == (cr2)->cr_uid || \ + (cr1)->cr_uid == (cr2)->cr_uid) + +int sugid_coredump; +SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, + &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); + +static int do_coredump = 1; +SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, + &do_coredump, 0, "Enable/Disable coredumps"); + +/* + * Signal properties and actions. 
+ * The array below categorizes the signals and their default actions + * according to the following properties: + */ +#define SA_KILL 0x01 /* terminates process by default */ +#define SA_CORE 0x02 /* ditto and coredumps */ +#define SA_STOP 0x04 /* suspend process */ +#define SA_TTYSTOP 0x08 /* ditto, from tty */ +#define SA_IGNORE 0x10 /* ignore by default */ +#define SA_CONT 0x20 /* continue if suspended */ +#define SA_CANTMASK 0x40 /* non-maskable, catchable */ + +static int sigproptbl[NSIG] = { + SA_KILL, /* SIGHUP */ + SA_KILL, /* SIGINT */ + SA_KILL|SA_CORE, /* SIGQUIT */ + SA_KILL|SA_CORE, /* SIGILL */ + SA_KILL|SA_CORE, /* SIGTRAP */ + SA_KILL|SA_CORE, /* SIGABRT */ + SA_KILL|SA_CORE, /* SIGEMT */ + SA_KILL|SA_CORE, /* SIGFPE */ + SA_KILL, /* SIGKILL */ + SA_KILL|SA_CORE, /* SIGBUS */ + SA_KILL|SA_CORE, /* SIGSEGV */ + SA_KILL|SA_CORE, /* SIGSYS */ + SA_KILL, /* SIGPIPE */ + SA_KILL, /* SIGALRM */ + SA_KILL, /* SIGTERM */ + SA_IGNORE, /* SIGURG */ + SA_STOP, /* SIGSTOP */ + SA_STOP|SA_TTYSTOP, /* SIGTSTP */ + SA_IGNORE|SA_CONT, /* SIGCONT */ + SA_IGNORE, /* SIGCHLD */ + SA_STOP|SA_TTYSTOP, /* SIGTTIN */ + SA_STOP|SA_TTYSTOP, /* SIGTTOU */ + SA_IGNORE, /* SIGIO */ + SA_KILL, /* SIGXCPU */ + SA_KILL, /* SIGXFSZ */ + SA_KILL, /* SIGVTALRM */ + SA_KILL, /* SIGPROF */ + SA_IGNORE, /* SIGWINCH */ + SA_IGNORE, /* SIGINFO */ + SA_KILL, /* SIGUSR1 */ + SA_KILL, /* SIGUSR2 */ +}; + +/* + * Determine signal that should be delivered to process p, the current + * process, 0 if none. If there is a pending stop signal with default + * action, the process stops in issignal(). + * + * MP SAFE. + */ +int +cursig(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&sched_lock, MA_NOTOWNED); + return (SIGPENDING(p) ? issignal(p) : 0); +} + +/* + * Arrange for ast() to handle unmasked pending signals on return to user + * mode. This must be called whenever a signal is added to p_siglist or + * unmasked in p_sigmask. 
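+ * It does so by setting PS_NEEDSIGCHK on the process and KEF_ASTPENDING
+ * on the KSE, under sched_lock, whenever a signal is actually pending.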
+ */ +void +signotify(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_lock_spin(&sched_lock); + if (SIGPENDING(p)) { + p->p_sflag |= PS_NEEDSIGCHK; + p->p_kse.ke_flags |= KEF_ASTPENDING; /* XXXKSE */ + } + mtx_unlock_spin(&sched_lock); +} + +static __inline int +sigprop(int sig) +{ + + if (sig > 0 && sig < NSIG) + return (sigproptbl[_SIG_IDX(sig)]); + return (0); +} + +static __inline int +sig_ffs(sigset_t *set) +{ + int i; + + for (i = 0; i < _SIG_WORDS; i++) + if (set->__bits[i]) + return (ffs(set->__bits[i]) + (i * 32)); + return (0); +} + +/* + * do_sigaction + * sigaction + * osigaction + */ +static int +do_sigaction(p, sig, act, oact, old) + struct proc *p; + register int sig; + struct sigaction *act, *oact; + int old; +{ + register struct sigacts *ps; + + if (!_SIG_VALID(sig)) + return (EINVAL); + + PROC_LOCK(p); + ps = p->p_sigacts; + if (oact) { + oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; + oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; + oact->sa_flags = 0; + if (SIGISMEMBER(ps->ps_sigonstack, sig)) + oact->sa_flags |= SA_ONSTACK; + if (!SIGISMEMBER(ps->ps_sigintr, sig)) + oact->sa_flags |= SA_RESTART; + if (SIGISMEMBER(ps->ps_sigreset, sig)) + oact->sa_flags |= SA_RESETHAND; + if (SIGISMEMBER(ps->ps_signodefer, sig)) + oact->sa_flags |= SA_NODEFER; + if (SIGISMEMBER(ps->ps_siginfo, sig)) + oact->sa_flags |= SA_SIGINFO; + if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDSTOP) + oact->sa_flags |= SA_NOCLDSTOP; + if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDWAIT) + oact->sa_flags |= SA_NOCLDWAIT; + } + if (act) { + if ((sig == SIGKILL || sig == SIGSTOP) && + act->sa_handler != SIG_DFL) { + PROC_UNLOCK(p); + return (EINVAL); + } + + /* + * Change setting atomically. + */ + + ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; + SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); + if (act->sa_flags & SA_SIGINFO) { + ps->ps_sigact[_SIG_IDX(sig)] = + (__sighandler_t *)act->sa_sigaction; + SIGADDSET(ps->ps_siginfo, sig); + } else { + ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; + SIGDELSET(ps->ps_siginfo, sig); + } + if (!(act->sa_flags & SA_RESTART)) + SIGADDSET(ps->ps_sigintr, sig); + else + SIGDELSET(ps->ps_sigintr, sig); + if (act->sa_flags & SA_ONSTACK) + SIGADDSET(ps->ps_sigonstack, sig); + else + SIGDELSET(ps->ps_sigonstack, sig); + if (act->sa_flags & SA_RESETHAND) + SIGADDSET(ps->ps_sigreset, sig); + else + SIGDELSET(ps->ps_sigreset, sig); + if (act->sa_flags & SA_NODEFER) + SIGADDSET(ps->ps_signodefer, sig); + else + SIGDELSET(ps->ps_signodefer, sig); +#ifdef COMPAT_SUNOS + if (act->sa_flags & SA_USERTRAMP) + SIGADDSET(ps->ps_usertramp, sig); + else + SIGDELSET(ps->ps_usertramp, sig); +#endif + if (sig == SIGCHLD) { + if (act->sa_flags & SA_NOCLDSTOP) + p->p_procsig->ps_flag |= PS_NOCLDSTOP; + else + p->p_procsig->ps_flag &= ~PS_NOCLDSTOP; + if (act->sa_flags & SA_NOCLDWAIT) { + /* + * Paranoia: since SA_NOCLDWAIT is implemented + * by reparenting the dying child to PID 1 (and + * trust it to reap the zombie), PID 1 itself + * is forbidden to set SA_NOCLDWAIT. + */ + if (p->p_pid == 1) + p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; + else + p->p_procsig->ps_flag |= PS_NOCLDWAIT; + } else + p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; + if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) + p->p_procsig->ps_flag |= PS_CLDSIGIGN; + else + p->p_procsig->ps_flag &= ~PS_CLDSIGIGN; + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to + * ignore. 
However, don't put SIGCONT in p_sigignore, as we + * have to restart the process. + */ + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || + (sigprop(sig) & SA_IGNORE && + ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { + /* never to be seen again */ + SIGDELSET(p->p_siglist, sig); + if (sig != SIGCONT) + /* easier in psignal */ + SIGADDSET(p->p_sigignore, sig); + SIGDELSET(p->p_sigcatch, sig); + } else { + SIGDELSET(p->p_sigignore, sig); + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) + SIGDELSET(p->p_sigcatch, sig); + else + SIGADDSET(p->p_sigcatch, sig); + } +#ifdef COMPAT_43 + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || + ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || !old) + SIGDELSET(ps->ps_osigset, sig); + else + SIGADDSET(ps->ps_osigset, sig); +#endif + } + PROC_UNLOCK(p); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int sig; + struct sigaction *act; + struct sigaction *oact; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigaction(td, uap) + struct thread *td; + register struct sigaction_args *uap; +{ + struct proc *p = td->td_proc; + struct sigaction act, oact; + register struct sigaction *actp, *oactp; + int error; + + mtx_lock(&Giant); + + actp = (uap->act != NULL) ? &act : NULL; + oactp = (uap->oact != NULL) ? &oact : NULL; + if (actp) { + error = copyin(uap->act, actp, sizeof(act)); + if (error) + goto done2; + } + error = do_sigaction(p, uap->sig, actp, oactp, 0); + if (oactp && !error) { + error = copyout(oactp, uap->oact, sizeof(oact)); + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigaction_args { + int signum; + struct osigaction *nsa; + struct osigaction *osa; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigaction(td, uap) + struct thread *td; + register struct osigaction_args *uap; +{ + struct proc *p = td->td_proc; + struct osigaction sa; + struct sigaction nsa, osa; + register struct sigaction *nsap, *osap; + int error; + + if (uap->signum <= 0 || uap->signum >= ONSIG) + return (EINVAL); + + nsap = (uap->nsa != NULL) ? &nsa : NULL; + osap = (uap->osa != NULL) ? &osa : NULL; + + mtx_lock(&Giant); + + if (nsap) { + error = copyin(uap->nsa, &sa, sizeof(sa)); + if (error) + goto done2; + nsap->sa_handler = sa.sa_handler; + nsap->sa_flags = sa.sa_flags; + OSIG2SIG(sa.sa_mask, nsap->sa_mask); + } + error = do_sigaction(p, uap->signum, nsap, osap, 1); + if (osap && !error) { + sa.sa_handler = osap->sa_handler; + sa.sa_flags = osap->sa_flags; + SIG2OSIG(osap->sa_mask, sa.sa_mask); + error = copyout(&sa, uap->osa, sizeof(sa)); + } +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + PROC_LOCK(p); + for (i = 1; i <= NSIG; i++) + if (sigprop(i) & SA_IGNORE && i != SIGCONT) + SIGADDSET(p->p_sigignore, i); + PROC_UNLOCK(p); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps; + register int sig; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). 
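+	 * Caught signals are reset to SIG_DFL; those whose default action
+	 * is to ignore (other than SIGCONT) are added to p_sigignore and
+	 * removed from p_siglist.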
+ */ + PROC_LOCK_ASSERT(p, MA_OWNED); + ps = p->p_sigacts; + while (SIGNOTEMPTY(p->p_sigcatch)) { + sig = sig_ffs(&p->p_sigcatch); + SIGDELSET(p->p_sigcatch, sig); + if (sigprop(sig) & SA_IGNORE) { + if (sig != SIGCONT) + SIGADDSET(p->p_sigignore, sig); + SIGDELSET(p->p_siglist, sig); + } + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + p->p_sigstk.ss_flags = SS_DISABLE; + p->p_sigstk.ss_size = 0; + p->p_sigstk.ss_sp = 0; + p->p_flag &= ~P_ALTSTACK; + /* + * Reset no zombies if child dies flag as Solaris does. + */ + p->p_procsig->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); + if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) + ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; +} + +/* + * do_sigprocmask() + * + * Manipulate signal mask. + */ +static int +do_sigprocmask(p, how, set, oset, old) + struct proc *p; + int how; + sigset_t *set, *oset; + int old; +{ + int error; + + PROC_LOCK(p); + if (oset != NULL) + *oset = p->p_sigmask; + + error = 0; + if (set != NULL) { + switch (how) { + case SIG_BLOCK: + SIG_CANTMASK(*set); + SIGSETOR(p->p_sigmask, *set); + break; + case SIG_UNBLOCK: + SIGSETNAND(p->p_sigmask, *set); + signotify(p); + break; + case SIG_SETMASK: + SIG_CANTMASK(*set); + if (old) + SIGSETLO(p->p_sigmask, *set); + else + p->p_sigmask = *set; + signotify(p); + break; + default: + error = EINVAL; + break; + } + } + PROC_UNLOCK(p); + return (error); +} + +/* + * sigprocmask() - MP SAFE (XXXKSE not under KSE it isn't) + */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + const sigset_t *set; + sigset_t *oset; +}; +#endif +int +sigprocmask(td, uap) + register struct thread *td; + struct sigprocmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set, oset; + sigset_t *setp, *osetp; + int error; + + setp = (uap->set != NULL) ? &set : NULL; + osetp = (uap->oset != NULL) ? 
&oset : NULL; + if (setp) { + error = copyin(uap->set, setp, sizeof(set)); + if (error) + return (error); + } + error = do_sigprocmask(p, uap->how, setp, osetp, 0); + if (osetp && !error) { + error = copyout(osetp, uap->oset, sizeof(oset)); + } + return (error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +/* + * osigprocmask() - MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct osigprocmask_args { + int how; + osigset_t mask; +}; +#endif +int +osigprocmask(td, uap) + register struct thread *td; + struct osigprocmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set, oset; + int error; + + OSIG2SIG(uap->mask, set); + error = do_sigprocmask(p, uap->how, &set, &oset, 1); + SIG2OSIG(oset, td->td_retval[0]); + return (error); +} +#endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + sigset_t *set; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigpending(td, uap) + struct thread *td; + struct sigpending_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t siglist; + int error; + + mtx_lock(&Giant); + PROC_LOCK(p); + siglist = p->p_siglist; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + error = copyout(&siglist, uap->set, sizeof(sigset_t)); + return(error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigpending_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigpending(td, uap) + struct thread *td; + struct osigpending_args *uap; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_siglist, td->td_retval[0]); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. + */ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigvec(td, uap) + struct thread *td; + register struct osigvec_args *uap; +{ + struct proc *p = td->td_proc; + struct sigvec vec; + struct sigaction nsa, osa; + register struct sigaction *nsap, *osap; + int error; + + if (uap->signum <= 0 || uap->signum >= ONSIG) + return (EINVAL); + nsap = (uap->nsv != NULL) ? &nsa : NULL; + osap = (uap->osv != NULL) ? 
&osa : NULL; + if (nsap) { + error = copyin(uap->nsv, &vec, sizeof(vec)); + if (error) + return (error); + nsap->sa_handler = vec.sv_handler; + OSIG2SIG(vec.sv_mask, nsap->sa_mask); + nsap->sa_flags = vec.sv_flags; + nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ +#ifdef COMPAT_SUNOS + nsap->sa_flags |= SA_USERTRAMP; +#endif + } + mtx_lock(&Giant); + error = do_sigaction(p, uap->signum, nsap, osap, 1); + mtx_unlock(&Giant); + if (osap && !error) { + vec.sv_handler = osap->sa_handler; + SIG2OSIG(osap->sa_mask, vec.sv_mask); + vec.sv_flags = osap->sa_flags; + vec.sv_flags &= ~SA_NOCLDWAIT; + vec.sv_flags ^= SA_RESTART; +#ifdef COMPAT_SUNOS + vec.sv_flags &= ~SA_NOCLDSTOP; +#endif + error = copyout(&vec, uap->osv, sizeof(vec)); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif +/* + * MPSAFE + */ +int +osigblock(td, uap) + register struct thread *td; + struct osigblock_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set; + + OSIG2SIG(uap->mask, set); + SIG_CANTMASK(set); + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_sigmask, td->td_retval[0]); + SIGSETOR(p->p_sigmask, set); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif +/* + * MPSAFE + */ +int +osigsetmask(td, uap) + struct thread *td; + struct osigsetmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set; + + OSIG2SIG(uap->mask, set); + SIG_CANTMASK(set); + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_sigmask, td->td_retval[0]); + SIGSETLO(p->p_sigmask, set); + signotify(p); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + ***** XXXKSE this doesn't make sense under KSE. + ***** Do we suspend the thread or all threads in the process? + ***** How do we suspend threads running NOW on another processor? + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + const sigset_t *sigmask; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigsuspend(td, uap) + struct thread *td; + struct sigsuspend_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t mask; + register struct sigacts *ps; + int error; + + error = copyin(uap->sigmask, &mask, sizeof(mask)); + if (error) + return (error); + + /* + * When returning from sigsuspend, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ + mtx_lock(&Giant); + PROC_LOCK(p); + ps = p->p_sigacts; + p->p_oldsigmask = p->p_sigmask; + p->p_flag |= P_OLDMASK; + + SIG_CANTMASK(mask); + p->p_sigmask = mask; + signotify(p); + while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + /* always return EINTR rather than ERESTART... 
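+	 * POSIX requires sigsuspend() to fail with EINTR once a handler
+	 * has run; returning ERESTART would restart the call and simply
+	 * block again on the temporary mask.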
*/ + return (EINTR); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigsuspend_args { + osigset_t mask; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigsuspend(td, uap) + struct thread *td; + struct osigsuspend_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t mask; + register struct sigacts *ps; + + mtx_lock(&Giant); + PROC_LOCK(p); + ps = p->p_sigacts; + p->p_oldsigmask = p->p_sigmask; + p->p_flag |= P_OLDMASK; + OSIG2SIG(uap->mask, mask); + SIG_CANTMASK(mask); + SIGSETLO(p->p_sigmask, mask); + signotify(p); + while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) + /* void */; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + /* always return EINTR rather than ERESTART... */ + return (EINTR); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigstack(td, uap) + struct thread *td; + register struct osigstack_args *uap; +{ + struct proc *p = td->td_proc; + struct sigstack ss; + int error = 0; + + mtx_lock(&Giant); + + if (uap->oss != NULL) { + PROC_LOCK(p); + ss.ss_sp = p->p_sigstk.ss_sp; + ss.ss_onstack = sigonstack(cpu_getstack(td)); + PROC_UNLOCK(p); + error = copyout(&ss, uap->oss, sizeof(struct sigstack)); + if (error) + goto done2; + } + + if (uap->nss != NULL) { + if ((error = copyin(uap->nss, &ss, sizeof(ss))) != 0) + goto done2; + PROC_LOCK(p); + p->p_sigstk.ss_sp = ss.ss_sp; + p->p_sigstk.ss_size = 0; + p->p_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; + p->p_flag |= P_ALTSTACK; + PROC_UNLOCK(p); + } +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + stack_t *ss; + stack_t *oss; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigaltstack(td, uap) + struct thread *td; + register struct sigaltstack_args *uap; +{ + struct proc *p = td->td_proc; + stack_t ss; + int oonstack; + int error = 0; + + mtx_lock(&Giant); + + oonstack = sigonstack(cpu_getstack(td)); + + if (uap->oss != NULL) { + PROC_LOCK(p); + ss = p->p_sigstk; + ss.ss_flags = (p->p_flag & P_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + PROC_UNLOCK(p); + if ((error = copyout(&ss, uap->oss, sizeof(stack_t))) != 0) + goto done2; + } + + if (uap->ss != NULL) { + if (oonstack) { + error = EPERM; + goto done2; + } + if ((error = copyin(uap->ss, &ss, sizeof(ss))) != 0) + goto done2; + if ((ss.ss_flags & ~SS_DISABLE) != 0) { + error = EINVAL; + goto done2; + } + if (!(ss.ss_flags & SS_DISABLE)) { + if (ss.ss_size < p->p_sysent->sv_minsigstksz) { + error = ENOMEM; + goto done2; + } + PROC_LOCK(p); + p->p_sigstk = ss; + p->p_flag |= P_ALTSTACK; + PROC_UNLOCK(p); + } else { + PROC_LOCK(p); + p->p_flag &= ~P_ALTSTACK; + PROC_UNLOCK(p); + } + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. 
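+ * The pid encodings handled here and in kill() below correspond roughly
+ * to the userland view:
+ *	kill(pid, sig)		pid > 0: signal that single process
+ *	kill(0, sig)		signal the caller's own process group
+ *	kill(-pgid, sig)	signal process group pgid
+ *	kill(-1, sig)		broadcast to every process we may signal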
+ */ +int +killpg1(td, sig, pgid, all) + register struct thread *td; + int sig, pgid, all; +{ + register struct proc *p; + struct pgrp *pgrp; + int nfound = 0; + + if (all) { + /* + * broadcast + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == td->td_proc) { + PROC_UNLOCK(p); + continue; + } + if (p_cansignal(td, p, sig) == 0) { + nfound++; + if (sig) + psignal(p, sig); + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + } else { + sx_slock(&proctree_lock); + if (pgid == 0) { + /* + * zero pgid means send to my process group. + */ + pgrp = td->td_proc->p_pgrp; + PGRP_LOCK(pgrp); + } else { + pgrp = pgfind(pgid); + if (pgrp == NULL) { + sx_sunlock(&proctree_lock); + return (ESRCH); + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) { + PROC_UNLOCK(p); + continue; + } + if (p->p_stat == SZOMB) { + PROC_UNLOCK(p); + continue; + } + if (p_cansignal(td, p, sig) == 0) { + nfound++; + if (sig) + psignal(p, sig); + } + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pgrp); + } + return (nfound ? 0 : ESRCH); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +kill(td, uap) + register struct thread *td; + register struct kill_args *uap; +{ + register struct proc *p; + int error = 0; + + if ((u_int)uap->signum > _SIG_MAXSIG) + return (EINVAL); + + mtx_lock(&Giant); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) { + error = ESRCH; + } else if ((error = p_cansignal(td, p, uap->signum)) != 0) { + PROC_UNLOCK(p); + } else { + if (uap->signum) + psignal(p, uap->signum); + PROC_UNLOCK(p); + error = 0; + } + } else { + switch (uap->pid) { + case -1: /* broadcast signal */ + error = killpg1(td, uap->signum, 0, 1); + break; + case 0: /* signal own process group */ + error = killpg1(td, uap->signum, 0, 0); + break; + default: /* negative explicit process group */ + error = killpg1(td, uap->signum, -uap->pid, 0); + break; + } + } + mtx_unlock(&Giant); + return(error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +okillpg(td, uap) + struct thread *td; + register struct okillpg_args *uap; +{ + int error; + + if ((u_int)uap->signum > _SIG_MAXSIG) + return (EINVAL); + mtx_lock(&Giant); + error = killpg1(td, uap->signum, uap->pgid, 0); + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Send a signal to a process group. + */ +void +gsignal(pgid, sig) + int pgid, sig; +{ + struct pgrp *pgrp; + + if (pgid != 0) { + sx_slock(&proctree_lock); + pgrp = pgfind(pgid); + sx_sunlock(&proctree_lock); + if (pgrp != NULL) { + pgsignal(pgrp, sig, 0); + PGRP_UNLOCK(pgrp); + } + } +} + +/* + * Send a signal to a process group. If checktty is 1, + * limit to members which have a controlling terminal. + */ +void +pgsignal(pgrp, sig, checkctty) + struct pgrp *pgrp; + int sig, checkctty; +{ + register struct proc *p; + + if (pgrp) { + PGRP_LOCK_ASSERT(pgrp, MA_OWNED); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (checkctty == 0 || p->p_flag & P_CONTROLT) + psignal(p, sig); + PROC_UNLOCK(p); + } + } +} + +/* + * Send a signal caused by a trap to the current process. + * If it will be caught immediately, deliver it with correct code. 
+ * Otherwise, post it normally. + * + * MPSAFE + */ +void +trapsignal(p, sig, code) + struct proc *p; + register int sig; + u_long code; +{ + register struct sigacts *ps = p->p_sigacts; + + PROC_LOCK(p); + if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(p->p_sigcatch, sig) && + !SIGISMEMBER(p->p_sigmask, sig)) { + p->p_stats->p_ru.ru_nsignals++; +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_PSIG)) + ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], + &p->p_sigmask, code); +#endif + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig, + &p->p_sigmask, code); + SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); + if (!SIGISMEMBER(ps->ps_signodefer, sig)) + SIGADDSET(p->p_sigmask, sig); + if (SIGISMEMBER(ps->ps_sigreset, sig)) { + /* + * See do_sigaction() for origin of this code. + */ + SIGDELSET(p->p_sigcatch, sig); + if (sig != SIGCONT && + sigprop(sig) & SA_IGNORE) + SIGADDSET(p->p_sigignore, sig); + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + } else { + p->p_code = code; /* XXX for core dump/debugger */ + p->p_sig = sig; /* XXX to verify code */ + psignal(p, sig); + } + PROC_UNLOCK(p); +} + +/* + * Send the signal to the process. If the signal has an action, the action + * is usually performed by the target process rather than the caller; we add + * the signal to the set of pending signals for the process. + * + * Exceptions: + * o When a stop signal is sent to a sleeping process that takes the + * default action, the process is stopped without awakening it. + * o SIGCONT restarts stopped processes (or puts them back to sleep) + * regardless of the signal action (eg, blocked or ignored). + * + * Other ignored signals are discarded immediately. + */ +void +psignal(p, sig) + register struct proc *p; + register int sig; +{ + register int prop; + register sig_t action; + struct thread *td; +#ifdef SMP + struct ksegrp *kg; +#endif + + KASSERT(_SIG_VALID(sig), + ("psignal(): invalid signal %d\n", sig)); + + PROC_LOCK_ASSERT(p, MA_OWNED); + KNOTE(&p->p_klist, NOTE_SIGNAL | sig); + + prop = sigprop(sig); + + /* + * If proc is traced, always give parent a chance; + * if signal event is tracked by procfs, give *that* + * a chance, as well. + */ + if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) { + action = SIG_DFL; + } else { + /* + * If the signal is being ignored, + * then we forget about it immediately. + * (Note: we don't set SIGCONT in p_sigignore, + * and if it is set to SIG_IGN, + * action will be SIG_DFL here.) + */ + if (SIGISMEMBER(p->p_sigignore, sig) || (p->p_flag & P_WEXIT)) + return; + if (SIGISMEMBER(p->p_sigmask, sig)) + action = SIG_HOLD; + else if (SIGISMEMBER(p->p_sigcatch, sig)) + action = SIG_CATCH; + else + action = SIG_DFL; + } + + /* + * bring the priority of a process up if we want it to get + * killed in this lifetime. + * XXXKSE think if a better way to do this. + * + * What we need to do is see if there is a thread that will + * be able to accept the signal. e.g. + * FOREACH_THREAD_IN_PROC() { + * if runnable, we're done + * else pick one at random. + * } + */ + /* XXXKSE + * For now there is one thread per proc. + * Effectively select one sucker thread.. 
+ */ + td = FIRST_THREAD_IN_PROC(p); + mtx_lock_spin(&sched_lock); + if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) && + (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0)) + p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + + if (prop & SA_CONT) + SIG_STOPSIGMASK(p->p_siglist); + + if (prop & SA_STOP) { + /* + * If sending a tty stop signal to a member of an orphaned + * process group, discard the signal here if the action + * is default; don't stop the process below if sleeping, + * and don't clear any pending SIGCONT. + */ + if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && + action == SIG_DFL) + return; + SIG_CONTSIGMASK(p->p_siglist); + } + SIGADDSET(p->p_siglist, sig); + mtx_lock_spin(&sched_lock); + signotify(p); + + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) { + mtx_unlock_spin(&sched_lock); + return; + } + + switch (p->p_stat) { + + case SSLEEP: + /* + * If process is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((td->td_flags & TDF_SINTR) == 0) + goto out; + /* + * Process is sleeping and traced... make it runnable + * so it can discover the signal in issignal() and stop + * for the parent. + */ + if (p->p_flag & P_TRACED) + goto run; + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + goto out; + } + /* + * When a sleeping process receives a stop + * signal, process immediately if possible. + * All other (caught or default) signals + * cause the process to run. + */ + if (prop & SA_STOP) { + if (action != SIG_DFL) + goto runfast; + /* + * If a child holding parent blocked, + * stopping could cause deadlock. + */ + if (p->p_flag & P_PPWAIT) + goto out; + mtx_unlock_spin(&sched_lock); + SIGDELSET(p->p_siglist, sig); + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + goto out; + } else + goto runfast; + /* NOTREACHED */ + + case SSTOP: + /* + * If traced process is already stopped, + * then no further action is necessary. + */ + if (p->p_flag & P_TRACED) + goto out; + + /* + * Kill signal always sets processes running. + */ + if (sig == SIGKILL) + goto runfast; + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist, as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, then it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + if (action == SIG_DFL) + SIGDELSET(p->p_siglist, sig); + if (action == SIG_CATCH) + goto runfast; + /* + * XXXKSE + * do this for each thread. 
+ */ + if (p->p_flag & P_KSES) { + mtx_assert(&sched_lock, + MA_OWNED | MA_NOTRECURSED); + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan == NULL) { + setrunnable(td); /* XXXKSE */ + } else { + /* mark it as sleeping */ + } + } + } else { + p->p_flag |= P_CONTINUED; + wakeup((caddr_t)p->p_pptr); + if (td->td_wchan == NULL) + goto run; + p->p_stat = SSLEEP; + } + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again. + * (If we did the shell could get confused.) + */ + SIGDELSET(p->p_siglist, sig); + goto out; + } + + /* + * If process is sleeping interruptibly, then simulate a + * wakeup so that when it is continued, it will be made + * runnable and can look at the signal. But don't make + * the process runnable, leave it stopped. + * XXXKSE should we wake ALL blocked threads? + */ + if (p->p_flag & P_KSES) { + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan && (td->td_flags & TDF_SINTR)){ + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); /* XXXKSE */ + } + } + } else { + if (td->td_wchan && td->td_flags & TDF_SINTR) { + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); /* XXXKSE */ + } + } + goto out; + + default: + /* + * SRUN, SIDL, SZOMB do nothing with the signal, + * other than kicking ourselves if we are running. + * It will either never be noticed, or noticed very soon. + */ + if (p->p_stat == SRUN) { +#ifdef SMP + struct kse *ke; + struct thread *td = curthread; +/* we should only deliver to one thread.. but which one? */ + FOREACH_KSEGRP_IN_PROC(p, kg) { + FOREACH_KSE_IN_GROUP(kg, ke) { + if (ke->ke_thread == td) { + continue; + } + forward_signal(ke->ke_thread); + } + } +#endif + } + goto out; + } + /*NOTREACHED*/ + +runfast: + /* + * Raise priority to at least PUSER. + * XXXKSE Should we make them all run fast? + * Maybe just one would be enough? + */ + + if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) { + FIRST_THREAD_IN_PROC(p)->td_priority = PUSER; + } +run: + /* If we jump here, sched_lock has to be owned. */ + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + setrunnable(td); /* XXXKSE */ +out: + mtx_unlock_spin(&sched_lock); + + /* Once we get here, sched_lock should not be owned. */ + mtx_assert(&sched_lock, MA_NOTOWNED); +} + +/* + * If the current process has received a signal (should be caught or cause + * termination, should interrupt current syscall), return the signal number. + * Stop signals with default action are processed immediately, then cleared; + * they aren't returned. This is checked after each entry to the system for + * a syscall or trap (though this can usually be done without calling issignal + * by checking the pending signal masks in cursig.) The normal call + * sequence is + * + * while (sig = cursig(curproc)) + * postsig(sig); + */ +int +issignal(p) + register struct proc *p; +{ + sigset_t mask; + register int sig, prop; + + PROC_LOCK_ASSERT(p, MA_OWNED); + for (;;) { + int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); + + mask = p->p_siglist; + SIGSETNAND(mask, p->p_sigmask); + if (p->p_flag & P_PPWAIT) + SIG_STOPSIGMASK(mask); + if (SIGISEMPTY(mask)) /* no signal to send */ + return (0); + sig = sig_ffs(&mask); + prop = sigprop(sig); + + _STOPEVENT(p, S_SIG, sig); + + /* + * We should see pending but ignored signals + * only if P_TRACED was on when they were posted. 
+ */ + if (SIGISMEMBER(p->p_sigignore, sig) && (traced == 0)) { + SIGDELSET(p->p_siglist, sig); + continue; + } + if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { + /* + * If traced, always stop. + */ + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + PROC_UNLOCK(p); + DROP_GIANT(); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); + PROC_LOCK(p); + + /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* + * If parent wants us to take the signal, + * then it will leave it in p->p_xstat; + * otherwise we just look for signals again. + */ + SIGDELSET(p->p_siglist, sig); /* clear old signal */ + sig = p->p_xstat; + if (sig == 0) + continue; + + /* + * Put the new signal into p_siglist. If the + * signal is being masked, look for other signals. + */ + SIGADDSET(p->p_siglist, sig); + if (SIGISMEMBER(p->p_sigmask, sig)) + continue; + } + + /* + * Decide whether the signal should be returned. + * Return the signal's number, or fall through + * to clear it from the pending mask. + */ + switch ((int)(intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { + + case (int)SIG_DFL: + /* + * Don't take default actions on system processes. + */ + if (p->p_pid <= 1) { +#ifdef DIAGNOSTIC + /* + * Are you sure you want to ignore SIGSEGV + * in init? XXX + */ + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, sig); +#endif + break; /* == ignore */ + } + /* + * If there is a pending stop signal to process + * with default action, stop here, + * then clear the signal. However, + * if process is member of an orphaned + * process group, ignore tty stop signals. + */ + if (prop & SA_STOP) { + if (p->p_flag & P_TRACED || + (p->p_pgrp->pg_jobc == 0 && + prop & SA_TTYSTOP)) + break; /* == ignore */ + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + PROC_UNLOCK(p); + DROP_GIANT(); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); + PROC_LOCK(p); + break; + } else if (prop & SA_IGNORE) { + /* + * Except for SIGCONT, shouldn't get here. + * Default action is to ignore; drop it. + */ + break; /* == ignore */ + } else + return (sig); + /*NOTREACHED*/ + + case (int)SIG_IGN: + /* + * Masking above should prevent us ever trying + * to take action on an ignored signal other + * than SIGCONT, unless process is traced. + */ + if ((prop & SA_CONT) == 0 && + (p->p_flag & P_TRACED) == 0) + printf("issignal\n"); + break; /* == ignore */ + + default: + /* + * This signal has an action, let + * postsig() process it. + */ + return (sig); + } + SIGDELSET(p->p_siglist, sig); /* take the signal! */ + } + /* NOTREACHED */ +} + +/* + * Put the argument process into the stopped state and notify the parent + * via wakeup. Signals are handled elsewhere. The process must not be + * on the run queue. Must be called with the proc p locked and the scheduler + * lock held. 
+ */ +static void +stop(p) + register struct proc *p; +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&sched_lock, MA_OWNED); + p->p_stat = SSTOP; + p->p_flag &= ~P_WAITED; + wakeup((caddr_t)p->p_pptr); +} + +/* + * Take the action for the specified signal + * from the current set of pending signals. + */ +void +postsig(sig) + register int sig; +{ + struct thread *td = curthread; + register struct proc *p = td->td_proc; + struct sigacts *ps; + sig_t action; + sigset_t returnmask; + int code; + + KASSERT(sig != 0, ("postsig")); + + PROC_LOCK_ASSERT(p, MA_OWNED); + ps = p->p_sigacts; + SIGDELSET(p->p_siglist, sig); + action = ps->ps_sigact[_SIG_IDX(sig)]; +#ifdef KTRACE + if (KTRPOINT(td, KTR_PSIG)) + ktrpsig(sig, action, p->p_flag & P_OLDMASK ? + &p->p_oldsigmask : &p->p_sigmask, 0); +#endif + _STOPEVENT(p, S_SIG, sig); + + if (action == SIG_DFL) { + /* + * Default action, where the default is to kill + * the process. (Other cases were ignored above.) + */ + sigexit(td, sig); + /* NOTREACHED */ + } else { + /* + * If we get here, the signal must be caught. + */ + KASSERT(action != SIG_IGN && !SIGISMEMBER(p->p_sigmask, sig), + ("postsig action")); + /* + * Set the new mask value and also defer further + * occurrences of this signal. + * + * Special case: user has done a sigsuspend. Here the + * current mask is not of interest, but rather the + * mask from before the sigsuspend is what we want + * restored after the signal processing is completed. + */ + if (p->p_flag & P_OLDMASK) { + returnmask = p->p_oldsigmask; + p->p_flag &= ~P_OLDMASK; + } else + returnmask = p->p_sigmask; + + SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); + if (!SIGISMEMBER(ps->ps_signodefer, sig)) + SIGADDSET(p->p_sigmask, sig); + + if (SIGISMEMBER(ps->ps_sigreset, sig)) { + /* + * See do_sigaction() for origin of this code. + */ + SIGDELSET(p->p_sigcatch, sig); + if (sig != SIGCONT && + sigprop(sig) & SA_IGNORE) + SIGADDSET(p->p_sigignore, sig); + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + p->p_stats->p_ru.ru_nsignals++; + if (p->p_sig != sig) { + code = 0; + } else { + code = p->p_code; + p->p_code = 0; + p->p_sig = 0; + } + (*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code); + } +} + +/* + * Kill the current process for stated reason. + */ +void +killproc(p, why) + struct proc *p; + char *why; +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_ucred ? p->p_ucred->cr_uid : -1, why); + psignal(p, SIGKILL); +} + +/* + * Force the current process to exit with the specified signal, dumping core + * if appropriate. We bypass the normal tests for masked and caught signals, + * allowing unrecoverable failures to terminate the process without changing + * signal state. Mark the accounting record with the signal termination. + * If dumping core, save the signal number for the debugger. Calls exit and + * does not return. + */ +void +sigexit(td, sig) + struct thread *td; + int sig; +{ + struct proc *p = td->td_proc; + + PROC_LOCK_ASSERT(p, MA_OWNED); + p->p_acflag |= AXSIG; + if (sigprop(sig) & SA_CORE) { + p->p_sig = sig; + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) 
+ * XXX : Todo, as well as euid, write out ruid too + */ + PROC_UNLOCK(p); + if (!mtx_owned(&Giant)) + mtx_lock(&Giant); + if (coredump(td) == 0) + sig |= WCOREFLAG; + if (kern_logsigexit) + log(LOG_INFO, + "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + td->td_ucred ? td->td_ucred->cr_uid : -1, + sig &~ WCOREFLAG, + sig & WCOREFLAG ? " (core dumped)" : ""); + } else { + PROC_UNLOCK(p); + if (!mtx_owned(&Giant)) + mtx_lock(&Giant); + } + exit1(td, W_EXITCODE(0, sig)); + /* NOTREACHED */ +} + +static char corefilename[MAXPATHLEN+1] = {"%N.core"}; +SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, + sizeof(corefilename), "process corefile name format string"); + +/* + * expand_name(name, uid, pid) + * Expand the name described in corefilename, using name, uid, and pid. + * corefilename is a printf-like string, with three format specifiers: + * %N name of process ("name") + * %P process id (pid) + * %U user id (uid) + * For example, "%N.core" is the default; they can be disabled completely + * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". + * This is controlled by the sysctl variable kern.corefile (see above). + */ + +static char * +expand_name(name, uid, pid) + const char *name; + uid_t uid; + pid_t pid; +{ + const char *format, *appendstr; + char *temp; + char buf[11]; /* Buffer for pid/uid -- max 4B */ + size_t i, l, n; + + format = corefilename; + temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); + if (temp == NULL) + return (NULL); + for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { + switch (format[i]) { + case '%': /* Format character */ + i++; + switch (format[i]) { + case '%': + appendstr = "%"; + break; + case 'N': /* process name */ + appendstr = name; + break; + case 'P': /* process id */ + sprintf(buf, "%u", pid); + appendstr = buf; + break; + case 'U': /* user id */ + sprintf(buf, "%u", uid); + appendstr = buf; + break; + default: + appendstr = ""; + log(LOG_ERR, + "Unknown format character %c in `%s'\n", + format[i], format); + } + l = strlen(appendstr); + if ((n + l) >= MAXPATHLEN) + goto toolong; + memcpy(temp + n, appendstr, l); + n += l; + break; + default: + temp[n++] = format[i]; + } + } + if (format[i] != '\0') + goto toolong; + return (temp); +toolong: + log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", + (long)pid, name, (u_long)uid); + free(temp, M_TEMP); + return (NULL); +} + +/* + * Dump a process' core. The main routine does some + * policy checking, and creates the name of the coredump; + * then it passes on a vnode and a size limit to the process-specific + * coredump routine if there is one; if there _is not_ one, it returns + * ENOSYS; otherwise it returns the error from the process-specific routine. + * + * XXX: VOP_GETATTR() here requires holding the vnode lock. + */ + +static int +coredump(struct thread *td) +{ + struct proc *p = td->td_proc; + register struct vnode *vp; + register struct ucred *cred = td->td_ucred; + struct flock lf; + struct nameidata nd; + struct vattr vattr; + int error, error1, flags; + struct mount *mp; + char *name; /* name of corefile */ + off_t limit; + + PROC_LOCK(p); + _STOPEVENT(p, S_CORE, 0); + + if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { + PROC_UNLOCK(p); + return (EFAULT); + } + + /* + * Note that the bulk of limit checking is done after + * the corefile is created. The exception is if the limit + * for corefiles is 0, in which case we don't bother + * creating the corefile at all. 
This layout means that + * a corefile is truncated instead of not being created, + * if it is larger than the limit. + */ + limit = p->p_rlimit[RLIMIT_CORE].rlim_cur; + if (limit == 0) { + PROC_UNLOCK(p); + return 0; + } + PROC_UNLOCK(p); + +restart: + name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EINVAL); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */ + flags = O_CREAT | FWRITE | O_NOFOLLOW; + error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + VOP_UNLOCK(vp, 0, td); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_WRLCK; + error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK); + if (error) + goto out2; + + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); + if ((error = vn_close(vp, FWRITE, cred, td)) != 0) + return (error); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { + error = EFAULT; + goto out1; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VOP_LEASE(vp, td, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + PROC_LOCK(p); + p->p_acflag |= ACORE; + PROC_UNLOCK(p); + + error = p->p_sysent->sv_coredump ? + p->p_sysent->sv_coredump(td, vp, limit) : + ENOSYS; + +out1: + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); + vn_finished_write(mp); +out2: + error1 = vn_close(vp, FWRITE, cred, td); + if (error == 0) + error = error1; + return (error); +} + +/* + * Nonexistent system call-- signal process (may want to handle it). + * Flag error in case process won't see signal immediately (blocked or ignored). + */ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nosys(td, args) + struct thread *td; + struct nosys_args *args; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + PROC_LOCK(p); + psignal(p, SIGSYS); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (ENOSYS); +} + +/* + * Send a SIGIO or SIGURG signal to a process or process group using + * stored credentials rather than those of the current process. 
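+ * A process typically requests this delivery along the lines of:
+ *	fcntl(fd, F_SETOWN, getpid());
+ *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
+ * after which I/O activity on fd arrives here carrying the credentials
+ * that were saved when F_SETOWN was issued.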
+ */ +void +pgsigio(sigiop, sig, checkctty) + struct sigio **sigiop; + int sig, checkctty; +{ + struct sigio *sigio; + + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + if (sigio->sio_pgid > 0) { + PROC_LOCK(sigio->sio_proc); + if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) + psignal(sigio->sio_proc, sig); + PROC_UNLOCK(sigio->sio_proc); + } else if (sigio->sio_pgid < 0) { + struct proc *p; + + PGRP_LOCK(sigio->sio_pgrp); + LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && + (checkctty == 0 || (p->p_flag & P_CONTROLT))) + psignal(p, sig); + PROC_UNLOCK(p); + } + PGRP_UNLOCK(sigio->sio_pgrp); + } + SIGIO_UNLOCK(); +} + +static int +filt_sigattach(struct knote *kn) +{ + struct proc *p = curproc; + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + PROC_LOCK(p); + SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); + PROC_UNLOCK(p); + + return (0); +} + +static void +filt_sigdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + PROC_LOCK(p); + SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); + PROC_UNLOCK(p); +} + +/* + * signal knotes are shared with proc knotes, so we apply a mask to + * the hint in order to differentiate them from process hints. This + * could be avoided by using a signal-specific knote list, but probably + * isn't worth the trouble. + */ +static int +filt_signal(struct knote *kn, long hint) +{ + + if (hint & NOTE_SIGNAL) { + hint &= ~NOTE_SIGNAL; + + if (kn->kn_id == hint) + kn->kn_data++; + } + return (kn->kn_data != 0); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c new file mode 100644 index 0000000..5e32eee --- /dev/null +++ b/sys/kern/kern_subr.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_zero.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> + +SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, + "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); + +#ifdef ZERO_COPY_SOCKETS +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/swap_pager.h> +#include <sys/mbuf.h> +#include <machine/cpu.h> + +/* Declared in uipc_socket.c */ +extern int so_zero_copy_receive; + +static int vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr, + vm_offset_t uaddr); +static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio, + struct vm_object *obj, int disposable); + +static int +vm_pgmoveco(mapa, srcobj, kaddr, uaddr) + vm_map_t mapa; + vm_object_t srcobj; + vm_offset_t kaddr, uaddr; +{ + vm_map_t map = mapa; + vm_page_t kern_pg, user_pg; + vm_object_t uobject; + vm_map_entry_t entry; + vm_pindex_t upindex, kpindex; + vm_prot_t prot; + boolean_t wired; + + /* + * First lookup the kernel page. + */ + kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr)); + + if ((vm_map_lookup(&map, uaddr, + VM_PROT_READ, &entry, &uobject, + &upindex, &prot, &wired)) != KERN_SUCCESS) { + return(EFAULT); + } + if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) { + vm_page_sleep_busy(user_pg, 1, "vm_pgmoveco"); + pmap_remove(map->pmap, uaddr, uaddr+PAGE_SIZE); + vm_page_busy(user_pg); + vm_page_free(user_pg); + } + + if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) || + (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) { + printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), " + "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex, + kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 
1 : 0, + kern_pg->hold_count, (u_long)kern_pg->phys_addr); + if ((kern_pg->queue - kern_pg->pc) == PQ_FREE) + panic("vm_pgmoveco: renaming free page"); + else + panic("vm_pgmoveco: renaming busy page"); + } + kpindex = kern_pg->pindex; + vm_page_busy(kern_pg); + vm_page_rename(kern_pg, uobject, upindex); + vm_page_flag_clear(kern_pg, PG_BUSY); + kern_pg->valid = VM_PAGE_BITS_ALL; + + vm_map_lookup_done(map, entry); + return(KERN_SUCCESS); +} +#endif /* ZERO_COPY_SOCKETS */ + +int +uiomove(cp, n, uio) + register caddr_t cp; + register int n; + register struct uio *uio; +{ + struct thread *td = curthread; + register struct iovec *iov; + u_int cnt; + int error = 0; + int save = 0; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomove: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + ("uiomove proc")); + + if (td) { + mtx_lock_spin(&sched_lock); + save = td->td_flags & TDF_DEADLKTREAT; + td->td_flags |= TDF_DEADLKTREAT; + mtx_unlock_spin(&sched_lock); + } + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + if (uio->uio_rw == UIO_READ) + error = copyout(cp, iov->iov_base, cnt); + else + error = copyin(iov->iov_base, cp, cnt); + if (error) + goto out; + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy(cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } +out: + if (td != curthread) printf("uiomove: IT CHANGED!"); + td = curthread; /* Might things have changed in copyin/copyout? */ + if (td) { + mtx_lock_spin(&sched_lock); + td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save; + mtx_unlock_spin(&sched_lock); + } + return (error); +} + +#if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS) +/* + * Experimental support for zero-copy I/O + */ +static int +userspaceco(cp, cnt, uio, obj, disposable) + caddr_t cp; + u_int cnt; + struct uio *uio; + struct vm_object *obj; + int disposable; +{ + struct iovec *iov; + int error; + + iov = uio->uio_iov; + +#ifdef ZERO_COPY_SOCKETS + + if (uio->uio_rw == UIO_READ) { + if ((so_zero_copy_receive != 0) + && (obj != NULL) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0) + && (obj->type == OBJT_DEFAULT) + && (disposable != 0)) { + /* SOCKET: use page-trading */ + /* + * We only want to call vm_pgmoveco() on + * disposeable pages, since it gives the + * kernel page to the userland process. + */ + error = vm_pgmoveco(&curproc->p_vmspace->vm_map, + obj, (vm_offset_t)cp, + (vm_offset_t)iov->iov_base); + + /* + * If we get an error back, attempt + * to use copyout() instead. The + * disposable page should be freed + * automatically if we weren't able to move + * it into userland. 
+ */ + if (error != 0) + error = copyout(cp, iov->iov_base, cnt); +#ifdef ENABLE_VFS_IOOPT + } else if ((vfs_ioopt != 0) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); +#endif /* ENABLE_VFS_IOOPT */ + } else { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } +#else /* ZERO_COPY_SOCKETS */ + if (uio->uio_rw == UIO_READ) { +#ifdef ENABLE_VFS_IOOPT + if ((vfs_ioopt != 0) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); + } else +#endif /* ENABLE_VFS_IOOPT */ + { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } +#endif /* ZERO_COPY_SOCKETS */ + + return (error); +} + +int +uiomoveco(cp, n, uio, obj, disposable) + caddr_t cp; + int n; + struct uio *uio; + struct vm_object *obj; + int disposable; +{ + struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomoveco: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + ("uiomoveco proc")); + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + + error = userspaceco(cp, cnt, uio, obj, disposable); + + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy(cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} +#endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */ + +#ifdef ENABLE_VFS_IOOPT + +/* + * Experimental support for zero-copy I/O + */ +int +uioread(n, uio, obj, nread) + int n; + struct uio *uio; + struct vm_object *obj; + int *nread; +{ + int npagesmoved; + struct iovec *iov; + u_int cnt, tcnt; + int error; + + *nread = 0; + if (vfs_ioopt < 2) + return 0; + + error = 0; + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + if ((uio->uio_segflg == UIO_USERSPACE) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) ) { + + if (cnt < PAGE_SIZE) + break; + + cnt &= ~PAGE_MASK; + + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, &npagesmoved); + + if (npagesmoved == 0) + break; + + tcnt = npagesmoved * PAGE_SIZE; + cnt = tcnt; + + if (error) + break; + + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + *nread += cnt; + n -= cnt; + } else { + break; + } + } + return error; +} +#endif /* ENABLE_VFS_IOOPT */ + +/* + * Give next character to user as result of read. 
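+ * Typically used by tty input code to hand characters to a pending
+ * read(2) one at a time.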
+ */ +int +ureadc(c, uio) + register int c; + register struct uio *uio; +{ + register struct iovec *iov; + +again: + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iovcnt--; + uio->uio_iov++; + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (subyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE: + *iov->iov_base = c; + break; + + case UIO_NOCOPY: + break; + } + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (0); +} + +/* + * General routine to allocate a hash table. + */ +void * +hashinit(elements, type, hashmask) + int elements; + struct malloc_type *type; + u_long *hashmask; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("hashinit: bad elements"); + for (hashsize = 1; hashsize <= elements; hashsize <<= 1) + continue; + hashsize >>= 1; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + return (hashtbl); +} + +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; +#define NPRIMES (sizeof(primes) / sizeof(primes[0])) + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements; + struct malloc_type *type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} + +void +uio_yield() +{ + struct thread *td; + + td = curthread; + mtx_lock_spin(&sched_lock); + DROP_GIANT(); + td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */ + setrunqueue(td); + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); +} + +int +copyinfrom(const void *src, void *dst, size_t len, int seg) +{ + int error = 0; + + switch (seg) { + case UIO_USERSPACE: + error = copyin(src, dst, len); + break; + case UIO_SYSSPACE: + bcopy(src, dst, len); + break; + default: + panic("copyinfrom: bad seg %d\n", seg); + } + return (error); +} + +int +copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg) +{ + int error = 0; + + switch (seg) { + case UIO_USERSPACE: + error = copyinstr(src, dst, len, copied); + break; + case UIO_SYSSPACE: + error = copystr(src, dst, len, copied); + break; + default: + panic("copyinstrfrom: bad seg %d\n", seg); + } + return (error); +} diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c new file mode 100644 index 0000000..2b531c0 --- /dev/null +++ b/sys/kern/kern_switch.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <machine/critical.h> + +CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); + +/* + * Global run queue. + */ +static struct runq runq; +SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) + +/* + * Wrappers which implement old interface; act on global run queue. + */ + +struct thread * +choosethread(void) +{ + return (runq_choose(&runq)->ke_thread); +} + +int +procrunnable(void) +{ + return runq_check(&runq); +} + +void +remrunqueue(struct thread *td) +{ + runq_remove(&runq, td->td_kse); +} + +void +setrunqueue(struct thread *td) +{ + runq_add(&runq, td->td_kse); +} + +/* Critical sections that prevent preemption. */ +void +critical_enter(void) +{ + struct thread *td; + + td = curthread; + if (td->td_critnest == 0) + cpu_critical_enter(); + td->td_critnest++; +} + +void +critical_exit(void) +{ + struct thread *td; + + td = curthread; + if (td->td_critnest == 1) { + td->td_critnest = 0; + cpu_critical_exit(); + } else { + td->td_critnest--; + } +} + +/* + * Clear the status bit of the queue corresponding to priority level pri, + * indicating that it is empty. + */ +static __inline void +runq_clrbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); +} + +/* + * Find the index of the first non-empty run queue. This is done by + * scanning the status bits, a set bit indicates a non-empty queue. + */ +static __inline int +runq_findbit(struct runq *rq) +{ + struct rqbits *rqb; + int pri; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); + CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", + rqb->rqb_bits[i], i, pri); + return (pri); + } + + return (-1); +} + +/* + * Set the status bit of the queue corresponding to priority level pri, + * indicating that it is non-empty. 
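+ * For example, with 32-bit status words, queue index 37 lives in
+ * word 37 >> 5 == 1 as bit 37 & 31 == 5; runq_findbit() reverses the
+ * mapping with RQB_FFS() when hunting for the lowest non-empty queue.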
+ */ +static __inline void +runq_setbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); +} + +/* + * Add the process to the queue specified by its priority, and set the + * corresponding status bit. + */ +void +runq_add(struct runq *rq, struct kse *ke) +{ + struct rqhead *rqh; + int pri; + +#ifdef INVARIANTS + struct proc *p = ke->ke_proc; +#endif + if (ke->ke_flags & KEF_ONRUNQ) + return; + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", + p, p->p_comm)); + pri = ke->ke_thread->td_priority / RQ_PPQ; + ke->ke_rqindex = pri; + runq_setbit(rq, pri); + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", + ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); + TAILQ_INSERT_TAIL(rqh, ke, ke_procq); + ke->ke_flags |= KEF_ONRUNQ; +} + +/* + * Return true if there are runnable processes of any priority on the run + * queue, false otherwise. Has no side effects, does not modify the run + * queue structure. + */ +int +runq_check(struct runq *rq) +{ + struct rqbits *rqb; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", + rqb->rqb_bits[i], i); + return (1); + } + CTR0(KTR_RUNQ, "runq_check: empty"); + + return (0); +} + +/* + * Find and remove the highest priority process from the run queue. + * If there are no runnable processes, the per-cpu idle process is + * returned. Will not return NULL under any circumstances. + */ +struct kse * +runq_choose(struct runq *rq) +{ + struct rqhead *rqh; + struct kse *ke; + int pri; + + mtx_assert(&sched_lock, MA_OWNED); + if ((pri = runq_findbit(rq)) != -1) { + rqh = &rq->rq_queues[pri]; + ke = TAILQ_FIRST(rqh); + KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); + KASSERT(ke->ke_proc->p_stat == SRUN, + ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid, + ke->ke_proc->p_comm, ke->ke_proc->p_stat)); + CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); + TAILQ_REMOVE(rqh, ke, ke_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_choose: empty"); + runq_clrbit(rq, pri); + } + ke->ke_flags &= ~KEF_ONRUNQ; + return (ke); + } + CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); + + return (PCPU_GET(idlethread)->td_kse); +} + +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + bzero(rq, sizeof *rq); + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + +/* + * Remove the process from the queue specified by its priority, and clear the + * corresponding status bit if the queue becomes empty. 
+ */ +void +runq_remove(struct runq *rq, struct kse *ke) +{ + struct rqhead *rqh; + int pri; + + if (!(ke->ke_flags & KEF_ONRUNQ)) + return; + mtx_assert(&sched_lock, MA_OWNED); + pri = ke->ke_rqindex; + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", + ke, ke->ke_thread->td_priority, pri, rqh); + KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); + TAILQ_REMOVE(rqh, ke, ke_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_remove: empty"); + runq_clrbit(rq, pri); + } + ke->ke_flags &= ~KEF_ONRUNQ; +} diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c new file mode 100644 index 0000000..2f69a00 --- /dev/null +++ b/sys/kern/kern_sx.c @@ -0,0 +1,348 @@ +/* + * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Shared/exclusive locks. This implementation assures deterministic lock + * granting behavior, so that slocks and xlocks are interleaved. + * + * Priority propagation will not generally raise the priority of lock holders, + * so should not be relied upon in combination with sx locks. 
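/*
 * Sketch (not part of the committed code): typical kernel-context use of the
 * sx primitives implemented in this file.  Readers take the lock shared, the
 * single writer takes it exclusive.  The names example_lock/example_data are
 * invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>

static struct sx example_lock;
static int example_data;

static void
example_setup(void)
{
        sx_init(&example_lock, "example");
}

static int
example_read(void)
{
        int v;

        sx_slock(&example_lock);        /* many readers may hold this */
        v = example_data;
        sx_sunlock(&example_lock);
        return (v);
}

static void
example_write(int v)
{
        sx_xlock(&example_lock);        /* exactly one writer at a time */
        example_data = v;
        sx_xunlock(&example_lock);
}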
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> + +struct lock_class lock_class_sx = { + "sx", + LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE +}; + +#ifndef INVARIANTS +#define _sx_assert(sx, what, file, line) +#endif + +void +sx_sysinit(void *arg) +{ + struct sx_args *sargs = arg; + + sx_init(sargs->sa_sx, sargs->sa_desc); +} + +void +sx_init(struct sx *sx, const char *description) +{ + struct lock_object *lock; + + lock = &sx->sx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("sx lock %s %p already initialized", description, sx)); + bzero(sx, sizeof(*sx)); + lock->lo_class = &lock_class_sx; + lock->lo_type = lock->lo_name = description; + lock->lo_flags = LO_WITNESS | LO_RECURSABLE | LO_SLEEPABLE | + LO_UPGRADABLE; + sx->sx_lock = mtx_pool_find(sx); + sx->sx_cnt = 0; + cv_init(&sx->sx_shrd_cv, description); + sx->sx_shrd_wcnt = 0; + cv_init(&sx->sx_excl_cv, description); + sx->sx_excl_wcnt = 0; + sx->sx_xholder = NULL; + + LOCK_LOG_INIT(lock, 0); + + WITNESS_INIT(lock); +} + +void +sx_destroy(struct sx *sx) +{ + + LOCK_LOG_DESTROY(&sx->sx_object, 0); + + KASSERT((sx->sx_cnt == 0 && sx->sx_shrd_wcnt == 0 && sx->sx_excl_wcnt == + 0), ("%s (%s): holders or waiters\n", __func__, + sx->sx_object.lo_name)); + + sx->sx_lock = NULL; + cv_destroy(&sx->sx_shrd_cv); + cv_destroy(&sx->sx_excl_cv); + + WITNESS_DESTROY(&sx->sx_object); +} + +void +_sx_slock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + KASSERT(sx->sx_xholder != curthread, + ("%s (%s): slock while xlock is held @ %s:%d\n", __func__, + sx->sx_object.lo_name, file, line)); + + /* + * Loop in case we lose the race for lock acquisition. + */ + while (sx->sx_cnt < 0) { + sx->sx_shrd_wcnt++; + cv_wait(&sx->sx_shrd_cv, sx->sx_lock); + sx->sx_shrd_wcnt--; + } + + /* Acquire a shared lock. */ + sx->sx_cnt++; + + LOCK_LOG_LOCK("SLOCK", &sx->sx_object, 0, 0, file, line); + WITNESS_LOCK(&sx->sx_object, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_slock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + if (sx->sx_cnt >= 0) { + sx->sx_cnt++; + LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 1, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_TRYLOCK, file, line); + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_xlock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + + /* + * With sx locks, we're absolutely not permitted to recurse on + * xlocks, as it is fatal (deadlock). Normally, recursion is handled + * by WITNESS, but as it is not semantically correct to hold the + * xlock while in here, we consider it API abuse and put it under + * INVARIANTS. + */ + KASSERT(sx->sx_xholder != curthread, + ("%s (%s): xlock already held @ %s:%d", __func__, + sx->sx_object.lo_name, file, line)); + + /* Loop in case we lose the race for lock acquisition. */ + while (sx->sx_cnt != 0) { + sx->sx_excl_wcnt++; + cv_wait(&sx->sx_excl_cv, sx->sx_lock); + sx->sx_excl_wcnt--; + } + + MPASS(sx->sx_cnt == 0); + + /* Acquire an exclusive lock. 
*/ + sx->sx_cnt--; + sx->sx_xholder = curthread; + + LOCK_LOG_LOCK("XLOCK", &sx->sx_object, 0, 0, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_xlock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + if (sx->sx_cnt == 0) { + sx->sx_cnt--; + sx->sx_xholder = curthread; + LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 1, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, + line); + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_sunlock(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_SLOCKED, file, line); + mtx_lock(sx->sx_lock); + + WITNESS_UNLOCK(&sx->sx_object, 0, file, line); + + /* Release. */ + sx->sx_cnt--; + + /* + * If we just released the last shared lock, wake any waiters up, giving + * exclusive lockers precedence. In order to make sure that exclusive + * lockers won't be blocked forever, don't wake shared lock waiters if + * there are exclusive lock waiters. + */ + if (sx->sx_excl_wcnt > 0) { + if (sx->sx_cnt == 0) + cv_signal(&sx->sx_excl_cv); + } else if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + + LOCK_LOG_LOCK("SUNLOCK", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +void +_sx_xunlock(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_XLOCKED, file, line); + mtx_lock(sx->sx_lock); + MPASS(sx->sx_cnt == -1); + + WITNESS_UNLOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line); + + /* Release. */ + sx->sx_cnt++; + sx->sx_xholder = NULL; + + /* + * Wake up waiters if there are any. Give precedence to slock waiters. + */ + if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + else if (sx->sx_excl_wcnt > 0) + cv_signal(&sx->sx_excl_cv); + + LOCK_LOG_LOCK("XUNLOCK", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_upgrade(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_SLOCKED, file, line); + mtx_lock(sx->sx_lock); + + if (sx->sx_cnt == 1) { + sx->sx_cnt = -1; + sx->sx_xholder = curthread; + + LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 1, file, line); + WITNESS_UPGRADE(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_downgrade(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_XLOCKED, file, line); + mtx_lock(sx->sx_lock); + MPASS(sx->sx_cnt == -1); + + WITNESS_DOWNGRADE(&sx->sx_object, 0, file, line); + + sx->sx_cnt = 1; + sx->sx_xholder = NULL; + if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + + LOCK_LOG_LOCK("XDOWNGRADE", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +#ifdef INVARIANT_SUPPORT +#ifndef INVARIANTS +#undef _sx_assert +#endif + +/* + * In the non-WITNESS case, sx_assert() can only detect that at least + * *some* thread owns an slock, but it cannot guarantee that *this* + * thread owns an slock. 
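/*
 * Illustration (not part of the committed code): the sx_cnt encoding that
 * the functions above and _sx_assert() below depend on -- a positive count
 * is the number of shared holders, -1 means one exclusive holder, 0 means
 * the lock is free.  This stand-alone model only walks the transitions; it
 * has none of the real blocking or mutex protection.
 */
#include <stdio.h>

static int cnt;         /* models sx->sx_cnt */

int
main(void)
{
        cnt++;                          /* slock: 1 shared holder  */
        cnt++;                          /* slock: 2 shared holders */
        printf("shared holders: %d, upgrade possible: %s\n",
            cnt, cnt == 1 ? "yes" : "no");      /* no: another reader */
        cnt--;                          /* sunlock */
        printf("shared holders: %d, upgrade possible: %s\n",
            cnt, cnt == 1 ? "yes" : "no");      /* yes: sole reader */
        cnt = -1;                       /* sx_try_upgrade() succeeded */
        printf("exclusive held: %s\n", cnt == -1 ? "yes" : "no");
        cnt = 1;                        /* sx_downgrade() back to shared */
        cnt--;                          /* sunlock: lock is free again */
        printf("free: %s\n", cnt == 0 ? "yes" : "no");
        return (0);
}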
+ */ +void +_sx_assert(struct sx *sx, int what, const char *file, int line) +{ + + switch (what) { + case SX_LOCKED: + case SX_SLOCKED: +#ifdef WITNESS + witness_assert(&sx->sx_object, what, file, line); +#else + mtx_lock(sx->sx_lock); + if (sx->sx_cnt <= 0 && + (what == SX_SLOCKED || sx->sx_xholder != curthread)) + printf("Lock %s not %slocked @ %s:%d\n", + sx->sx_object.lo_name, (what == SX_SLOCKED) ? + "share " : "", file, line); + mtx_unlock(sx->sx_lock); +#endif + break; + case SX_XLOCKED: + mtx_lock(sx->sx_lock); + if (sx->sx_xholder != curthread) + printf("Lock %s not exclusively locked @ %s:%d\n", + sx->sx_object.lo_name, file, line); + mtx_unlock(sx->sx_lock); + break; + default: + panic("Unknown sx lock assertion: %d @ %s:%d", what, file, + line); + } +} +#endif /* INVARIANT_SUPPORT */ diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c new file mode 100644 index 0000000..6f9adad --- /dev/null +++ b/sys/kern/kern_synch.c @@ -0,0 +1,970 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $FreeBSD$ + */ + +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/condvar.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/smp.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <machine/cpu.h> + +static void sched_setup(void *dummy); +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) + +int hogticks; +int lbolt; +int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ + +static struct callout loadav_callout; +static struct callout schedcpu_callout; +static struct callout roundrobin_callout; + +struct loadavg averunnable = + { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ +/* + * Constants for averages over 1, 5, and 15 minutes + * when sampling at 5 second intervals. + */ +static fixpt_t cexp[3] = { + 0.9200444146293232 * FSCALE, /* exp(-1/12) */ + 0.9834714538216174 * FSCALE, /* exp(-1/60) */ + 0.9944598480048967 * FSCALE, /* exp(-1/180) */ +}; + +static void endtsleep(void *); +static void loadav(void *arg); +static void roundrobin(void *arg); +static void schedcpu(void *arg); + +static int +sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) +{ + int error, new_val; + + new_val = sched_quantum * tick; + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (new_val < tick) + return (EINVAL); + sched_quantum = new_val / tick; + hogticks = 2 * sched_quantum; + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof sched_quantum, sysctl_kern_quantum, "I", + "Roundrobin scheduling quantum in microseconds"); + +/* + * Arrange to reschedule if necessary, taking the priorities and + * schedulers into account. + */ +void +maybe_resched(struct thread *td) +{ + + mtx_assert(&sched_lock, MA_OWNED); + if (td->td_priority < curthread->td_priority) + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; +} + +int +roundrobin_interval(void) +{ + return (sched_quantum); +} + +/* + * Force switch among equal priority processes every 100ms. + * We don't actually need to force a context switch of the current process. + * The act of firing the event triggers a context switch to softclock() and + * then switching back out again which is equivalent to a preemption, thus + * no further work is needed on the local CPU. + */ +/* ARGSUSED */ +static void +roundrobin(arg) + void *arg; +{ + +#ifdef SMP + mtx_lock_spin(&sched_lock); + forward_roundrobin(); + mtx_unlock_spin(&sched_lock); +#endif + + callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); +} + +/* + * Constants for digital decay and forget: + * 90% of (p_estcpu) usage in 5 * loadav time + * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) + * Note that, as ps(1) mentions, this can let percentages + * total over 100% (I've seen 137.9% for 3 processes). + * + * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. + * + * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
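/*
 * Stand-alone check (not part of the committed code): the derivation
 * continues below; this userland sketch just reproduces the "actual power
 * values" table it arrives at.  For decay = b/(b+1) with b = 2*loadav, it
 * prints how many applications of the decay are needed to forget 90% of
 * p_estcpu, which should be close to 5*loadav.  Compile with -lm.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
        int loadav;
        double b, power;

        for (loadav = 1; loadav <= 4; loadav++) {
                b = 2.0 * loadav;
                power = log(0.1) / log(b / (b + 1.0));
                printf("loadav %d: power %.2f (5*loadav = %d)\n",
                    loadav, power, 5 * loadav);
        }
        /* Output matches the table below: 5.68, 10.32, 14.94, 19.55. */
        return (0);
}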
+ * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED + * + * Actual power values for the implemented algorithm are as follows: + * loadav: 1 2 3 4 + * power: 5.68 10.32 14.94 19.55 + */ + +/* calculations for digital decay to forget 90% of usage in 5*loadav sec */ +#define loadfactor(loadav) (2 * (loadav)) +#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) + +/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + +/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ +static int fscale __unused = FSCALE; +SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); + +/* + * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the + * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below + * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). + * + * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: + * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). + * + * If you don't want to bother with the faster/more-accurate formula, you + * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate + * (more general) method of calculating the %age of CPU used by a process. + */ +#define CCPU_SHIFT 11 + +/* + * Recompute process priorities, every hz ticks. + * MP-safe, called without the Giant mutex. + */ +/* ARGSUSED */ +static void +schedcpu(arg) + void *arg; +{ + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + struct thread *td; + struct proc *p; + struct kse *ke; + struct ksegrp *kg; + int realstathz; + int awake; + + realstathz = stathz ? 
stathz : hz; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + mtx_lock_spin(&sched_lock); + p->p_swtime++; + FOREACH_KSEGRP_IN_PROC(p, kg) { + awake = 0; + FOREACH_KSE_IN_GROUP(kg, ke) { + /* + * Increment time in/out of memory and sleep + * time (if sleeping). We ignore overflow; + * with 16-bit int's (remember them?) + * overflow takes 45 days. + */ + /* XXXKSE */ + /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */ + if (p->p_stat == SSLEEP || p->p_stat == SSTOP) { + ke->ke_slptime++; + } else { + ke->ke_slptime = 0; + awake = 1; + } + + /* + * pctcpu is only for ps? + * Do it per kse.. and add them up at the end? + * XXXKSE + */ + ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; + /* + * If the kse has been idle the entire second, + * stop recalculating its priority until + * it wakes up. + */ + if (ke->ke_slptime > 1) { + continue; + } + +#if (FSHIFT >= CCPU_SHIFT) + ke->ke_pctcpu += (realstathz == 100) ? + ((fixpt_t) ke->ke_cpticks) << + (FSHIFT - CCPU_SHIFT) : + 100 * (((fixpt_t) ke->ke_cpticks) << + (FSHIFT - CCPU_SHIFT)) / realstathz; +#else + ke->ke_pctcpu += ((FSCALE - ccpu) * + (ke->ke_cpticks * FSCALE / realstathz)) >> + FSHIFT; +#endif + ke->ke_cpticks = 0; + } /* end of kse loop */ + if (awake == 0) { + kg->kg_slptime++; + } else { + kg->kg_slptime = 0; + } + kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); + resetpriority(kg); + td = FIRST_THREAD_IN_PROC(p); + if (td->td_priority >= PUSER && + (p->p_sflag & PS_INMEM)) { + int changedqueue = + ((td->td_priority / RQ_PPQ) != + (kg->kg_user_pri / RQ_PPQ)); + + td->td_priority = kg->kg_user_pri; + FOREACH_KSE_IN_GROUP(kg, ke) { + if ((ke->ke_oncpu == NOCPU) && + (p->p_stat == SRUN) && /* XXXKSE */ + changedqueue) { + remrunqueue(ke->ke_thread); + setrunqueue(ke->ke_thread); + } + } + } + } /* end of ksegrp loop */ + mtx_unlock_spin(&sched_lock); + } /* end of process loop */ + sx_sunlock(&allproc_lock); + wakeup((caddr_t)&lbolt); + callout_reset(&schedcpu_callout, hz, schedcpu, NULL); +} + +/* + * Recalculate the priority of a process after it has slept for a while. + * For all load averages >= 1 and max p_estcpu of 255, sleeping for at + * least six times the loadfactor will decay p_estcpu to zero. + */ +void +updatepri(td) + register struct thread *td; +{ + register struct ksegrp *kg; + register unsigned int newcpu; + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + + if (td == NULL) + return; + kg = td->td_ksegrp; + newcpu = kg->kg_estcpu; + if (kg->kg_slptime > 5 * loadfac) + kg->kg_estcpu = 0; + else { + kg->kg_slptime--; /* the first time was done in schedcpu */ + while (newcpu && --kg->kg_slptime) + newcpu = decay_cpu(loadfac, newcpu); + kg->kg_estcpu = newcpu; + } + resetpriority(td->td_ksegrp); +} + +/* + * We're only looking at 7 bits of the address; everything is + * aligned to 4, lots of things are aligned to greater powers + * of 2. Shift right by 8, i.e. drop the bottom 256 worth. + */ +#define TABLESIZE 128 +static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; +#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) + +void +sleepinit(void) +{ + int i; + + sched_quantum = hz/10; + hogticks = 2 * sched_quantum; + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + +/* + * General sleep call. Suspends the current process until a wakeup is + * performed on the specified identifier. The process will then be made + * runnable with the specified priority. Sleeps at most timo/hz seconds + * (0 means no timeout). 
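/*
 * Illustration (not part of the committed code): the slpque hash defined
 * above.  LOOKUP() drops the low 8 address bits of the wait channel (the
 * channel is normally the address of a kernel object, so the low bits carry
 * little information) and keeps the next 7 bits as the bucket index, since
 * TABLESIZE is 128.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_TABLESIZE 128
#define SK_LOOKUP(x) (((intptr_t)(x) >> 8) & (SK_TABLESIZE - 1))

int
main(void)
{
        static int objects[4];
        int i;

        /* Nearby wait channels tend to collapse into the same bucket. */
        for (i = 0; i < 4; i++)
                printf("wchan %p -> bucket %ld\n",
                    (void *)&objects[i], (long)SK_LOOKUP(&objects[i]));
        return (0);
}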
If pri includes PCATCH flag, signals are checked + * before and after sleeping, else signals are not checked. Returns 0 if + * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a + * signal needs to be delivered, ERESTART is returned if the current system + * call should be restarted if possible, and EINTR is returned if the system + * call should be interrupted by the signal (return EINTR). + * + * The mutex argument is exited before the caller is suspended, and + * entered before msleep returns. If priority includes the PDROP + * flag the mutex is not entered before returning. + */ +int +msleep(ident, mtx, priority, wmesg, timo) + void *ident; + struct mtx *mtx; + int priority, timo; + const char *wmesg; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + int sig, catch = priority & PCATCH; + int rval = 0; + WITNESS_SAVE_DECL(mtx); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + WITNESS_SLEEP(0, &mtx->mtx_object); + KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, + ("sleeping without a mutex")); + mtx_lock_spin(&sched_lock); + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, + * just give interrupts a chance, then just return; + * don't run any other procs or panic below, + * in case this is the idle process and already asleep. + */ + if (mtx != NULL && priority & PDROP) + mtx_unlock(mtx); + mtx_unlock_spin(&sched_lock); + return (0); + } + + DROP_GIANT(); + + if (mtx != NULL) { + mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); + WITNESS_SAVE(&mtx->mtx_object, mtx); + mtx_unlock(mtx); + if (priority & PDROP) + mtx = NULL; + } + + KASSERT(p != NULL, ("msleep1")); + KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep")); + + td->td_wchan = ident; + td->td_wmesg = wmesg; + td->td_kse->ke_slptime = 0; /* XXXKSE */ + td->td_ksegrp->kg_slptime = 0; + td->td_priority = priority & PRIMASK; + CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", + td, p->p_pid, p->p_comm, wmesg, ident); + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); + if (timo) + callout_reset(&td->td_slpcallout, timo, endtsleep, td); + /* + * We put ourselves on the sleep queue and start our timeout + * before calling cursig, as we could stop there, and a wakeup + * or a SIGCONT (or both) could occur while we were stopped. + * A SIGCONT would cause us to be marked as SSLEEP + * without resuming us, thus we must be ready for sleep + * when cursig is called. If the wakeup happens while we're + * stopped, td->td_wchan will be 0 upon return from cursig. + */ + if (catch) { + CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p, + p->p_pid, p->p_comm); + td->td_flags |= TDF_SINTR; + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + sig = cursig(p); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (sig != 0) { + if (td->td_wchan != NULL) + unsleep(td); + } else if (td->td_wchan == NULL) + catch = 0; + } else + sig = 0; + if (td->td_wchan != NULL) { + td->td_proc->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + } + CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid, + p->p_comm); + KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN")); + td->td_flags &= ~TDF_SINTR; + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + if (sig == 0) + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (timo && callout_stop(&td->td_slpcallout) == 0) { + /* + * This isn't supposed to be pretty. 
If we are here, then + * the endtsleep() callout is currently executing on another + * CPU and is either spinning on the sched_lock or will be + * soon. If we don't synchronize here, there is a chance + * that this process may msleep() again before the callout + * has a chance to run and the callout may end up waking up + * the wrong msleep(). Yuck. + */ + td->td_flags |= TDF_TIMEOUT; + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + mtx_unlock_spin(&sched_lock); + + if (rval == 0 && catch) { + PROC_LOCK(p); + /* XXX: shouldn't we always be calling cursig() */ + if (sig != 0 || (sig = cursig(p))) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (mtx != NULL) { + mtx_lock(mtx); + WITNESS_RESTORE(&mtx->mtx_object, mtx); + } + return (rval); +} + +/* + * Implement timeout for msleep() + * + * If process hasn't been awakened (wchan non-zero), + * set timeout flag and undo the sleep. If proc + * is stopped, just unsleep so it will remain stopped. + * MP-safe, called without the Giant mutex. + */ +static void +endtsleep(arg) + void *arg; +{ + register struct thread *td = arg; + + CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + /* + * This is the other half of the synchronization with msleep() + * described above. If the PS_TIMEOUT flag is set, we lost the + * race and just need to put the process back on the runqueue. + */ + if ((td->td_flags & TDF_TIMEOUT) != 0) { + td->td_flags &= ~TDF_TIMEOUT; + setrunqueue(td); + } else if (td->td_wchan != NULL) { + if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + setrunnable(td); + else + unsleep(td); + td->td_flags |= TDF_TIMEOUT; + } else { + td->td_flags |= TDF_TIMOFAIL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Remove a process from its wait queue + */ +void +unsleep(struct thread *td) +{ + + mtx_lock_spin(&sched_lock); + if (td->td_wchan != NULL) { + TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); + td->td_wchan = NULL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Make all processes sleeping on the specified identifier runnable. + */ +void +wakeup(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct thread *td; + struct thread *ntd; + struct proc *p; + + mtx_lock_spin(&sched_lock); + qp = &slpque[LOOKUP(ident)]; +restart: + for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { + ntd = TAILQ_NEXT(td, td_slpq); + p = td->td_proc; + if (td->td_wchan == ident) { + TAILQ_REMOVE(qp, td, td_slpq); + td->td_wchan = NULL; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)", + td, p->p_pid, p->p_comm); + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + td->td_proc->p_stat = SRUN; + if (p->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + } else { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target process is currently + * swapped out. 
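/*
 * Sketch (not part of the committed code): the usual way msleep() and
 * wakeup() above are paired in kernel code.  The condition is re-tested in
 * a loop because a wakeup on the channel only means "look again"; msleep()
 * drops the mutex while asleep and retakes it before returning.  The names
 * example_mtx/example_ready are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

static struct mtx example_mtx;
static int example_ready;

static void
example_wait(void)
{
        mtx_lock(&example_mtx);
        while (example_ready == 0)
                msleep(&example_ready, &example_mtx, PZERO, "exwait", 0);
        example_ready = 0;
        mtx_unlock(&example_mtx);
}

static void
example_post(void)
{
        mtx_lock(&example_mtx);
        example_ready = 1;
        wakeup(&example_ready);
        mtx_unlock(&example_mtx);
}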
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct thread *td; + register struct proc *p; + struct thread *ntd; + + mtx_lock_spin(&sched_lock); + qp = &slpque[LOOKUP(ident)]; +restart: + for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { + ntd = TAILQ_NEXT(td, td_slpq); + p = td->td_proc; + if (td->td_wchan == ident) { + TAILQ_REMOVE(qp, td, td_slpq); + td->td_wchan = NULL; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + td->td_proc->p_stat = SRUN; + if (p->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + break; + } else { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * The machine independent parts of mi_switch(). + */ +void +mi_switch() +{ + struct bintime new_switchtime; + struct thread *td = curthread; /* XXX */ + register struct proc *p = td->td_proc; /* XXX */ +#if 0 + register struct rlimit *rlim; +#endif + u_int sched_nest; + + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); +#ifdef INVARIANTS + if (p->p_stat != SMTX && p->p_stat != SRUN) + mtx_assert(&Giant, MA_NOTOWNED); +#endif + + /* + * Compute the amount of time during which the current + * process was running, and add that to its total so far. + */ + binuptime(&new_switchtime); + bintime_add(&p->p_runtime, &new_switchtime); + bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); + +#ifdef DDB + /* + * Don't perform context switches from the debugger. + */ + if (db_active) { + mtx_unlock_spin(&sched_lock); + db_error("Context switches not allowed in the debugger."); + } +#endif + +#if 0 + /* + * Check if the process exceeds its cpu resource allocation. + * If over max, kill it. + * + * XXX drop sched_lock, pickup Giant + */ + if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + p->p_runtime > p->p_limit->p_cpulimit) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + killproc(p, "exceeded maximum CPU limit"); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + } else { + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + psignal(p, SIGXCPU); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (rlim->rlim_cur < rlim->rlim_max) { + /* XXX: we should make a private copy */ + rlim->rlim_cur += 5; + } + } + } +#endif + + /* + * Pick a new current process and record its start time. + */ + cnt.v_swtch++; + PCPU_SET(switchtime, new_switchtime); + CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + sched_nest = sched_lock.mtx_recurse; + td->td_lastcpu = td->td_kse->ke_oncpu; + td->td_kse->ke_oncpu = NOCPU; + td->td_kse->ke_flags &= ~KEF_NEEDRESCHED; + cpu_switch(); + td->td_kse->ke_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; + CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + if (PCPU_GET(switchtime.sec) == 0) + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); +} + +/* + * Change process state to be runnable, + * placing it on the run queue if it is in memory, + * and awakening the swapper if it isn't in memory. 
+ */ +void +setrunnable(struct thread *td) +{ + struct proc *p = td->td_proc; + + mtx_lock_spin(&sched_lock); + switch (p->p_stat) { + case SZOMB: /* not a thread flag XXXKSE */ + panic("setrunnable(1)"); + } + switch (td->td_proc->p_stat) { + case 0: + case SRUN: + case SWAIT: + default: + panic("setrunnable(2)"); + case SSTOP: + case SSLEEP: /* e.g. when sending signals */ + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); + break; + + case SIDL: + break; + } + td->td_proc->p_stat = SRUN; + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + if ((p->p_sflag & PS_INMEM) == 0) { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } else { + setrunqueue(td); + maybe_resched(td); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Compute the priority of a process when running in user mode. + * Arrange to reschedule if the resulting priority is better + * than that of the current process. + */ +void +resetpriority(kg) + register struct ksegrp *kg; +{ + register unsigned int newpriority; + struct thread *td; + + mtx_lock_spin(&sched_lock); + if (kg->kg_pri_class == PRI_TIMESHARE) { + newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + kg->kg_user_pri = newpriority; + } + FOREACH_THREAD_IN_GROUP(kg, td) { + maybe_resched(td); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Compute a tenex style load average of a quantity on + * 1, 5 and 15 minute intervals. + * XXXKSE Needs complete rewrite when correct info is available. + * Completely Bogus.. only works with 1:1 (but compiles ok now :-) + */ +static void +loadav(void *arg) +{ + int i, nrun; + struct loadavg *avg; + struct proc *p; + struct ksegrp *kg; + + avg = &averunnable; + sx_slock(&allproc_lock); + nrun = 0; + FOREACH_PROC_IN_SYSTEM(p) { + FOREACH_KSEGRP_IN_PROC(p, kg) { + switch (p->p_stat) { + case SRUN: + if ((p->p_flag & P_NOLOAD) != 0) + goto nextproc; + /* FALLTHROUGH */ + case SIDL: + nrun++; + } +nextproc: + continue; + } + } + sx_sunlock(&allproc_lock); + for (i = 0; i < 3; i++) + avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; + + /* + * Schedule the next update to occur after 5 seconds, but add a + * random variation to avoid synchronisation with processes that + * run at regular intervals. + */ + callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), + loadav, NULL); +} + +/* ARGSUSED */ +static void +sched_setup(dummy) + void *dummy; +{ + + callout_init(&schedcpu_callout, 1); + callout_init(&roundrobin_callout, 0); + callout_init(&loadav_callout, 0); + + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); + loadav(NULL); +} + +/* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. resetpriority() will + * compute a different priority each time p_estcpu increases by + * INVERSE_ESTCPU_WEIGHT + * (until MAXPRI is reached). The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principle is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. 
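/*
 * Illustration (not part of the committed code): the fixed-point
 * load-average update used by loadav() above, run in userland.  FSHIFT is
 * taken as 11 (FSCALE 2048) for the sketch; the real values come from
 * <sys/param.h>.  With nrun held at 2, the 1-minute average climbs toward
 * 2.0 over successive ~5 second samples.
 */
#include <stdio.h>

#define SK_FSHIFT 11
#define SK_FSCALE (1 << SK_FSHIFT)

int
main(void)
{
        /* exp(-1/12) * FSCALE: the 1-minute constant, as in cexp[0]. */
        unsigned long cexp0 = 0.9200444146293232 * SK_FSCALE;
        unsigned long ldavg = 0;        /* fixed-point running average */
        int nrun = 2, i;

        for (i = 1; i <= 24; i++) {
                ldavg = (cexp0 * ldavg +
                    nrun * SK_FSCALE * (SK_FSCALE - cexp0)) >> SK_FSHIFT;
                if (i % 6 == 0)
                        printf("after %2d samples: %.2f\n",
                            i, (double)ldavg / SK_FSCALE);
        }
        return (0);
}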
This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ +void +schedclock(td) + struct thread *td; +{ + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + + if (td) { + ke->ke_cpticks++; + kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); + if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { + resetpriority(td->td_ksegrp); + if (td->td_priority >= PUSER) + td->td_priority = kg->kg_user_pri; + } + } else { + panic("schedclock"); + } +} + +/* + * General purpose yield system call + */ +int +yield(struct thread *td, struct yield_args *uap) +{ + struct ksegrp *kg = td->td_ksegrp; + + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + td->td_priority = PRI_MAX_TIMESHARE; + setrunqueue(td); + kg->kg_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + td->td_retval[0] = 0; + + return (0); +} + diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c new file mode 100644 index 0000000..2867bc9 --- /dev/null +++ b/sys/kern/kern_syscalls.c @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 1999 Assar Westerlund + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. 
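/*
 * Sketch (not part of the committed code): how a loadable module would use
 * the syscall_register() logic shown below to claim one of the lkmnosys
 * placeholder slots.  The handler name, the sysent initializer and the
 * assumed two-field struct sysent layout are illustrative only; the real
 * definition is in <sys/sysent.h>.
 */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>

static int
example_syscall(struct thread *td, void *uap)
{
        td->td_retval[0] = 42;          /* value returned to userland */
        return (0);
}

static struct sysent example_sysent = {
        0,                              /* sy_narg: no arguments */
        (sy_call_t *)example_syscall    /* sy_call */
};

static struct sysent example_old_sysent;
static int example_offset = NO_SYSCALL; /* let the kernel pick a free slot */

static int
example_attach(void)
{
        /* On success, example_offset holds the allocated syscall number. */
        return (syscall_register(&example_offset, &example_sysent,
            &example_old_sysent));
}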
+ */ +int +lkmnosys(struct thread *td, struct nosys_args *args) +{ + return(nosys(td, args)); +} + +int +lkmressys(struct thread *td, struct nosys_args *args) +{ + return(nosys(td, args)); +} + +int +syscall_register(int *offset, struct sysent *new_sysent, + struct sysent *old_sysent) +{ + if (*offset == NO_SYSCALL) { + int i; + + for (i = 1; i < SYS_MAXSYSCALL; ++i) + if (sysent[i].sy_call == (sy_call_t *)lkmnosys) + break; + if (i == SYS_MAXSYSCALL) + return ENFILE; + *offset = i; + } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) + return EINVAL; + else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys && + sysent[*offset].sy_call != (sy_call_t *)lkmressys) + return EEXIST; + + *old_sysent = sysent[*offset]; + sysent[*offset] = *new_sysent; + return 0; +} + +int +syscall_deregister(int *offset, struct sysent *old_sysent) +{ + if (*offset) + sysent[*offset] = *old_sysent; + return 0; +} + +int +syscall_module_handler(struct module *mod, int what, void *arg) +{ + struct syscall_module_data *data = (struct syscall_module_data*)arg; + modspecific_t ms; + int error; + + switch (what) { + case MOD_LOAD : + error = syscall_register(data->offset, data->new_sysent, + &data->old_sysent); + if (error) + return error; + ms.intval = *data->offset; + MOD_XLOCK; + module_setspecific(mod, &ms); + MOD_XUNLOCK; + if (data->chainevh) + error = data->chainevh(mod, what, data->chainarg); + return error; + + case MOD_UNLOAD : + if (data->chainevh) { + error = data->chainevh(mod, what, data->chainarg); + if (error) + return error; + } + error = syscall_deregister(data->offset, &data->old_sysent); + return error; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 0000000..6943bc5 --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,1422 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/sysproto.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); +static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); + +/* + * Locking - this locks the sysctl tree in memory. + */ +static struct sx sysctllock; + +#define SYSCTL_LOCK() sx_xlock(&sysctllock) +#define SYSCTL_UNLOCK() sx_xunlock(&sysctllock) +#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl sysctllock") + +static int sysctl_root(SYSCTL_HANDLER_ARGS); + +struct sysctl_oid_list sysctl__children; /* root list */ + +static struct sysctl_oid * +sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) +{ + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, list, oid_link) { + if (strcmp(oidp->oid_name, name) == 0) { + return (oidp); + } + } + return (NULL); +} + +/* + * Initialization of the MIB tree. + * + * Order by number in each list. + */ + +void +sysctl_register_oid(struct sysctl_oid *oidp) +{ + struct sysctl_oid_list *parent = oidp->oid_parent; + struct sysctl_oid *p; + struct sysctl_oid *q; + + /* + * First check if another oid with the same name already + * exists in the parent's list. + */ + p = sysctl_find_oidname(oidp->oid_name, parent); + if (p != NULL) { + if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + p->oid_refcnt++; + return; + } else { + printf("can't re-use a leaf (%s)!\n", p->oid_name); + return; + } + } + /* + * If this oid has a number OID_AUTO, give it a number which + * is greater than any current oid. + * NOTE: DO NOT change the starting value here, change it in + * <sys/sysctl.h>, and make sure it is at least 256 to + * accomodate e.g. net.inet.raw as a static sysctl node. + */ + if (oidp->oid_number == OID_AUTO) { + static int newoid = CTL_AUTO_START; + + oidp->oid_number = newoid++; + if (newoid == 0x7fffffff) + panic("out of oids"); + } +#if 0 + else if (oidp->oid_number >= CTL_AUTO_START) { + /* do not panic; this happens when unregistering sysctl sets */ + printf("static sysctl oid too high: %d", oidp->oid_number); + } +#endif + + /* + * Insert the oid into the parent's list in order. + */ + q = NULL; + SLIST_FOREACH(p, parent, oid_link) { + if (oidp->oid_number < p->oid_number) + break; + q = p; + } + if (q) + SLIST_INSERT_AFTER(q, oidp, oid_link); + else + SLIST_INSERT_HEAD(parent, oidp, oid_link); +} + +void +sysctl_unregister_oid(struct sysctl_oid *oidp) +{ + SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); +} + +/* Initialize a new context to keep track of dynamically added sysctls. 
*/ +int +sysctl_ctx_init(struct sysctl_ctx_list *c) +{ + + if (c == NULL) { + return (EINVAL); + } + TAILQ_INIT(c); + return (0); +} + +/* Free the context, and destroy all dynamic oids registered in this context */ +int +sysctl_ctx_free(struct sysctl_ctx_list *clist) +{ + struct sysctl_ctx_entry *e, *e1; + int error; + + error = 0; + /* + * First perform a "dry run" to check if it's ok to remove oids. + * XXX FIXME + * XXX This algorithm is a hack. But I don't know any + * XXX better solution for now... + */ + TAILQ_FOREACH(e, clist, link) { + error = sysctl_remove_oid(e->entry, 0, 0); + if (error) + break; + } + /* + * Restore deregistered entries, either from the end, + * or from the place where error occured. + * e contains the entry that was not unregistered + */ + if (error) + e1 = TAILQ_PREV(e, sysctl_ctx_list, link); + else + e1 = TAILQ_LAST(clist, sysctl_ctx_list); + while (e1 != NULL) { + sysctl_register_oid(e1->entry); + e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); + } + if (error) + return(EBUSY); + /* Now really delete the entries */ + e = TAILQ_FIRST(clist); + while (e != NULL) { + e1 = TAILQ_NEXT(e, link); + error = sysctl_remove_oid(e->entry, 1, 0); + if (error) + panic("sysctl_remove_oid: corrupt tree, entry: %s", + e->entry->oid_name); + free(e, M_SYSCTLOID); + e = e1; + } + return (error); +} + +/* Add an entry to the context */ +struct sysctl_ctx_entry * +sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return(NULL); + e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); + e->entry = oidp; + TAILQ_INSERT_HEAD(clist, e, link); + return (e); +} + +/* Find an entry in the context */ +struct sysctl_ctx_entry * +sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return(NULL); + TAILQ_FOREACH(e, clist, link) { + if(e->entry == oidp) + return(e); + } + return (e); +} + +/* + * Delete an entry from the context. + * NOTE: this function doesn't free oidp! You have to remove it + * with sysctl_remove_oid(). + */ +int +sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return (EINVAL); + e = sysctl_ctx_entry_find(clist, oidp); + if (e != NULL) { + TAILQ_REMOVE(clist, e, link); + free(e, M_SYSCTLOID); + return (0); + } else + return (ENOENT); +} + +/* + * Remove dynamically created sysctl trees. + * oidp - top of the tree to be removed + * del - if 0 - just deregister, otherwise free up entries as well + * recurse - if != 0 traverse the subtree to be deleted + */ +int +sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) +{ + struct sysctl_oid *p; + int error; + + if (oidp == NULL) + return(EINVAL); + if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { + printf("can't remove non-dynamic nodes!\n"); + return (EINVAL); + } + /* + * WARNING: normal method to do this should be through + * sysctl_ctx_free(). Use recursing as the last resort + * method to purge your sysctl tree of leftovers... + * However, if some other code still references these nodes, + * it will panic. 
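/*
 * Sketch (not part of the committed code): the intended life cycle of the
 * context API above together with sysctl_add_oid() defined later in this
 * file.  A driver creates its oids through a context so that a single
 * sysctl_ctx_free() call unwinds them all on detach.  The node and leaf
 * names are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static struct sysctl_ctx_list example_ctx;
static int example_value;

static void
example_attach(void)
{
        struct sysctl_oid *node;

        sysctl_ctx_init(&example_ctx);
        node = sysctl_add_oid(&example_ctx, &sysctl__children, OID_AUTO,
            "example", CTLTYPE_NODE | CTLFLAG_RW, 0, 0, 0, "N",
            "example subtree");
        if (node == NULL)
                return;
        sysctl_add_oid(&example_ctx, SYSCTL_CHILDREN(node), OID_AUTO,
            "value", CTLTYPE_INT | CTLFLAG_RW, &example_value, 0,
            sysctl_handle_int, "I", "an integer knob");
}

static void
example_detach(void)
{
        /* Deregisters and frees every oid added through example_ctx. */
        sysctl_ctx_free(&example_ctx);
}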
+ */ + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if (oidp->oid_refcnt == 1) { + SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { + if (!recurse) + return (ENOTEMPTY); + error = sysctl_remove_oid(p, del, recurse); + if (error) + return (error); + } + if (del) + free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID); + } + } + if (oidp->oid_refcnt > 1 ) { + oidp->oid_refcnt--; + } else { + if (oidp->oid_refcnt == 0) { + printf("Warning: bad oid_refcnt=%u (%s)!\n", + oidp->oid_refcnt, oidp->oid_name); + return (EINVAL); + } + sysctl_unregister_oid(oidp); + if (del) { + if (oidp->descr) + free(oidp->descr, M_SYSCTLOID); + free((void *)(uintptr_t)(const void *)oidp->oid_name, + M_SYSCTLOID); + free(oidp, M_SYSCTLOID); + } + } + return (0); +} + +/* + * Create new sysctls at run time. + * clist may point to a valid context initialized with sysctl_ctx_init(). + */ +struct sysctl_oid * +sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, + int number, const char *name, int kind, void *arg1, int arg2, + int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) +{ + struct sysctl_oid *oidp; + ssize_t len; + char *newname; + + /* You have to hook up somewhere.. */ + if (parent == NULL) + return(NULL); + /* Check if the node already exists, otherwise create it */ + oidp = sysctl_find_oidname(name, parent); + if (oidp != NULL) { + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + oidp->oid_refcnt++; + /* Update the context */ + if (clist != NULL) + sysctl_ctx_entry_add(clist, oidp); + return (oidp); + } else { + printf("can't re-use a leaf (%s)!\n", name); + return (NULL); + } + } + oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); + oidp->oid_parent = parent; + SLIST_NEXT(oidp, oid_link) = NULL; + oidp->oid_number = number; + oidp->oid_refcnt = 1; + len = strlen(name); + newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); + bcopy(name, newname, len + 1); + newname[len] = '\0'; + oidp->oid_name = newname; + oidp->oid_handler = handler; + oidp->oid_kind = CTLFLAG_DYN | kind; + if ((kind & CTLTYPE) == CTLTYPE_NODE) { + /* Allocate space for children */ + SYSCTL_CHILDREN(oidp) = malloc(sizeof(struct sysctl_oid_list), + M_SYSCTLOID, M_WAITOK); + SLIST_INIT(SYSCTL_CHILDREN(oidp)); + } else { + oidp->oid_arg1 = arg1; + oidp->oid_arg2 = arg2; + } + oidp->oid_fmt = fmt; + if (descr) { + int len = strlen(descr) + 1; + oidp->descr = malloc(len, M_SYSCTLOID, M_WAITOK); + if (oidp->descr) + strcpy(oidp->descr, descr); + } + /* Update the context, if used */ + if (clist != NULL) + sysctl_ctx_entry_add(clist, oidp); + /* Register this oid */ + sysctl_register_oid(oidp); + return (oidp); +} + +/* + * Register the kernel's oids on startup. + */ +SET_DECLARE(sysctl_set, struct sysctl_oid); + +static void +sysctl_register_all(void *arg) +{ + struct sysctl_oid **oidp; + + SYSCTL_INIT(); + SET_FOREACH(oidp, sysctl_set) + sysctl_register_oid(*oidp); +} +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0); + +/* + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." 
OID. + * {0,2,...} return the next OID. + * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. + * {0,5,...} return the description the "..." OID. + */ + +static void +sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) +{ + int k; + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, l, oid_link) { + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", oidp->oid_number, oidp->oid_name); + + printf("%c%c", + oidp->oid_kind & CTLFLAG_RD ? 'R':' ', + oidp->oid_kind & CTLFLAG_WR ? 'W':' '); + + if (oidp->oid_handler) + printf(" *Handler"); + + switch (oidp->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!oidp->oid_handler) { + sysctl_sysctl_debug_dump_node( + oidp->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + + } +} + +static int +sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = suser(req->td); + if (error) + return error; + sysctl_sysctl_debug_dump_node(&sysctl__children, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int error = 0; + struct sysctl_oid *oid; + struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; + char buf[10]; + + while (namelen) { + if (!lsp) { + snprintf(buf,sizeof(buf),"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + lsp2 = 0; + SLIST_FOREACH(oid, lsp, oid_link) { + if (oid->oid_number != *name) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, oid->oid_name, + strlen(oid->oid_name)); + if (error) + return (error); + + namelen--; + name++; + + if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if (oid->oid_handler) + break; + + lsp2 = (struct sysctl_oid_list *)oid->oid_arg1; + break; + } + lsp = lsp2; + } + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidpp) +{ + struct sysctl_oid *oidp; + + *len = level; + SLIST_FOREACH(oidp, lsp, oid_link) { + *next = oidp->oid_number; + *oidpp = oidp; + + if (!namelen) { + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if (oidp->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, + len, level+1, oidpp)) + return 0; + goto next; + } + + if (oidp->oid_number < *name) + continue; + + if (oidp->oid_number > *name) { + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if (oidp->oid_handler) + return 0; + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, + next+1, len, level+1, oidpp)) + return (0); + goto next; + } + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if (oidp->oid_handler) + continue; + + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, 
next+1, + len, level+1, oidpp)) + return (0); + next: + namelen = 1; + *len = level; + } + return 1; +} + +static int +sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct sysctl_oid_list *lsp = &sysctl__children; + int newoid[CTL_MAXNAME]; + + i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); + return (error); +} + +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) +{ + int i; + struct sysctl_oid *oidp; + struct sysctl_oid_list *lsp = &sysctl__children; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + oidp = SLIST_FIRST(lsp); + + while (oidp && *len < CTL_MAXNAME) { + if (strcmp(name, oidp->oid_name)) { + oidp = SLIST_NEXT(oidp, oid_link); + continue; + } + *oid++ = oidp->oid_number; + (*len)++; + + if (!i) { + if (oidpp) + *oidpp = oidp; + return (0); + } + + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if (oidp->oid_handler) + break; + + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + oidp = SLIST_FIRST(lsp); + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) +{ + char *p; + int error, oid[CTL_MAXNAME], len; + struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ + return (ENAMETOOLONG); + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error; + + error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); + if (error) + return (error); + + if (!oid->oid_fmt) + return (ENOENT); + error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); + if (error) + return (error); + error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +static int +sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error; + + error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); + if (error) + return (error); + + if (!oid->descr) + return (ENOENT); + error = SYSCTL_OUT(req, oid->descr, strlen(oid->descr) + 1); + return (error); +} + +SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); + +/* + * Default "handler" functions. + */ + +/* + * Handle an int, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
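/*
 * Sketch (not part of the committed code): a private handler written
 * against the same pattern as the default handlers below and as
 * sysctl_kern_quantum() earlier in this commit -- export the current value,
 * and only parse and validate a new one when the request carries one.  The
 * variable and oid names are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int example_percent = 50;

static int
sysctl_example_percent(SYSCTL_HANDLER_ARGS)
{
        int error, val;

        val = example_percent;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);         /* read-only access stops here */
        if (val < 0 || val > 100)
                return (EINVAL);        /* reject out-of-range writes */
        example_percent = val;
        return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, example_percent, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_example_percent, "I", "example bounded integer knob");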
+ */ + +int +sysctl_handle_int(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} + +/* + * Handle a long, signed or unsigned. arg1 points to it. + */ + +int +sysctl_handle_long(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + + if (!arg1) + return (EINVAL); + error = SYSCTL_OUT(req, arg1, sizeof(long)); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, sizeof(long)); + return (error); +} + +/* + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. + */ + +int +sysctl_handle_string(SYSCTL_HANDLER_ARGS) +{ + int error=0; + + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} + +/* + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. + */ + +int +sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} + +/* + * Transfer functions to/from kernel space. + * XXX: rather untested at this point + */ +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) +{ + size_t i = 0; + + if (req->oldptr) { + i = l; + if (req->oldlen <= req->oldidx) + i = 0; + else + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, + size_t *oldlenp, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.td = td; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (new != NULL) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + SYSCTL_LOCK(); + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen); + + SYSCTL_UNLOCK(); + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +int +kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, + void *new, size_t newlen, size_t *retval) +{ + int oid[CTL_MAXNAME]; + size_t oidlen, plen; + int error; + + oid[0] = 0; /* sysctl internal magic */ + oid[1] = 3; /* name2oid */ + oidlen = sizeof(oid); + + error = kernel_sysctl(td, oid, 2, oid, &oidlen, + (void *)name, strlen(name), &plen); + if (error) + return (error); + + error = 
kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, + new, newlen, retval); + return (error); +} + +/* + * Transfer function to/from user space. + */ +static int +sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) +{ + int error = 0; + size_t i = 0; + + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = l; + if (req->oldlen <= req->oldidx) + i = 0; + else + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, size_t l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (error); +} + +int +sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, + int *nindx, struct sysctl_req *req) +{ + struct sysctl_oid *oid; + int indx; + + oid = SLIST_FIRST(&sysctl__children); + indx = 0; + while (oid && indx < CTL_MAXNAME) { + if (oid->oid_number == name[indx]) { + indx++; + if (oid->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if (oid->oid_handler != NULL || + indx == namelen) { + *noid = oid; + if (nindx != NULL) + *nindx = indx; + return (0); + } + oid = SLIST_FIRST( + (struct sysctl_oid_list *)oid->oid_arg1); + } else if (indx == namelen) { + *noid = oid; + if (nindx != NULL) + *nindx = indx; + return (0); + } else { + return (ENOTDIR); + } + } else { + oid = SLIST_NEXT(oid, oid_link); + } + } + return (ENOENT); +} + +/* + * Traverse our tree, and find the right node, execute whatever it points + * to, and return the resulting error code. + */ + +int +sysctl_root(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error, indx; + + error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); + if (error) + return (error); + + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + /* + * You can't call a sysctl when it's a node, but has + * no handler. Inform the user that it's a node. + * The indx may or may not be the same as namelen. + */ + if (oid->oid_handler == NULL) + return (EISDIR); + } + + /* Is this sysctl writable? */ + if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) + return (EPERM); + + KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); + + /* Is this sysctl sensitive to securelevels? */ + if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { + error = securelevel_gt(req->td->td_ucred, 0); + if (error) + return (error); + } + + /* Is this sysctl writable by only privileged users? 
*/ + if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { + int flags; + + if (oid->oid_kind & CTLFLAG_PRISON) + flags = PRISON_ROOT; + else + flags = 0; + error = suser_cred(req->td->td_ucred, flags); + if (error) + return (error); + } + + if (!oid->oid_handler) + return EINVAL; + + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) + error = oid->oid_handler(oid, (int *)arg1 + indx, arg2 - indx, + req); + else + error = oid->oid_handler(oid, oid->oid_arg1, oid->oid_arg2, + req); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif + +/* + * MPSAFE + */ +int +__sysctl(struct thread *td, struct sysctl_args *uap) +{ + int error, name[CTL_MAXNAME]; + size_t j; + + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) + return (EINVAL); + + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); + + mtx_lock(&Giant); + + error = userland_sysctl(td, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + goto done2; + if (uap->oldlenp) { + int i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + error = i; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. + */ +int +userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, + size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req, req2; + + bzero(&req, sizeof req); + + req.td = td; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) + return (error); + } + } + + if (old) { + if (!useracc(old, req.oldlen, VM_PROT_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if (new != NULL) { + if (!useracc(new, req.newlen, VM_PROT_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + SYSCTL_LOCK(); + + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); + + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen); + + SYSCTL_UNLOCK(); + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +#ifdef COMPAT_43 +#include <sys/socket.h> +#include <vm/vm_param.h> + +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) + +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... 
-Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ + + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! */ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + size_t *size; + int arg; +}; +#endif + +/* + * MPSAFE + */ +int +ogetkerninfo(struct thread *td, struct getkerninfo_args *uap) +{ + int error, name[6]; + size_t size; + u_int needed = 0; + + mtx_lock(&Giant); + + switch (uap->op & 0xff00) { + + case KINFO_RT: + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(td, name, 6, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_VNODE: + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_PROC: + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(td, name, 4, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_FILE: + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_METER: + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_LOADAVG: + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_CLOCKRATE: + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * *size gives the size of the buffer before the call, and + * the amount of data copied after a successful call. + * If successful, the return value is the amount of data + * available, which can be larger than *size. + * + * BSDI's 2.x product apparently fails with ENOMEM if *size + * is too small. 
+ */ + + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if ((uap->where == NULL) || (uap->size == NULL)) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + if ((error = copyin(uap->size, &size, sizeof(size))) != 0) + break; + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? */ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + + default: + error = EOPNOTSUPP; + break; + } + if (error == 0) { + td->td_retval[0] = needed ? needed : size; + if (uap->size) { + error = copyout((caddr_t)&size, (caddr_t)uap->size, + sizeof(size)); + } + } + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..fabc204 --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,684 @@ +/*- + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/timepps.h> +#include <sys/timetc.h> +#include <sys/timex.h> + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * time services. + */ + +static u_int +dummy_get_timecount(struct timecounter *tc) +{ + static u_int now; + + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, 0, ~0u, 1000000, "dummy", +}; + +struct timehands { + /* These fields must be initialized by the driver. */ + struct timecounter *th_counter; + int64_t th_adjustment; + u_int64_t th_scale; + u_int th_offset_count; + struct bintime th_offset; + struct timeval th_microtime; + struct timespec th_nanotime; + /* Fields not to be copied in tc_windup start with th_generation. 
*/ + volatile u_int th_generation; + struct timehands *th_next; +}; + +extern struct timehands th0; +static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; +static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; +static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; +static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; +static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; +static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; +static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; +static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; +static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; +static struct timehands th0 = { + &dummy_timecounter, + 0, + (uint64_t)-1 / 1000000, + 0, + {1, 0}, + {0, 0}, + {0, 0}, + 1, + &th1 +}; + +static struct timehands *volatile timehands = &th0; +struct timecounter *timecounter = &dummy_timecounter; +static struct timecounter *timecounters = &dummy_timecounter; + +time_t time_second = 1; + +static struct bintime boottimebin; +struct timeval boottime; +SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD, + &boottime, timeval, "System boottime"); + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +#define TC_STATS(foo) \ + static u_int foo; \ + SYSCTL_UINT(_kern_timecounter, OID_AUTO, foo, CTLFLAG_RD, &foo, 0, "") \ + struct __hack + +TC_STATS(nbinuptime); TC_STATS(nnanouptime); TC_STATS(nmicrouptime); +TC_STATS(nbintime); TC_STATS(nnanotime); TC_STATS(nmicrotime); +TC_STATS(ngetbinuptime); TC_STATS(ngetnanouptime); TC_STATS(ngetmicrouptime); +TC_STATS(ngetbintime); TC_STATS(ngetnanotime); TC_STATS(ngetmicrotime); + +#undef TC_STATS + +static void tc_windup(void); + +/* + * Return the difference between the timehands' counter value now and what + * was when we copied it to the timehands' offset_count. + */ +static __inline u_int +tc_delta(struct timehands *th) +{ + struct timecounter *tc; + + tc = th->th_counter; + return ((tc->tc_get_timecount(tc) - th->th_offset_count) & + tc->tc_counter_mask); +} + +/* + * Functions for reading the time. We have to loop until we are sure that + * the timehands that we operated on was not updated under our feet. See + * the comment in <sys/time.h> for a description of these 12 functions. 
+ */ + +void +binuptime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + nbinuptime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + bintime_addx(bt, th->th_scale * tc_delta(th)); + } while (gen == 0 || gen != th->th_generation); +} + +void +nanouptime(struct timespec *tsp) +{ + struct bintime bt; + + nnanouptime++; + binuptime(&bt); + bintime2timespec(&bt, tsp); +} + +void +microuptime(struct timeval *tvp) +{ + struct bintime bt; + + nmicrouptime++; + binuptime(&bt); + bintime2timeval(&bt, tvp); +} + +void +bintime(struct bintime *bt) +{ + + nbintime++; + binuptime(bt); + bintime_add(bt, &boottimebin); +} + +void +nanotime(struct timespec *tsp) +{ + struct bintime bt; + + nnanotime++; + bintime(&bt); + bintime2timespec(&bt, tsp); +} + +void +microtime(struct timeval *tvp) +{ + struct bintime bt; + + nmicrotime++; + bintime(&bt); + bintime2timeval(&bt, tvp); +} + +void +getbinuptime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + ngetbinuptime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + } while (gen == 0 || gen != th->th_generation); +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timehands *th; + u_int gen; + + ngetnanouptime++; + do { + th = timehands; + gen = th->th_generation; + bintime2timespec(&th->th_offset, tsp); + } while (gen == 0 || gen != th->th_generation); +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timehands *th; + u_int gen; + + ngetmicrouptime++; + do { + th = timehands; + gen = th->th_generation; + bintime2timeval(&th->th_offset, tvp); + } while (gen == 0 || gen != th->th_generation); +} + +void +getbintime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + ngetbintime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + } while (gen == 0 || gen != th->th_generation); + bintime_add(bt, &boottimebin); +} + +void +getnanotime(struct timespec *tsp) +{ + struct timehands *th; + u_int gen; + + ngetnanotime++; + do { + th = timehands; + gen = th->th_generation; + *tsp = th->th_nanotime; + } while (gen == 0 || gen != th->th_generation); +} + +void +getmicrotime(struct timeval *tvp) +{ + struct timehands *th; + u_int gen; + + ngetmicrotime++; + do { + th = timehands; + gen = th->th_generation; + *tvp = th->th_microtime; + } while (gen == 0 || gen != th->th_generation); +} + +/* + * Initialize a new timecounter. + * We should really try to rank the timecounters and intelligently determine + * if the new timecounter is better than the current one. This is subject + * to further study. For now always use the new timecounter. + */ +void +tc_init(struct timecounter *tc) +{ + + tc->tc_next = timecounters; + timecounters = tc; + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + (void)tc->tc_get_timecount(tc); + (void)tc->tc_get_timecount(tc); + timecounter = tc; +} + +/* Report the frequency of the current timecounter. */ +u_int32_t +tc_getfrequency(void) +{ + + return (timehands->th_counter->tc_frequency); +} + +/* + * Step our concept of GMT. This is done by modifying our estimate of + * when we booted. XXX: needs futher work. + */ +void +tc_setclock(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + /* XXX boottime should probably be a timespec. 
*/ + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + timeval2bintime(&boottime, &boottimebin); + + /* XXX fiddle all the little crinkly bits around the fiords... */ + tc_windup(); +} + +/* + * Initialize the next struct timehands in the ring and make + * it the active timehands. Along the way we might switch to a different + * timecounter and/or do seconds processing in NTP. Slightly magic. + */ +static void +tc_windup(void) +{ + struct bintime bt; + struct timehands *th, *tho; + u_int64_t scale; + u_int delta, ncount, ogen; + int i; + + /* + * Make the next timehands a copy of the current one, but do not + * overwrite the generation or next pointer. While we update + * the contents, the generation must be zero. + */ + tho = timehands; + th = tho->th_next; + ogen = th->th_generation; + th->th_generation = 0; + bcopy(tho, th, offsetof(struct timehands, th_generation)); + + /* + * Capture a timecounter delta on the current timecounter and if + * changing timecounters, a counter value from the new timecounter. + * Update the offset fields accordingly. + */ + delta = tc_delta(th); + if (th->th_counter != timecounter) + ncount = timecounter->tc_get_timecount(timecounter); + else + ncount = 0; + th->th_offset_count += delta; + th->th_offset_count &= th->th_counter->tc_counter_mask; + bintime_addx(&th->th_offset, th->th_scale * delta); + + /* + * Hardware latching timecounters may not generate interrupts on + * PPS events, so instead we poll them. There is a finite risk that + * the hardware might capture a count which is later than the one we + * got above, and therefore possibly in the next NTP second which might + * have a different rate than the current NTP second. It doesn't + * matter in practice. + */ + if (tho->th_counter->tc_poll_pps) + tho->th_counter->tc_poll_pps(tho->th_counter); + + /* + * Deal with NTP second processing. The for loop normally only + * iterates once, but in extreme situations it might keep NTP sane + * if timeouts are not run for several seconds. + */ + for (i = th->th_offset.sec - tho->th_offset.sec; i > 0; i--) + ntp_update_second(&th->th_adjustment, &th->th_offset.sec); + + /* Now is a good time to change timecounters. */ + if (th->th_counter != timecounter) { + th->th_counter = timecounter; + th->th_offset_count = ncount; + } + + /*- + * Recalculate the scaling factor. We want the number of 1/2^64 + * fractions of a second per period of the hardware counter, taking + * into account the th_adjustment factor which the NTP PLL/adjtime(2) + * processing provides us with. + * + * The th_adjustment is nanoseconds per second with 32 bit binary + * fraction and want 64 bit binary fraction of second: + * + * x = a * 2^32 / 10^9 = a * 4.294967296 + * + * The range of th_adjustment is +/- 5000PPM so inside a 64bit int + * we can only multiply by about 850 without overflowing, but that + * leaves suitably precise fractions for multiply before divide. + * + * Divide before multiply with a fraction of 2199/512 results in a + * systematic undercompensation of 10PPM of th_adjustment. On a + * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. + * + * We happily sacrifice the lowest of the 64 bits of our result + * to the goddess of code clarity. + * + */ + scale = (u_int64_t)1 << 63; + scale += (th->th_adjustment / 1024) * 2199; + scale /= th->th_counter->tc_frequency; + th->th_scale = scale * 2; + + /* Update the GMT timestamps used for the get*() functions. 
*/ + bt = th->th_offset; + bintime_add(&bt, &boottimebin); + bintime2timeval(&bt, &th->th_microtime); + bintime2timespec(&bt, &th->th_nanotime); + + /* + * Now that the struct timehands is again consistent, set the new + * generation number, making sure to not make it zero. + */ + if (++ogen == 0) + ogen = 1; + th->th_generation = ogen; + + /* Go live with the new struct timehands. */ + time_second = th->th_microtime.tv_sec; + timehands = th; +} + +/* Report or change the active timecounter hardware. */ +static int +sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) +{ + char newname[32]; + struct timecounter *newtc, *tc; + int error; + + tc = timecounter; + strncpy(newname, tc->tc_name, sizeof(newname)); + newname[sizeof(newname) - 1] = '\0'; + error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); + if (error != 0 || req->newptr == NULL || + strcmp(newname, tc->tc_name) == 0) + return (error); + for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { + if (strcmp(newname, newtc->tc_name) != 0) + continue; + + /* Warm up new timecounter. */ + (void)newtc->tc_get_timecount(newtc); + (void)newtc->tc_get_timecount(newtc); + + timecounter = newtc; + return (0); + } + return (EINVAL); +} + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW, + 0, 0, sysctl_kern_timecounter_hardware, "A", ""); + +/* + * RFC 2783 PPS-API implementation. + */ + +int +pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) +{ + pps_params_t *app; + struct pps_fetch_args *fapi; +#ifdef PPS_SYNC + struct pps_kcbind_args *kapi; +#endif + + switch (cmd) { + case PPS_IOC_CREATE: + return (0); + case PPS_IOC_DESTROY: + return (0); + case PPS_IOC_SETPARAMS: + app = (pps_params_t *)data; + if (app->mode & ~pps->ppscap) + return (EINVAL); + pps->ppsparam = *app; + return (0); + case PPS_IOC_GETPARAMS: + app = (pps_params_t *)data; + *app = pps->ppsparam; + app->api_version = PPS_API_VERS_1; + return (0); + case PPS_IOC_GETCAP: + *(int*)data = pps->ppscap; + return (0); + case PPS_IOC_FETCH: + fapi = (struct pps_fetch_args *)data; + if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) + return (EINVAL); + if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) + return (EOPNOTSUPP); + pps->ppsinfo.current_mode = pps->ppsparam.mode; + fapi->pps_info_buf = pps->ppsinfo; + return (0); + case PPS_IOC_KCBIND: +#ifdef PPS_SYNC + kapi = (struct pps_kcbind_args *)data; + /* XXX Only root should be able to do this */ + if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) + return (EINVAL); + if (kapi->kernel_consumer != PPS_KC_HARDPPS) + return (EINVAL); + if (kapi->edge & ~pps->ppscap) + return (EINVAL); + pps->kcmode = kapi->edge; + return (0); +#else + return (EOPNOTSUPP); +#endif + default: + return (ENOTTY); + } +} + +void +pps_init(struct pps_state *pps) +{ + pps->ppscap |= PPS_TSFMT_TSPEC; + if (pps->ppscap & PPS_CAPTUREASSERT) + pps->ppscap |= PPS_OFFSETASSERT; + if (pps->ppscap & PPS_CAPTURECLEAR) + pps->ppscap |= PPS_OFFSETCLEAR; +} + +void +pps_capture(struct pps_state *pps) +{ + struct timehands *th; + + th = timehands; + pps->capgen = th->th_generation; + pps->capth = th; + pps->capcount = th->th_counter->tc_get_timecount(th->th_counter); + if (pps->capgen != th->th_generation) + pps->capgen = 0; +} + +void +pps_event(struct pps_state *pps, int event) +{ + struct bintime bt; + struct timespec ts, *tsp, *osp; + u_int tcount, *pcount; + int foff, fhard; + pps_seq_t *pseq; + + /* If the timecounter was wound up underneath us, bail out. 
*/ + if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation) + return; + + /* Things would be easier with arrays. */ + if (event == PPS_CAPTUREASSERT) { + tsp = &pps->ppsinfo.assert_timestamp; + osp = &pps->ppsparam.assert_offset; + foff = pps->ppsparam.mode & PPS_OFFSETASSERT; + fhard = pps->kcmode & PPS_CAPTUREASSERT; + pcount = &pps->ppscount[0]; + pseq = &pps->ppsinfo.assert_sequence; + } else { + tsp = &pps->ppsinfo.clear_timestamp; + osp = &pps->ppsparam.clear_offset; + foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; + fhard = pps->kcmode & PPS_CAPTURECLEAR; + pcount = &pps->ppscount[1]; + pseq = &pps->ppsinfo.clear_sequence; + } + + /* + * If the timecounter changed, we cannot compare the count values, so + * we have to drop the rest of the PPS-stuff until the next event. + */ + if (pps->ppstc != pps->capth->th_counter) { + pps->ppstc = pps->capth->th_counter; + *pcount = pps->capcount; + pps->ppscount[2] = pps->capcount; + return; + } + + /* Return if nothing really happened. */ + if (*pcount == pps->capcount) + return; + + /* Convert the count to a timespec. */ + tcount = pps->capcount - pps->capth->th_offset_count; + tcount &= pps->capth->th_counter->tc_counter_mask; + bt = pps->capth->th_offset; + bintime_addx(&bt, pps->capth->th_scale * tcount); + bintime_add(&bt, &boottimebin); + bintime2timespec(&bt, &ts); + + /* If the timecounter was wound up underneath us, bail out. */ + if (pps->capgen != pps->capth->th_generation) + return; + + *pcount = pps->capcount; + (*pseq)++; + *tsp = ts; + + if (foff) { + timespecadd(tsp, osp); + if (tsp->tv_nsec < 0) { + tsp->tv_nsec += 1000000000; + tsp->tv_sec -= 1; + } + } +#ifdef PPS_SYNC + if (fhard) { + /* + * Feed the NTP PLL/FLL. + * The FLL wants to know how many nanoseconds elapsed since + * the previous event. + * I have never been able to convince myself that this code + * is actually correct: Using th_scale is bound to contain + * a phase correction component from userland, when running + * as FLL, so the number hardpps() gets is not meaningful IMO. + */ + tcount = pps->capcount - pps->ppscount[2]; + pps->ppscount[2] = pps->capcount; + tcount &= pps->capth->th_counter->tc_counter_mask; + bt.sec = 0; + bt.frac = 0; + bintime_addx(&bt, pps->capth->th_scale * tcount); + bintime2timespec(&bt, &ts); + hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec); + } +#endif +} + +/* + * Timecounters need to be updated every so often to prevent the hardware + * counter from overflowing. Updating also recalculates the cached values + * used by the get*() family of functions, so their precision depends on + * the update frequency. + */ + +static int tc_tick; +SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tick, 0, ""); + +static void +tc_ticktock(void *dummy) +{ + + tc_windup(); + timeout(tc_ticktock, NULL, tc_tick); +} + +static void +inittimecounter(void *dummy) +{ + u_int p; + + /* + * Set the initial timeout to + * max(1, <approx. number of hardclock ticks in a millisecond>). + * People should probably not use the sysctl to set the timeout + * to smaller than its inital value, since that value is the + * smallest reasonable one. If they want better timestamps they + * should use the non-"get"* functions. + */ + if (hz > 1000) + tc_tick = (hz + 500) / 1000; + else + tc_tick = 1; + p = (tc_tick * 1000000) / hz; + printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); + + /* warm up new timecounter (again) and get rolling. 
*/ + (void)timecounter->tc_get_timecount(timecounter); + (void)timecounter->tc_get_timecount(timecounter); + tc_ticktock(NULL); +} + +SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_FIRST, inittimecounter, NULL) diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 0000000..645170e --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,678 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timetc.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +struct timezone tz; + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. 
+ */ + +static int nanosleep1(struct thread *td, struct timespec *rqt, + struct timespec *rmt); +static int settime(struct thread *, struct timeval *); +static void timevalfix(struct timeval *); +static void no_lease_updatetime(int); + +static void +no_lease_updatetime(deltat) + int deltat; +{ +} + +void (*lease_updatetime)(int) = no_lease_updatetime; + +static int +settime(td, tv) + struct thread *td; + struct timeval *tv; +{ + struct timeval delta, tv1, tv2; + static struct timeval maxtime, laststep; + struct timespec ts; + int s; + + s = splclock(); + microtime(&tv1); + delta = *tv; + timevalsub(&delta, &tv1); + + /* + * If the system is secure, we do not allow the time to be + * set to a value earlier than 1 second less than the highest + * time we have yet seen. The worst a miscreant can do in + * this circumstance is "freeze" time. He couldn't go + * back to the past. + * + * We similarly do not allow the clock to be stepped more + * than one second, nor more than once per second. This allows + * a miscreant to make the clock march double-time, but no worse. + */ + if (securelevel_gt(td->td_ucred, 1) != 0) { + if (delta.tv_sec < 0 || delta.tv_usec < 0) { + /* + * Update maxtime to latest time we've seen. + */ + if (tv1.tv_sec > maxtime.tv_sec) + maxtime = tv1; + tv2 = *tv; + timevalsub(&tv2, &maxtime); + if (tv2.tv_sec < -1) { + tv->tv_sec = maxtime.tv_sec - 1; + printf("Time adjustment clamped to -1 second\n"); + } + } else { + if (tv1.tv_sec == laststep.tv_sec) { + splx(s); + return (EPERM); + } + if (delta.tv_sec > 1) { + tv->tv_sec = tv1.tv_sec + 1; + printf("Time adjustment clamped to +1 second\n"); + } + laststep = *tv; + } + } + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = tv->tv_usec * 1000; + mtx_lock(&Giant); + tc_setclock(&ts); + (void) splsoftclock(); + lease_updatetime(delta.tv_sec); + splx(s); + resettodr(); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_gettime_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +clock_gettime(td, uap) + struct thread *td; + struct clock_gettime_args *uap; +{ + struct timespec ats; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + mtx_lock(&Giant); + nanotime(&ats); + mtx_unlock(&Giant); + return (copyout(&ats, SCARG(uap, tp), sizeof(ats))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_settime_args { + clockid_t clock_id; + const struct timespec *tp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +clock_settime(td, uap) + struct thread *td; + struct clock_settime_args *uap; +{ + struct timeval atv; + struct timespec ats; + int error; + + if ((error = suser(td)) != 0) + return (error); + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) + return (error); + if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000) + return (EINVAL); + /* XXX Don't convert nsec->usec and back */ + TIMESPEC_TO_TIMEVAL(&atv, &ats); + error = settime(td, &atv); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_getres_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +int +clock_getres(td, uap) + struct thread *td; + struct clock_getres_args *uap; +{ + struct timespec ts; + int error; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + error = 0; + if (SCARG(uap, tp)) { + ts.tv_sec = 0; + ts.tv_nsec = 1000000000 / tc_getfrequency(); + error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); + } + return (error); +} + +static int nanowait; + 
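The syscall entry points above and nanosleep1() below are the kernel side of the standard POSIX time interfaces. As a rough illustration only (not part of the committed source; a userland sketch assuming a hosted C environment), this is how those interfaces are exercised from a program:

#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct timespec res, now;
	struct timespec delay = { 0, 250000000 };	/* 250 ms */

	/* clock_getres() reports 1 s / tc_getfrequency() as the resolution. */
	if (clock_getres(CLOCK_REALTIME, &res) == 0)
		printf("resolution: %ld ns\n", res.tv_nsec);

	/* clock_gettime(CLOCK_REALTIME, ...) is serviced by nanotime(). */
	if (clock_gettime(CLOCK_REALTIME, &now) == 0)
		printf("now: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);

	/*
	 * nanosleep() ends up in nanosleep1(); had the second argument been
	 * non-NULL, the unslept remainder would be copied out on EINTR.
	 */
	nanosleep(&delay, NULL);
	return (0);
}
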
+static int +nanosleep1(td, rqt, rmt) + struct thread *td; + struct timespec *rqt, *rmt; +{ + struct timespec ts, ts2, ts3; + struct timeval tv; + int error; + + if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) + return (EINVAL); + if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) + return (0); + getnanouptime(&ts); + timespecadd(&ts, rqt); + TIMESPEC_TO_TIMEVAL(&tv, rqt); + for (;;) { + error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", + tvtohz(&tv)); + getnanouptime(&ts2); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + timespecsub(&ts, &ts2); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; + } + return (error); + } + if (timespeccmp(&ts2, &ts, >=)) + return (0); + ts3 = ts; + timespecsub(&ts3, &ts2); + TIMESPEC_TO_TIMEVAL(&tv, &ts3); + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct nanosleep_args { + struct timespec *rqtp; + struct timespec *rmtp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nanosleep(td, uap) + struct thread *td; + struct nanosleep_args *uap; +{ + struct timespec rmt, rqt; + int error; + + error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt)); + if (error) + return (error); + + mtx_lock(&Giant); + if (SCARG(uap, rmtp)) { + if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt), + VM_PROT_WRITE)) { + error = EFAULT; + goto done2; + } + } + error = nanosleep1(td, &rqt, &rmt); + if (error && SCARG(uap, rmtp)) { + int error2; + + error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); + if (error2) /* XXX shouldn't happen, did useracc() above */ + error = error2; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +gettimeofday(td, uap) + struct thread *td; + register struct gettimeofday_args *uap; +{ + struct timeval atv; + int error = 0; + + if (uap->tp) { + microtime(&atv); + error = copyout((caddr_t)&atv, (caddr_t)uap->tp, sizeof (atv)); + } + if (error == 0 && uap->tzp != NULL) { + mtx_lock(&Giant); + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, + sizeof (tz)); + mtx_unlock(&Giant); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +settimeofday(td, uap) + struct thread *td; + struct settimeofday_args *uap; +{ + struct timeval atv; + struct timezone atz; + int error = 0; + + if ((error = suser(td))) + return (error); + /* Verify all parameters before changing time. */ + if (uap->tv) { + if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof(atv)))) + return (error); + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + } + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) + return (error); + + if (uap->tv && (error = settime(td, &atv))) + return (error); + if (uap->tzp) { + mtx_lock(&Giant); + tz = atz; + mtx_unlock(&Giant); + } + return (error); +} +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. + * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. 
+ * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getitimer(td, uap) + struct thread *td; + register struct getitimer_args *uap; +{ + struct proc *p = td->td_proc; + struct timeval ctv; + struct itimerval aitv; + int s; + int error; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + + mtx_lock(&Giant); + + s = splclock(); /* XXX still needed ? */ + if (uap->which == ITIMER_REAL) { + /* + * Convert from absolute to relative time in .it_value + * part of real time timer. If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. + */ + aitv = p->p_realtimer; + if (timevalisset(&aitv.it_value)) { + getmicrouptime(&ctv); + if (timevalcmp(&aitv.it_value, &ctv, <)) + timevalclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, &ctv); + } + } else { + aitv = p->p_stats->p_timer[uap->which]; + } + splx(s); + error = copyout((caddr_t)&aitv, (caddr_t)uap->itv, + sizeof (struct itimerval)); + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setitimer(td, uap) + struct thread *td; + register struct setitimer_args *uap; +{ + struct proc *p = td->td_proc; + struct itimerval aitv; + struct timeval ctv; + register struct itimerval *itvp; + int s, error = 0; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + itvp = uap->itv; + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + + mtx_lock(&Giant); + + if ((uap->itv = uap->oitv) && + (error = getitimer(td, (struct getitimer_args *)uap))) { + goto done2; + } + if (itvp == 0) { + error = 0; + goto done2; + } + if (itimerfix(&aitv.it_value)) { + error = EINVAL; + goto done2; + } + if (!timevalisset(&aitv.it_value)) { + timevalclear(&aitv.it_interval); + } else if (itimerfix(&aitv.it_interval)) { + error = EINVAL; + goto done2; + } + s = splclock(); /* XXX: still needed ? */ + if (uap->which == ITIMER_REAL) { + if (timevalisset(&p->p_realtimer.it_value)) + callout_stop(&p->p_itcallout); + if (timevalisset(&aitv.it_value)) + callout_reset(&p->p_itcallout, tvtohz(&aitv.it_value), + realitexpire, p); + getmicrouptime(&ctv); + timevaladd(&aitv.it_value, &ctv); + p->p_realtimer = aitv; + } else { + p->p_stats->p_timer[uap->which] = aitv; + } + splx(s); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. 
+ * tvtohz() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. + */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + struct timeval ctv, ntv; + int s; + + p = (struct proc *)arg; + PROC_LOCK(p); + psignal(p, SIGALRM); + if (!timevalisset(&p->p_realtimer.it_interval)) { + timevalclear(&p->p_realtimer.it_value); + PROC_UNLOCK(p); + return; + } + for (;;) { + s = splclock(); /* XXX: still neeeded ? */ + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + getmicrouptime(&ctv); + if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { + ntv = p->p_realtimer.it_value; + timevalsub(&ntv, &ctv); + callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1, + realitexpire, p); + splx(s); + PROC_UNLOCK(p); + return; + } + splx(s); + } + /*NOTREACHED*/ +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +int +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. + */ +int +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timevalisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timevalisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. 
+ */ +void +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +void +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..937b0c2 --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,414 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* Exported to machdep.c and/or kern_clock.c. */ +struct callout *callout; +struct callout_list callfree; +int callwheelsize, callwheelbits, callwheelmask; +struct callout_tailq *callwheel; +int softticks; /* Like ticks, but for softclock(). */ +struct mtx callout_lock; + +static struct callout *nextsoftcheck; /* Next callout to be checked. 
*/ + +/* + * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization + * + * This code is called very early in the kernel initialization sequence, + * and may be called more then once. + */ +caddr_t +kern_timeout_callwheel_alloc(caddr_t v) +{ + /* + * Calculate callout wheel size + */ + for (callwheelsize = 1, callwheelbits = 0; + callwheelsize < ncallout; + callwheelsize <<= 1, ++callwheelbits) + ; + callwheelmask = callwheelsize - 1; + + callout = (struct callout *)v; + v = (caddr_t)(callout + ncallout); + callwheel = (struct callout_tailq *)v; + v = (caddr_t)(callwheel + callwheelsize); + return(v); +} + +/* + * kern_timeout_callwheel_init() - initialize previously reserved callwheel + * space. + * + * This code is called just once, after the space reserved for the + * callout wheel has been finalized. + */ +void +kern_timeout_callwheel_init(void) +{ + int i; + + SLIST_INIT(&callfree); + for (i = 0; i < ncallout; i++) { + callout_init(&callout[i], 0); + callout[i].c_flags = CALLOUT_LOCAL_ALLOC; + SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); + } + for (i = 0; i < callwheelsize; i++) { + TAILQ_INIT(&callwheel[i]); + } + mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); +} + +/* + * The callout mechanism is based on the work of Adam M. Costello and + * George Varghese, published in a technical report entitled "Redesigning + * the BSD Callout and Timer Facilities" and modified slightly for inclusion + * in FreeBSD by Justin T. Gibbs. The original work on the data structures + * used in this implementation was published by G.Varghese and A. Lauck in + * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for + * the Efficient Implementation of a Timer Facility" in the Proceedings of + * the 11th ACM Annual Symposium on Operating Systems Principles, + * Austin, Texas Nov 1987. + */ + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +void +softclock(void *dummy) +{ + register struct callout *c; + register struct callout_tailq *bucket; + register int curticks; + register int steps; /* #steps since we last allowed interrupts */ + +#ifndef MAX_SOFTCLOCK_STEPS +#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ +#endif /* MAX_SOFTCLOCK_STEPS */ + + steps = 0; + mtx_lock_spin(&callout_lock); + while (softticks != ticks) { + softticks++; + /* + * softticks may be modified by hard clock, so cache + * it while we work on a given bucket. + */ + curticks = softticks; + bucket = &callwheel[curticks & callwheelmask]; + c = TAILQ_FIRST(bucket); + while (c) { + if (c->c_time != curticks) { + c = TAILQ_NEXT(c, c_links.tqe); + ++steps; + if (steps >= MAX_SOFTCLOCK_STEPS) { + nextsoftcheck = c; + /* Give interrupts a chance. 
*/ + mtx_unlock_spin(&callout_lock); + ; /* nothing */ + mtx_lock_spin(&callout_lock); + c = nextsoftcheck; + steps = 0; + } + } else { + void (*c_func)(void *); + void *c_arg; + int c_flags; + + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(bucket, c, c_links.tqe); + c_func = c->c_func; + c_arg = c->c_arg; + c_flags = c->c_flags; + c->c_func = NULL; + if (c->c_flags & CALLOUT_LOCAL_ALLOC) { + c->c_flags = CALLOUT_LOCAL_ALLOC; + SLIST_INSERT_HEAD(&callfree, c, + c_links.sle); + } else { + c->c_flags = + (c->c_flags & ~CALLOUT_PENDING); + } + mtx_unlock_spin(&callout_lock); + if (!(c_flags & CALLOUT_MPSAFE)) + mtx_lock(&Giant); + c_func(c_arg); + if (!(c_flags & CALLOUT_MPSAFE)) + mtx_unlock(&Giant); + mtx_lock_spin(&callout_lock); + steps = 0; + c = nextsoftcheck; + } + } + } + nextsoftcheck = NULL; + mtx_unlock_spin(&callout_lock); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * callout_handle_init -- + * Initialize a handle so that using it with untimeout is benign. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that although an + * identification value is returned from timeout, the original + * arguments to timeout as well as the identifier are used to + * identify entries for untimeout. + */ +struct callout_handle +timeout(ftn, arg, to_ticks) + timeout_t *ftn; + void *arg; + int to_ticks; +{ + struct callout *new; + struct callout_handle handle; + + mtx_lock_spin(&callout_lock); + + /* Fill in the next free callout structure. */ + new = SLIST_FIRST(&callfree); + if (new == NULL) + /* XXX Attempt to malloc first */ + panic("timeout table full"); + SLIST_REMOVE_HEAD(&callfree, c_links.sle); + + callout_reset(new, to_ticks, ftn, arg); + + handle.callout = new; + mtx_unlock_spin(&callout_lock); + return (handle); +} + +void +untimeout(ftn, arg, handle) + timeout_t *ftn; + void *arg; + struct callout_handle handle; +{ + + /* + * Check for a handle that was initialized + * by callout_handle_init, but never used + * for a real timeout. + */ + if (handle.callout == NULL) + return; + + mtx_lock_spin(&callout_lock); + if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) + callout_stop(handle.callout); + mtx_unlock_spin(&callout_lock); +} + +void +callout_handle_init(struct callout_handle *handle) +{ + handle->callout = NULL; +} + +/* + * New interface; clients allocate their own callout structures. + * + * callout_reset() - establish or change a timeout + * callout_stop() - disestablish a timeout + * callout_init() - initialize a callout structure so that it can + * safely be passed to callout_reset() and callout_stop() + * + * <sys/callout.h> defines three convenience macros: + * + * callout_active() - returns truth if callout has not been serviced + * callout_pending() - returns truth if callout is still waiting for timeout + * callout_deactivate() - marks the callout as having been serviced + */ +void +callout_reset(c, to_ticks, ftn, arg) + struct callout *c; + int to_ticks; + void (*ftn)(void *); + void *arg; +{ + + mtx_lock_spin(&callout_lock); + if (c->c_flags & CALLOUT_PENDING) + callout_stop(c); + + /* + * We could unlock callout_lock here and lock it again before the + * TAILQ_INSERT_TAIL, but there's no point since doing this setup + * doesn't take much time. 
+ */ + if (to_ticks <= 0) + to_ticks = 1; + + c->c_arg = arg; + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + c->c_func = ftn; + c->c_time = ticks + to_ticks; + TAILQ_INSERT_TAIL(&callwheel[c->c_time & callwheelmask], + c, c_links.tqe); + mtx_unlock_spin(&callout_lock); +} + +int +callout_stop(c) + struct callout *c; +{ + + mtx_lock_spin(&callout_lock); + /* + * Don't attempt to delete a callout that's not on the queue. + */ + if (!(c->c_flags & CALLOUT_PENDING)) { + c->c_flags &= ~CALLOUT_ACTIVE; + mtx_unlock_spin(&callout_lock); + return (0); + } + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + + if (nextsoftcheck == c) { + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + } + TAILQ_REMOVE(&callwheel[c->c_time & callwheelmask], c, c_links.tqe); + c->c_func = NULL; + + if (c->c_flags & CALLOUT_LOCAL_ALLOC) { + SLIST_INSERT_HEAD(&callfree, c, c_links.sle); + } + mtx_unlock_spin(&callout_lock); + return (1); +} + +void +callout_init(c, mpsafe) + struct callout *c; + int mpsafe; +{ + bzero(c, sizeof *c); + if (mpsafe) + c->c_flags |= CALLOUT_MPSAFE; +} + +#ifdef APM_FIXUP_CALLTODO +/* + * Adjust the kernel calltodo timeout list. This routine is used after + * an APM resume to recalculate the calltodo timer list values with the + * number of hz's we have been sleeping. The next hardclock() will detect + * that there are fired timers and run softclock() to execute them. + * + * Please note, I have not done an exhaustive analysis of what code this + * might break. I am motivated to have my select()'s and alarm()'s that + * have expired during suspend firing upon resume so that the applications + * which set the timer can do the maintanence the timer was for as close + * as possible to the originally intended time. Testing this code for a + * week showed that resuming from a suspend resulted in 22 to 25 timers + * firing, which seemed independant on whether the suspend was 2 hours or + * 2 days. Your milage may vary. - Ken Key <key@cs.utk.edu> + */ +void +adjust_timeout_calltodo(time_change) + struct timeval *time_change; +{ + register struct callout *p; + unsigned long delta_ticks; + + /* + * How many ticks were we asleep? + * (stolen from tvtohz()). + */ + + /* Don't do anything */ + if (time_change->tv_sec < 0) + return; + else if (time_change->tv_sec <= LONG_MAX / 1000000) + delta_ticks = (time_change->tv_sec * 1000000 + + time_change->tv_usec + (tick - 1)) / tick + 1; + else if (time_change->tv_sec <= LONG_MAX / hz) + delta_ticks = time_change->tv_sec * hz + + (time_change->tv_usec + (tick - 1)) / tick + 1; + else + delta_ticks = LONG_MAX; + + if (delta_ticks > INT_MAX) + delta_ticks = INT_MAX; + + /* + * Now rip through the timer calltodo list looking for timers + * to expire. + */ + + /* don't collide with softclock() */ + mtx_lock_spin(&callout_lock); + for (p = calltodo.c_next; p != NULL; p = p->c_next) { + p->c_time -= delta_ticks; + + /* Break if the timer had more time on it than delta_ticks */ + if (p->c_time > 0) + break; + + /* take back the ticks the timer didn't use (p->c_time <= 0) */ + delta_ticks = -p->c_time; + } + mtx_unlock_spin(&callout_lock); + + return; +} +#endif /* APM_FIXUP_CALLTODO */ diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c new file mode 100644 index 0000000..ba5faa5 --- /dev/null +++ b/sys/kern/kern_uuid.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2002 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/endian.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sbuf.h> +#include <sys/socket.h> +#include <sys/sysproto.h> +#include <sys/uuid.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> + +/* + * See also: + * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt + * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm + * + * Note that the generator state is itself an UUID, but the time and clock + * sequence fields are written in the native byte order. + */ + +CTASSERT(sizeof(struct uuid) == 16); + +/* We use an alternative, more convenient representation in the generator. */ +struct uuid_private { + union { + uint64_t ll; /* internal. */ + struct { + uint32_t low; + uint16_t mid; + uint16_t hi; + } x; + } time; + uint16_t seq; /* Big-endian. */ + uint16_t node[UUID_NODE_LEN>>1]; +}; + +CTASSERT(sizeof(struct uuid_private) == 16); + +static struct uuid_private uuid_last; + +static struct mtx uuid_mutex; +MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); + +/* + * Return the first MAC address we encounter or, if none was found, + * construct a sufficiently random multicast address. We don't try + * to return the same MAC address as previously returned. We always + * generate a new multicast address if no MAC address exists in the + * system. + * It would be nice to know if 'ifnet' or any of its sub-structures + * has been changed in any way. If not, we could simply skip the + * scan and safely return the MAC address we returned before. + */ +static void +uuid_node(uint16_t *node) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + int i; + + /* XXX: lock ifnet. */ + TAILQ_FOREACH(ifp, &ifnet, if_link) { + /* Walk the address list */ + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + sdl = (struct sockaddr_dl*)ifa->ifa_addr; + if (sdl != NULL && sdl->sdl_family == AF_LINK && + sdl->sdl_type == IFT_ETHER) { + /* Got a MAC address. */ + bcopy(LLADDR(sdl), node, UUID_NODE_LEN); + /* XXX: unlock ifnet. */ + return; + } + } + } + /* XXX: unlock ifnet. 
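+	 * No MAC address was found, so fall through to the fallback below:
+	 * a node value built from arc4random() with one bit forced on so
+	 * that, as the draft referenced above recommends, it reads as a
+	 * multicast-style address and cannot collide with a real interface
+	 * address.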
*/ + + for (i = 0; i < (UUID_NODE_LEN>>1); i++) + node[i] = (uint16_t)arc4random(); + *((uint8_t*)node) |= 0x80; +} + +/* + * Get the current time as a 60 bit count of 100-nanosecond intervals + * since 00:00:00.00, October 15,1582. We apply a magic offset to convert + * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the + * Gregorian reform to the Christian calendar. + */ +static uint64_t +uuid_time(void) +{ + struct bintime bt; + uint64_t time = 0x01B21DD213814000LL; + + bintime(&bt); + time += (uint64_t)bt.sec * 10000000LL; + time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32; + return (time & ((1LL << 60) - 1LL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct uuidgen_args { + struct uuid *store; + int count; +}; +#endif + +int uuidgen(struct thread *td, struct uuidgen_args *uap) +{ + struct uuid_private uuid; + uint64_t time; + int error; + + /* + * Limit the number of UUIDs that can be created at the same time + * to some arbitrary number. This isn't really necessary, but I + * like to have some sort of upper-bound that's less than 2G :-) + * XXX needs to be tunable. + */ + if (uap->count < 1 || uap->count > 2048) + return (EINVAL); + + /* XXX: pre-validate accessibility to the whole of the UUID store? */ + + mtx_lock(&uuid_mutex); + + uuid_node(uuid.node); + time = uuid_time(); + + if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] || + uuid_last.node[1] != uuid.node[1] || + uuid_last.node[2] != uuid.node[2]) + uuid.seq = (uint16_t)arc4random() & 0x3fff; + else if (uuid_last.time.ll >= time) + uuid.seq = (uuid_last.seq + 1) & 0x3fff; + else + uuid.seq = uuid_last.seq; + + uuid_last = uuid; + uuid_last.time.ll = (time + uap->count - 1) & ((1LL << 60) - 1LL); + + mtx_unlock(&uuid_mutex); + + /* Set sequence and variant and deal with byte order. */ + uuid.seq = htobe16(uuid.seq | 0x8000); + + /* XXX: this should copyout larger chunks at a time. */ + do { + /* Set time and version (=1) and deal with byte order. */ + uuid.time.x.low = (uint32_t)time; + uuid.time.x.mid = (uint16_t)(time >> 32); + uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12); + error = copyout(&uuid, uap->store, sizeof(uuid)); + uap->store++; + uap->count--; + time++; + } while (uap->count > 0 && !error); + + return (error); +} + +int +snprintf_uuid(char *buf, size_t sz, struct uuid *uuid) +{ + struct uuid_private *id; + int cnt; + + id = (struct uuid_private *)uuid; + cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x", + id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq), + be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2])); + return (cnt); +} + +int +printf_uuid(struct uuid *uuid) +{ + char buf[38]; + + snprintf_uuid(buf, sizeof(buf), uuid); + return (printf("%s", buf)); +} + +int +sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) +{ + char buf[38]; + + snprintf_uuid(buf, sizeof(buf), uuid); + return (sbuf_printf(sb, "%s", buf)); +} diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 0000000..9d4136b --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/utsname.h> + + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogethostname(td, uap) + struct thread *td; + struct gethostname_args *uap; +{ + int name[2]; + int error; + size_t len = uap->len; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + mtx_lock(&Giant); + error = userland_sysctl(td, name, 2, uap->hostname, &len, 1, 0, 0, 0); + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osethostname(td, uap) + struct thread *td; + register struct sethostname_args *uap; +{ + int name[2]; + int error; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + mtx_lock(&Giant); + if ((error = suser_cred(td->td_ucred, PRISON_ROOT)) == 0) { + error = userland_sysctl(td, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0); + } + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogethostid(td, uap) + struct thread *td; + struct ogethostid_args *uap; +{ + + *(long *)(td->td_retval) = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osethostid(td, uap) + struct thread *td; + struct osethostid_args *uap; +{ + int error; + + mtx_lock(&Giant); + if ((error = suser(td))) + hostid = uap->hostid; + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +oquota(td, uap) + struct thread *td; + struct oquota_args *uap; +{ + return (ENOSYS); +} +#endif /* COMPAT_43 */ + 
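As an aside on how these compatibility wrappers relate to the sysctl tree: the value they read and write is the ordinary kern.hostname node. Below is a minimal userland sketch (illustrative only, not part of this change) that fetches the same string through the {CTL_KERN, KERN_HOSTNAME} MIB pair that ogethostname() hands to userland_sysctl():

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int mib[2] = { CTL_KERN, KERN_HOSTNAME };
		char buf[MAXHOSTNAMELEN];
		size_t len = sizeof(buf);

		/* Same MIB pair the compat syscall builds in-kernel. */
		if (sysctl(mib, 2, buf, &len, NULL, 0) == -1) {
			perror("sysctl");
			return (1);
		}
		printf("kern.hostname = %s\n", buf);
		return (0);
	}

The in-kernel wrappers differ mainly in that osethostname() additionally requires the suser_cred() check before the node may be written.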
+/* + * This is the FreeBSD-1.1 compatable uname(2) interface. These + * days it is done in libc as a wrapper around a bunch of sysctl's. + * This must maintain the old 1.1 binary ABI. + */ +#if SYS_NMLN != 32 +#error "FreeBSD-1.1 uname syscall has been broken" +#endif +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +uname(td, uap) + struct thread *td; + struct uname_args *uap; +{ + int name[2], error; + size_t len; + char *s, *us; + + name[0] = CTL_KERN; + name[1] = KERN_OSTYPE; + len = sizeof (uap->name->sysname); + mtx_lock(&Giant); + error = userland_sysctl(td, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + error = userland_sysctl(td, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + error = userland_sysctl(td, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + error = userland_sysctl(td, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + error = subyte( us++, *s); + if (error) + goto done2; + } + error = subyte( us++, 0); + if (error) + goto done2; + + name[0] = CTL_HW; + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + error = userland_sysctl(td, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getdomainname(td, uap) + struct thread *td; + struct getdomainname_args *uap; +{ + int domainnamelen; + int error; + + mtx_lock(&Giant); + domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + error = copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len); + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char *domainname; + int len; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setdomainname(td, uap) + struct thread *td; + struct setdomainname_args *uap; +{ + int error, domainnamelen; + + mtx_lock(&Giant); + if ((error = suser(td))) + goto done2; + if ((u_int)uap->len > sizeof (domainname) - 1) { + error = EINVAL; + goto done2; + } + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; +done2: + mtx_unlock(&Giant); + return (error); +} + diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c new file mode 100644 index 0000000..c9081c3 --- /dev/null +++ b/sys/kern/ksched.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 1996, 1997 + * HD Associates, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* ksched: Soft real time scheduling based on "rtprio". + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resource.h> + +#include <posix4/posix4.h> + +/* ksched: Real-time extension to support POSIX priority scheduling. + */ + +struct ksched { + struct timespec rr_interval; +}; + +int ksched_attach(struct ksched **p) +{ + struct ksched *ksched= p31b_malloc(sizeof(*ksched)); + + ksched->rr_interval.tv_sec = 0; + ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); + + *p = ksched; + return 0; +} + +int ksched_detach(struct ksched *ks) +{ + p31b_free(ks); + + return 0; +} + +/* + * XXX About priorities + * + * POSIX 1003.1b requires that numerically higher priorities be of + * higher priority. It also permits sched_setparam to be + * implementation defined for SCHED_OTHER. I don't like + * the notion of inverted priorites for normal processes when + * you can use "setpriority" for that. + * + * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. 
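+ * As a concrete example of the mapping below: with RTP_PRIO_MAX at its
+ * usual value of 31, POSIX priority 0 becomes rtprio 31 (the weakest
+ * real-time priority) and POSIX priority 31 becomes rtprio 0 (the
+ * strongest).  The two macros are the same expression, so each is its
+ * own inverse.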
+ */ + +/* Macros to convert between the unix (lower numerically is higher priority) + * and POSIX 1003.1b (higher numerically is higher priority) + */ + +#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) +#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) + +/* These improve readability a bit for me: + */ +#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) +#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) + +static __inline int +getscheduler(register_t *ret, struct ksched *ksched, struct thread *td) +{ + struct rtprio rtp; + int e = 0; + + mtx_lock_spin(&sched_lock); + pri_to_rtp(td->td_ksegrp, &rtp); + mtx_unlock_spin(&sched_lock); + switch (rtp.type) + { + case RTP_PRIO_FIFO: + *ret = SCHED_FIFO; + break; + + case RTP_PRIO_REALTIME: + *ret = SCHED_RR; + break; + + default: + *ret = SCHED_OTHER; + break; + } + + return e; +} + +int ksched_setparam(register_t *ret, struct ksched *ksched, + struct thread *td, const struct sched_param *param) +{ + register_t policy; + int e; + + e = getscheduler(&policy, ksched, td); + + if (e == 0) + { + if (policy == SCHED_OTHER) + e = EINVAL; + else + e = ksched_setscheduler(ret, ksched, td, policy, param); + } + + return e; +} + +int ksched_getparam(register_t *ret, struct ksched *ksched, + struct thread *td, struct sched_param *param) +{ + struct rtprio rtp; + + mtx_lock_spin(&sched_lock); + pri_to_rtp(td->td_ksegrp, &rtp); + mtx_unlock_spin(&sched_lock); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + + return 0; +} + +/* + * XXX The priority and scheduler modifications should + * be moved into published interfaces in kern/kern_sync. + * + * The permissions to modify process p were checked in "p31b_proc()". + * + */ +int ksched_setscheduler(register_t *ret, struct ksched *ksched, + struct thread *td, int policy, const struct sched_param *param) +{ + int e = 0; + struct rtprio rtp; + struct ksegrp *kg = td->td_ksegrp; + + switch(policy) + { + case SCHED_RR: + case SCHED_FIFO: + + if (param->sched_priority >= P1B_PRIO_MIN && + param->sched_priority <= P1B_PRIO_MAX) + { + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + rtp.type = (policy == SCHED_FIFO) + ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; + + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, kg); + td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } + else + e = EPERM; + + + break; + + case SCHED_OTHER: + { + rtp.type = RTP_PRIO_NORMAL; + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, kg); + + /* XXX Simply revert to whatever we had for last + * normal scheduler priorities. + * This puts a requirement + * on the scheduling code: You must leave the + * scheduling info alone. + */ + td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } + break; + } + + return e; +} + +int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct thread *td) +{ + return getscheduler(ret, ksched, td); +} + +/* ksched_yield: Yield the CPU. 
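+ * Only the reschedule flag is set here; the actual context switch
+ * happens later, once the flag is noticed (typically on the way back
+ * out of the kernel).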
+ */ +int ksched_yield(register_t *ret, struct ksched *ksched) +{ + mtx_lock_spin(&sched_lock); + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; + mtx_unlock_spin(&sched_lock); + return 0; +} + +int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = RTP_PRIO_MAX; + break; + + case SCHED_OTHER: + *ret = PRIO_MAX; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = P1B_PRIO_MIN; + break; + + case SCHED_OTHER: + *ret = PRIO_MIN; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, + struct thread *td, struct timespec *timespec) +{ + *timespec = ksched->rr_interval; + + return 0; +} diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c new file mode 100644 index 0000000..5a863bd --- /dev/null +++ b/sys/kern/link_aout.c @@ -0,0 +1,590 @@ +/*- + * Copyright (c) 1997-2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifdef __i386__ + +#define FREEBSD_AOUT 1 + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + + +#include "linker_if.h" + +#ifndef __ELF__ +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/vmparam.h> +#endif + +#include <a.out.h> +#include <link.h> + +typedef struct aout_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was this pre-loader */ + char* address; /* Load address */ + struct _dynamic* dynamic; /* Symbol table etc. 
*/ +} *aout_file_t; + +static int link_aout_link_preload(linker_class_t lc, + const char* modname, linker_file_t*); +static int link_aout_link_preload_finish(linker_file_t); + +static int link_aout_load_file(linker_class_t lc, const char*, linker_file_t*); +static int link_aout_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym, + linker_symval_t* symval); +static int link_aout_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp); +static void link_aout_unload_file(linker_file_t); +static void link_aout_unload_preload(linker_file_t); +static int link_aout_lookup_set(linker_file_t, const char*, + void ***, void ***, int*); + +static kobj_method_t link_aout_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_aout_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_aout_symbol_values), + KOBJMETHOD(linker_search_symbol, link_aout_search_symbol), + KOBJMETHOD(linker_unload, link_aout_unload_file), + KOBJMETHOD(linker_load_file, link_aout_load_file), + KOBJMETHOD(linker_link_preload, link_aout_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_aout_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_aout_lookup_set), + { 0, 0 } +}; + +static struct linker_class link_aout_class = { + "a.out", link_aout_methods, sizeof(struct aout_file) +}; + +static int relocate_file(aout_file_t af); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic __DYNAMIC; + +static void +link_aout_init(void* arg) +{ +#ifndef __ELF__ + struct _dynamic* dp = &__DYNAMIC; +#endif + + linker_add_class(&link_aout_class); + +#ifndef __ELF__ + if (dp) { + aout_file_t af; + + linker_kernel_file = + linker_make_file(kernelname, &link_aout_class); + if (linker_kernel_file == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + af = (aout_file_t) linker_kernel_file; + af->address = 0; + af->dynamic = dp; + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(long)linker_kernel_file->address; + } +#endif +} + +SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0); + +static int +link_aout_link_preload(linker_class_t lc, + const char* filename, linker_file_t* result) +{ + caddr_t modptr, baseptr; + char *type; + struct exec *ehdr; + aout_file_t af; + linker_file_t lf; + + /* Look to see if we have the module preloaded. */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) || + strcmp(type, "a.out module") || + ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) || + ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL)) + return(0); /* we can't handle this */ + + /* Register with kld */ + lf = linker_make_file(filename, &link_aout_class); + if (lf == NULL) { + return(ENOMEM); + } + af = (aout_file_t) lf; + + /* Looks like we can handle this one */ + filename = preload_search_info(modptr, MODINFO_NAME); + af->preloaded = 1; + af->address = baseptr; + + /* Assume _DYNAMIC is the first data item. 
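+	 * That is, it sits at the start of the data segment, a_text bytes
+	 * into the image; the d_version check just below effectively
+	 * verifies that guess and rejects files where it does not hold.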
*/ + af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + linker_file_unload(lf); + return(0); /* we can't handle this */ + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf->address = af->address; + lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss; + *result = lf; + return(0); +} + +static int +link_aout_link_preload_finish(linker_file_t lf) +{ + aout_file_t af; + int error; + + af = (aout_file_t) lf; + error = relocate_file(af); + if (error) { + linker_file_unload(lf); + return(error); + } + return(0); +} + +static int +link_aout_load_file(linker_class_t lc, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread *td = curthread; /* XXX */ + int error = 0; + int resid, flags; + struct exec header; + aout_file_t af; + linker_file_t lf = 0; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the a.out header from the file. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC)) + goto out; + + /* + * We have an a.out file, so make some space to read it in. + */ + lf = linker_make_file(filename, &link_aout_class); + if (lf == NULL) { + error = ENOMEM; + goto out; + } + + af = (aout_file_t) lf; + af->address = malloc(header.a_text + header.a_data + header.a_bss, + M_LINKER, M_WAITOK); + + /* + * Read the text and data sections and zero the bss. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address, + header.a_text + header.a_data, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + bzero(af->address + header.a_text + header.a_data, header.a_bss); + + /* + * Assume _DYNAMIC is the first data item. + */ + af->dynamic = (struct _dynamic*) (af->address + header.a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + error = ENOEXEC; + goto out; + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf->address = af->address; + lf->size = header.a_text + header.a_data + header.a_bss; + + error = linker_load_dependencies(lf); + if (error) + goto out; + error = relocate_file(af); + if (error) + goto out; + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_aout_unload_file(linker_file_t file) +{ + aout_file_t af = (aout_file_t) file; + + if (af->preloaded) { + link_aout_unload_preload(file); + return; + } + + if (af->address) + free(af->address, M_LINKER); +} + +static void +link_aout_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +/* + * XXX i386 dependant. 
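+ * The r_length encoding handled below (0, 1 and 2 for byte, word and
+ * long fields) and the in-place addends are a.out/i386 conventions;
+ * other architectures would need their own read/write helpers.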
+ */ +static long +read_relocation(struct relocation_info* r, char* addr) +{ + int length = r->r_length; + + if (length == 0) + return *(u_char*) addr; + else if (length == 1) + return *(u_short*) addr; + else if (length == 2) + return *(u_int*) addr; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); + return 0; +} + +static void +write_relocation(struct relocation_info* r, char* addr, long value) +{ + int length = r->r_length; + + if (length == 0) + *(u_char*) addr = value; + else if (length == 1) + *(u_short*) addr = value; + else if (length == 2) + *(u_int*) addr = value; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); +} + +#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off)) + +static int +relocate_file(aout_file_t af) +{ + struct relocation_info* rel; + struct relocation_info* erel; + struct relocation_info* r; + struct nzlist* symbolbase; + char* stringbase; + struct nzlist* np; + char* sym; + long relocation; + + rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic)); + erel = AOUT_RELOC(af, struct relocation_info, + LD_REL(af->dynamic) + LD_RELSZ(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + for (r = rel; r < erel; r++) { + char* addr; + + if (r->r_address == 0) + break; + + addr = AOUT_RELOC(af, char, r->r_address); + if (r->r_extern) { + np = &symbolbase[r->r_symbolnum]; + sym = &stringbase[np->nz_strx]; + + if (sym[0] != '_') { + printf("link_aout: bad symbol name %s\n", sym); + relocation = 0; + } else + relocation = (intptr_t) + linker_file_lookup_symbol(&af->lf, sym + 1, + np->nz_type != (N_SETV+N_EXT)); + if (!relocation) { + printf("link_aout: symbol %s not found\n", sym); + return ENOENT; + } + + relocation += read_relocation(r, addr); + + if (r->r_jmptable) { + printf("link_aout: can't cope with jump table relocations\n"); + continue; + } + + if (r->r_pcrel) + relocation -= (intptr_t) af->address; + + if (r->r_copy) { + printf("link_aout: can't cope with copy relocations\n"); + continue; + } + + write_relocation(r, addr, relocation); + } else { + write_relocation(r, addr, + (intptr_t)(read_relocation(r, addr) + af->address)); + } + + } + + return 0; +} + +static long +symbol_hash_value(aout_file_t af, const char* name) +{ + long hashval; + const char* p; + + hashval = '_'; /* fake a starting '_' for C symbols */ + for (p = name; *p; p++) + hashval = (hashval << 1) + *p; + + return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic); +} + +int +link_aout_lookup_symbol(linker_file_t file, const char* name, + c_linker_sym_t* sym) +{ + aout_file_t af = (aout_file_t) file; + long hashval; + struct rrs_hash* hashbase; + struct nzlist* symbolbase; + char* stringbase; + struct rrs_hash* hp; + struct nzlist* np; + char* cp; + + if (LD_BUCKETS(af->dynamic) == 0) + return 0; + + hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + +restart: + hashval = symbol_hash_value(af, name); + hp = &hashbase[hashval]; + if (hp->rh_symbolnum == -1) + return ENOENT; + + while (hp) { + np = (struct nzlist *) &symbolbase[hp->rh_symbolnum]; + cp = stringbase + np->nz_strx; + /* + * Note: we fake the leading '_' for C symbols. 
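+		 * symbol_hash_value() above seeds its hash with '_' for the same
+		 * reason, so the chain we are walking was selected as if the
+		 * caller had passed the decorated name; here we just skip the
+		 * underscore before comparing.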
+ */ + if (cp[0] == '_' && !strcmp(cp + 1, name)) + break; + + if (hp->rh_next == 0) + hp = NULL; + else + hp = &hashbase[hp->rh_next]; + } + + if (hp == NULL) + /* + * Not found. + */ + return ENOENT; + + /* + * Check for an aliased symbol, whatever that is. + */ + if (np->nz_type == N_INDR+N_EXT) { + name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */ + goto restart; + } + + /* + * Check this is an actual definition of the symbol. + */ + if (np->nz_value == 0) + return ENOENT; + + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + if (np->nz_other == AUX_FUNC) + /* weak function */ + return ENOENT; + } + + *sym = (linker_sym_t) np; + + return 0; +} + + +static int +link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym, + linker_symval_t* symval) +{ + aout_file_t af = (aout_file_t) file; + const struct nzlist* np = (const struct nzlist*) sym; + char* stringbase; + long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist); + struct nzlist *symbase; + + /* Is it one of ours? It could be another module... */ + symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + if (np < symbase) + return ENOENT; + if ((np - symbase) > numsym) + return ENOENT; + + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */ + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + symval->value = 0; + symval->size = np->nz_value; + } else { + symval->value = AOUT_RELOC(af, char, np->nz_value); + symval->size = np->nz_size; + } + return 0; +} + +static int +link_aout_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + aout_file_t af = (aout_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long sp_nz_value; + struct nzlist* sp; + struct nzlist* ep; + struct nzlist* best = 0; + + for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)), + ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic)); + sp < ep; sp++) { + if (sp->nz_name == 0) + continue; + sp_nz_value = sp->nz_value + (uintptr_t) (void *) af->address; + if (off >= sp_nz_value) { + if (off - sp_nz_value < diff) { + diff = off - sp_nz_value; + best = sp; + if (diff == 0) + break; + } else if (off - sp_nz_value == diff) { + best = sp; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an a.out + gnu LD system. + */ +struct generic_linker_set { + int ls_length; + void *ls_items[1]; +}; +static int +link_aout_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + void **start, **stop; + int error, count; + struct generic_linker_set *setp; + + error = link_aout_lookup_symbol(lf, name, &sym); + if (error) + return error; + link_aout_symbol_values(lf, sym, &symval); + if (symval.value == 0) + return ESRCH; + setp = (struct generic_linker_set *)symval.value; + count = setp->ls_length; + start = &setp->ls_items[0]; + stop = &setp->ls_items[count]; + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + return 0; +} + +#endif /* __i386__ */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c new file mode 100644 index 0000000..dd59405 --- /dev/null +++ b/sys/kern/link_elf.c @@ -0,0 +1,1239 @@ +/*- + * Copyright (c) 1998-2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + +#include <machine/elf.h> +#ifdef GPROF +#include <machine/profile.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#ifdef SPARSE_MAPPING +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#endif +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#ifdef __AOUT__ +#include <nlist.h> +#endif +#include <link.h> + +#include "linker_if.h" + +typedef struct elf_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was file pre-loaded */ + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + Elf_Dyn* dynamic; /* Symbol table etc. 
*/ + Elf_Hashelt nbuckets; /* DT_HASH info */ + Elf_Hashelt nchains; + const Elf_Hashelt* buckets; + const Elf_Hashelt* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +#ifdef DDB + struct link_map gdb; /* hooks for gdb */ +#endif +} *elf_file_t; + +static int link_elf_link_preload(linker_class_t cls, + const char*, linker_file_t*); +static int link_elf_link_preload_finish(linker_file_t); +static int link_elf_load_file(linker_class_t, const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + c_linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_preload(linker_file_t); +static int link_elf_lookup_set(linker_file_t, const char *, + void ***, void ***, int *); +static int link_elf_each_function_name(linker_file_t, + int (*)(const char *, void *), + void *); + +static kobj_method_t link_elf_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), + KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), + KOBJMETHOD(linker_unload, link_elf_unload_file), + KOBJMETHOD(linker_load_file, link_elf_load_file), + KOBJMETHOD(linker_link_preload, link_elf_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), + KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), + { 0, 0 } +}; + +static struct linker_class link_elf_class = { +#if ELF_TARG_CLASS == ELFCLASS32 + "elf32", +#else + "elf64", +#endif + link_elf_methods, sizeof(struct elf_file) +}; + +static int parse_dynamic(elf_file_t ef); +static int relocate_file(elf_file_t ef); +static int link_elf_preload_parse_symbols(elf_file_t ef); + +#ifdef DDB +static void r_debug_state(struct r_debug *dummy_one, + struct link_map *dummy_two); + +/* + * A list of loaded modules for GDB to use for loading symbols. + */ +struct r_debug r_debug; + +#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL); + +/* + * Function for the debugger to set a breakpoint on to gain control. + */ +void +r_debug_state(struct r_debug *dummy_one __unused, + struct link_map *dummy_two __unused) +{ +} + +#endif + +#ifdef __ia64__ +Elf_Addr link_elf_get_gp(linker_file_t); +#endif + +/* + * The kernel symbol table starts here. 
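+ * The linker-provided _DYNAMIC below is handed to parse_dynamic() in
+ * link_elf_init(), so the kernel's own symbols can be resolved through
+ * the same lookup paths as those of loaded modules.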
+ */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#ifdef DDB + char *newfilename; +#endif +#endif + + linker_add_class(&link_elf_class); + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, &link_elf_class); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + + ef = (elf_file_t) linker_kernel_file; + ef->preloaded = 1; + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + + if (dp) + parse_dynamic(ef); + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + ef->gdb.l_addr = linker_kernel_file->address; + newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, modname); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = dp; + ef->gdb.l_prev = 0; + ef->gdb.l_next = 0; + + r_debug.r_map = &ef->gdb; + r_debug.r_brk = r_debug_state; + r_debug.r_state = RT_CONSISTENT; + + r_debug_state(NULL, NULL); /* say hello to gdb! */ +#endif + +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +link_elf_preload_parse_symbols(elf_file_t ef) +{ + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(elf_file_t ef) +{ + Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Hashelt *hashtab = (const Elf_Hashelt *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + 
dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; +#ifdef DDB + case DT_DEBUG: + dp->d_un.d_ptr = (Elf_Addr) &r_debug; + break; +#endif + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +#ifdef DDB + +static void +link_elf_add_gdb(struct link_map *l) +{ + struct link_map *prev; + + /* + * Scan to the end of the list. + */ + for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) + ; + + /* Link in the new entry. */ + l->l_prev = prev; + l->l_next = prev->l_next; + prev->l_next = l; +} + +static void +link_elf_delete_gdb(struct link_map *l) +{ + if (l->l_prev == NULL) { + if ((r_debug.r_map = l->l_next) != NULL) + l->l_next->l_prev = NULL; + return; + } + + if ((l->l_prev->l_next = l->l_next) != NULL) + l->l_next->l_prev = l->l_prev; +} + +#endif /* DDB */ + +static int +link_elf_link_preload(linker_class_t cls, + const char* filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the file preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + lf = linker_make_file(filename, &link_elf_class); + if (lf == NULL) { + return ENOMEM; + } + + ef = (elf_file_t) lf; + ef->preloaded = 1; + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(ef); + if (error) { + linker_file_unload(lf); + return error; + } + *result = lf; + return (0); +} + +static int +link_elf_link_preload_finish(linker_file_t lf) +{ + elf_file_t ef; + int error; +#ifdef DDB + char *newfilename; +#endif + + ef = (elf_file_t) lf; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = 
ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + return error; + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, lf->filename); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + + return (0); +} + +static int +link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread* td = curthread; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid, flags; + elf_file_t ef; + linker_file_t lf; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; +#ifdef DDB + char *newfilename; +#endif + + GIANT_REQUIRED; + + shdr = NULL; + lf = NULL; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the elf header from the file. + */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. 
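+     * (one text and one data segment; a third PT_LOAD makes the scan
+     * below fail with "Too many sections", and an object without a
+     * PT_DYNAMIC header is rejected as well)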
+ */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + + case PT_INTERP: + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + lf = linker_make_file(filename, &link_elf_class); + if (!lf) { + error = ENOMEM; + goto out; + } + + ef = (elf_file_t) lf; +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + ef->object = 0; + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); + if (!ef->address) { + error = ENOMEM; + goto out; + } +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. + */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) { + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + +#ifdef GPROF + /* Update profiling information with the new text segment. */ + kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + + segs[0]->p_memsz)); +#endif + + ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(ef); + if (error) + goto out; + error = linker_load_dependencies(lf); + if (error) + goto out; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) 
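+     * If the section headers or the SHT_SYMTAB section are missing we
+     * jump to the nosyms label and keep the dynamic symbol table that
+     * parse_dynamic() already recorded for ddb.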
*/ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, filename); + ef->gdb.l_name = (const char *)newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = (elf_file_t) file; + +#ifdef DDB + if (ef->gdb.l_ld) { + GDB_STATE(RT_DELETE); + free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); + link_elf_delete_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); + } +#endif + + if (ef->preloaded) { + link_elf_unload_preload(file); + return; + } +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); +} + +static void +link_elf_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(elf_file_t ef) +{ + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + 
} + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. + */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) +{ + elf_file_t ef = (elf_file_t) lf; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = (elf_file_t) lf; + const Elf_Sym* es = (const Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + 
es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = (elf_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long st_value; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + st_value = es->st_value + (uintptr_t) (void *) ef->address; + if (off >= st_value) { + if (off - st_value < diff) { + diff = off - st_value; + best = es; + if (diff == 0) + break; + } else if (off - st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (c_linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an ELF system. + */ +static int +link_elf_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + char *setsym; + void **start, **stop; + int len, error = 0, count; + + len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ + setsym = malloc(len, M_LINKER, M_WAITOK); + if (setsym == NULL) + return ENOMEM; + + /* get address of first entry */ + snprintf(setsym, len, "%s%s", "__start_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + start = (void **)symval.value; + + /* get address of last entry */ + snprintf(setsym, len, "%s%s", "__stop_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + stop = (void **)symval.value; + + /* and the number of entries */ + count = stop - start; + + /* and copy out */ + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + +out: + free(setsym, M_LINKER); + return error; +} + +static int +link_elf_each_function_name(linker_file_t file, + int (*callback)(const char *, void *), void *opaque) { + elf_file_t ef = (elf_file_t)file; + const Elf_Sym* symp; + int i, error; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + if (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC) { + error = callback(ef->ddbstrtab + symp->st_name, opaque); + if (error) + return (error); + } + } + return (0); +} + +#ifdef __ia64__ +/* + * Each KLD has its own GP. The GP value for each load module is given by + * DT_PLTGOT on ia64. We need GP to construct function descriptors, but + * don't have direct access to the ELF file structure. The link_elf_get_gp() + * function returns the GP given a pointer to a generic linker file struct. + */ +Elf_Addr +link_elf_get_gp(linker_file_t lf) +{ + elf_file_t ef = (elf_file_t)lf; + return (Elf_Addr)ef->got; +} +#endif + +/* + * Symbol lookup function that can be used when the symbol index is known (ie + * in relocations). It uses the symbol index instead of doing a fully fledged + * hash table based lookup when such is valid. For example for local symbols. + * This is not only more efficient, it's also more correct. It's not always + * the case that the symbol can be found through the hash table. 
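+ * A static (STB_LOCAL) function referenced by a relocation within the same
+ * object is the obvious example: it may not appear in the DT_HASH chains at
+ * all, but the symbol index carried in the relocation still identifies it
+ * directly.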
+ */ +Elf_Addr +elf_lookup(linker_file_t lf, Elf_Word symidx, int deps) +{ + elf_file_t ef = (elf_file_t)lf; + const Elf_Sym *sym; + const char *symbol; + + /* Don't even try to lookup the symbol if the index is bogus. */ + if (symidx >= ef->nchains) + return (0); + + sym = ef->symtab + symidx; + + /* + * Don't do a full lookup when the symbol is local. It may even + * fail because it may not be found through the hash table. + */ + if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { + /* Force lookup failure when we have an insanity. */ + if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) + return (0); + return ((Elf_Addr)ef->address + sym->st_value); + } + + /* + * XXX we can avoid doing a hash table based lookup for global + * symbols as well. This however is not always valid, so we'll + * just do it the hard way for now. Performance tweaks can + * always be added. + */ + + symbol = ef->strtab + sym->st_name; + + /* Force a lookup failure if the symbol name is bogus. */ + if (*symbol == 0) + return (0); + + return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); +} diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c new file mode 100644 index 0000000..dd59405 --- /dev/null +++ b/sys/kern/link_elf_obj.c @@ -0,0 +1,1239 @@ +/*- + * Copyright (c) 1998-2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + +#include <machine/elf.h> +#ifdef GPROF +#include <machine/profile.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#ifdef SPARSE_MAPPING +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#endif +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#ifdef __AOUT__ +#include <nlist.h> +#endif +#include <link.h> + +#include "linker_if.h" + +typedef struct elf_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was file pre-loaded */ + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + Elf_Dyn* dynamic; /* Symbol table etc. */ + Elf_Hashelt nbuckets; /* DT_HASH info */ + Elf_Hashelt nchains; + const Elf_Hashelt* buckets; + const Elf_Hashelt* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +#ifdef DDB + struct link_map gdb; /* hooks for gdb */ +#endif +} *elf_file_t; + +static int link_elf_link_preload(linker_class_t cls, + const char*, linker_file_t*); +static int link_elf_link_preload_finish(linker_file_t); +static int link_elf_load_file(linker_class_t, const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + c_linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_preload(linker_file_t); +static int link_elf_lookup_set(linker_file_t, const char *, + void ***, void ***, int *); +static int link_elf_each_function_name(linker_file_t, + int (*)(const char *, void *), + void *); + +static kobj_method_t link_elf_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), + KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), + KOBJMETHOD(linker_unload, link_elf_unload_file), + KOBJMETHOD(linker_load_file, link_elf_load_file), + KOBJMETHOD(linker_link_preload, link_elf_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), + KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), + { 0, 0 } +}; + +static struct linker_class link_elf_class = { +#if ELF_TARG_CLASS == ELFCLASS32 + "elf32", +#else + "elf64", +#endif + link_elf_methods, sizeof(struct elf_file) +}; + +static int parse_dynamic(elf_file_t ef); +static 
int relocate_file(elf_file_t ef); +static int link_elf_preload_parse_symbols(elf_file_t ef); + +#ifdef DDB +static void r_debug_state(struct r_debug *dummy_one, + struct link_map *dummy_two); + +/* + * A list of loaded modules for GDB to use for loading symbols. + */ +struct r_debug r_debug; + +#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL); + +/* + * Function for the debugger to set a breakpoint on to gain control. + */ +void +r_debug_state(struct r_debug *dummy_one __unused, + struct link_map *dummy_two __unused) +{ +} + +#endif + +#ifdef __ia64__ +Elf_Addr link_elf_get_gp(linker_file_t); +#endif + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#ifdef DDB + char *newfilename; +#endif +#endif + + linker_add_class(&link_elf_class); + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, &link_elf_class); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + + ef = (elf_file_t) linker_kernel_file; + ef->preloaded = 1; + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + + if (dp) + parse_dynamic(ef); + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + ef->gdb.l_addr = linker_kernel_file->address; + newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, modname); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = dp; + ef->gdb.l_prev = 0; + ef->gdb.l_next = 0; + + r_debug.r_map = &ef->gdb; + r_debug.r_brk = r_debug_state; + r_debug.r_state = RT_CONSISTENT; + + r_debug_state(NULL, NULL); /* say hello to gdb! 
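+ * The debugger is expected to plant a breakpoint on r_debug_state() (see
+ * r_debug.r_brk above) and to rescan r_debug.r_map each time it fires.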
*/ +#endif + +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +link_elf_preload_parse_symbols(elf_file_t ef) +{ + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(elf_file_t ef) +{ + Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Hashelt *hashtab = (const Elf_Hashelt *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; +#ifdef DDB + case DT_DEBUG: + dp->d_un.d_ptr = (Elf_Addr) &r_debug; + break; +#endif + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +#ifdef DDB + +static void +link_elf_add_gdb(struct link_map *l) +{ + struct link_map *prev; + + /* + * Scan to the end of the list. + */ + for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) + ; + + /* Link in the new entry. 
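+ * That is, append it: the new map's l_prev points at the old tail and the
+ * old tail's l_next now points at the new map, so r_debug.r_map remains a
+ * doubly-linked list that the debugger can walk.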
*/ + l->l_prev = prev; + l->l_next = prev->l_next; + prev->l_next = l; +} + +static void +link_elf_delete_gdb(struct link_map *l) +{ + if (l->l_prev == NULL) { + if ((r_debug.r_map = l->l_next) != NULL) + l->l_next->l_prev = NULL; + return; + } + + if ((l->l_prev->l_next = l->l_next) != NULL) + l->l_next->l_prev = l->l_prev; +} + +#endif /* DDB */ + +static int +link_elf_link_preload(linker_class_t cls, + const char* filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the file preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + lf = linker_make_file(filename, &link_elf_class); + if (lf == NULL) { + return ENOMEM; + } + + ef = (elf_file_t) lf; + ef->preloaded = 1; + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(ef); + if (error) { + linker_file_unload(lf); + return error; + } + *result = lf; + return (0); +} + +static int +link_elf_link_preload_finish(linker_file_t lf) +{ + elf_file_t ef; + int error; +#ifdef DDB + char *newfilename; +#endif + + ef = (elf_file_t) lf; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + return error; + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, lf->filename); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + + return (0); +} + +static int +link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread* td = curthread; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid, flags; + elf_file_t ef; + linker_file_t lf; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; +#ifdef DDB + char *newfilename; +#endif + + GIANT_REQUIRED; + + shdr = NULL; + lf = NULL; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the elf header from the file. 
+ */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + + case PT_INTERP: + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + lf = linker_make_file(filename, &link_elf_class); + if (!lf) { + error = ENOMEM; + goto out; + } + + ef = (elf_file_t) lf; +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + ef->object = 0; + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); + if (!ef->address) { + error = ENOMEM; + goto out; + } +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. 
+ */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) { + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + +#ifdef GPROF + /* Update profiling information with the new text segment. */ + kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + + segs[0]->p_memsz)); +#endif + + ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(ef); + if (error) + goto out; + error = linker_load_dependencies(lf); + if (error) + goto out; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) */ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, filename); + ef->gdb.l_name = (const char *)newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = (elf_file_t) file; + +#ifdef DDB + if (ef->gdb.l_ld) { + GDB_STATE(RT_DELETE); + free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); + link_elf_delete_gdb(&ef->gdb); + 
GDB_STATE(RT_CONSISTENT); + } +#endif + + if (ef->preloaded) { + link_elf_unload_preload(file); + return; + } +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); +} + +static void +link_elf_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(elf_file_t ef) +{ + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
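+ * The value it produces, taken modulo nbuckets, selects a DT_HASH bucket;
+ * link_elf_lookup_symbol() then follows the chains array from that bucket
+ * until a name matches or the chain ends at STN_UNDEF.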
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) +{ + elf_file_t ef = (elf_file_t) lf; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = (elf_file_t) lf; + const Elf_Sym* es = (const Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = (elf_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long st_value; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + st_value = es->st_value + (uintptr_t) (void *) ef->address; + if (off >= st_value) { + if (off - st_value < diff) { + diff = off - st_value; + best = es; + if (diff == 0) + break; + } else if (off - st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (c_linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an ELF system. 
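+ * A set named "foo" is delimited by the linker-provided symbols
+ * __start_set_foo and __stop_set_foo; the entries between them form an
+ * array of pointers, so the count is simply the pointer difference.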
+ */ +static int +link_elf_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + char *setsym; + void **start, **stop; + int len, error = 0, count; + + len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ + setsym = malloc(len, M_LINKER, M_WAITOK); + if (setsym == NULL) + return ENOMEM; + + /* get address of first entry */ + snprintf(setsym, len, "%s%s", "__start_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + start = (void **)symval.value; + + /* get address of last entry */ + snprintf(setsym, len, "%s%s", "__stop_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + stop = (void **)symval.value; + + /* and the number of entries */ + count = stop - start; + + /* and copy out */ + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + +out: + free(setsym, M_LINKER); + return error; +} + +static int +link_elf_each_function_name(linker_file_t file, + int (*callback)(const char *, void *), void *opaque) { + elf_file_t ef = (elf_file_t)file; + const Elf_Sym* symp; + int i, error; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + if (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC) { + error = callback(ef->ddbstrtab + symp->st_name, opaque); + if (error) + return (error); + } + } + return (0); +} + +#ifdef __ia64__ +/* + * Each KLD has its own GP. The GP value for each load module is given by + * DT_PLTGOT on ia64. We need GP to construct function descriptors, but + * don't have direct access to the ELF file structure. The link_elf_get_gp() + * function returns the GP given a pointer to a generic linker file struct. + */ +Elf_Addr +link_elf_get_gp(linker_file_t lf) +{ + elf_file_t ef = (elf_file_t)lf; + return (Elf_Addr)ef->got; +} +#endif + +/* + * Symbol lookup function that can be used when the symbol index is known (ie + * in relocations). It uses the symbol index instead of doing a fully fledged + * hash table based lookup when such is valid. For example for local symbols. + * This is not only more efficient, it's also more correct. It's not always + * the case that the symbol can be found through the hash table. + */ +Elf_Addr +elf_lookup(linker_file_t lf, Elf_Word symidx, int deps) +{ + elf_file_t ef = (elf_file_t)lf; + const Elf_Sym *sym; + const char *symbol; + + /* Don't even try to lookup the symbol if the index is bogus. */ + if (symidx >= ef->nchains) + return (0); + + sym = ef->symtab + symidx; + + /* + * Don't do a full lookup when the symbol is local. It may even + * fail because it may not be found through the hash table. + */ + if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { + /* Force lookup failure when we have an insanity. */ + if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) + return (0); + return ((Elf_Addr)ef->address + sym->st_value); + } + + /* + * XXX we can avoid doing a hash table based lookup for global + * symbols as well. This however is not always valid, so we'll + * just do it the hard way for now. Performance tweaks can + * always be added. + */ + + symbol = ef->strtab + sym->st_name; + + /* Force a lookup failure if the symbol name is bogus. 
*/ + if (*symbol == 0) + return (0); + + return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); +} diff --git a/sys/kern/linker_if.m b/sys/kern/linker_if.m new file mode 100644 index 0000000..9dafb57 --- /dev/null +++ b/sys/kern/linker_if.m @@ -0,0 +1,107 @@ +# +# Copyright (c) 2000 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/linker.h> + +INTERFACE linker; + +# +# Lookup a symbol in the file's symbol table. If the symbol is not +# found then return ENOENT, otherwise zero. +# +METHOD int lookup_symbol { + linker_file_t file; + const char* name; + c_linker_sym_t* symp; +}; + +METHOD int symbol_values { + linker_file_t file; + c_linker_sym_t sym; + linker_symval_t* valp; +}; + +METHOD int search_symbol { + linker_file_t file; + caddr_t value; + c_linker_sym_t* symp; + long* diffp; +}; + +# +# Call the callback with each specified function defined in the file. +# Stop and return the error if the callback returns an error. +# +METHOD int each_function_name { + linker_file_t file; + linker_function_name_callback_t callback; + void* opaque; +}; + +# +# Search for a linker set in a file. Return a pointer to the first +# entry (which is itself a pointer), and the number of entries. +# "stop" points to the entry beyond the last valid entry. +# If count, start or stop are NULL, they are not returned. +# +METHOD int lookup_set { + linker_file_t file; + const char* name; + void*** start; + void*** stop; + int* count; +}; + +# +# Unload a file, releasing dependancies and freeing storage. +# +METHOD void unload { + linker_file_t file; +}; + +# +# Load a file, returning the new linker_file_t in *result. If +# the class does not recognise the file type, zero should be +# returned, without modifying *result. If the file is +# recognised, the file should be loaded, *result set to the new +# file and zero returned. If some other error is detected an +# appropriate errno should be returned. 
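+# The link_preload/link_preload_finish pair below handles the case where the
+# boot loader has already placed the module image in memory: link_preload
+# claims the preloaded image and parses its dynamic section, and
+# link_preload_finish then completes relocation in a second pass.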
+# +STATICMETHOD int load_file { + linker_class_t cls; + const char* filename; + linker_file_t* result; +}; +STATICMETHOD int link_preload { + linker_class_t cls; + const char* filename; + linker_file_t* result; +}; +STATICMETHOD int link_preload_finish { + linker_file_t file; +}; diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 0000000..f4a0212 --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,446 @@ +#! /bin/sh - +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $FreeBSD$ + +set -e + +# name of compat option: +compat=COMPAT_43 + +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +sysmk="../sys/syscall.mk" +syssw="init_sysent.c" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" + +# tmp files: +sysdcl="sysent.dcl.$$" +syscompat="sysent.compat.$$" +syscompatdcl="sysent.compatdcl.$$" +sysent="sysent.switch.$$" +sysinc="sysinc.switch.$$" +sysarg="sysarg.switch.$$" + +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 + +touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg + +case $# in + 0) echo "usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac + +if [ -n "$2" -a -f "$2" ]; then + . $2 +fi + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + syssw = \"$syssw\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + sysmk = \"$sysmk\" + compat = \"$compat\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > syssw + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw + printf " * $%s$\n", "FreeBSD" > syssw + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + printf " * $%s$\n", "FreeBSD" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + printf " * $%s$\n", "FreeBSD" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf " * $%s$\n", "FreeBSD" > syshdr + printf "# FreeBSD system call names.\n" > sysmk + printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk + printf "# $%s$\n", "FreeBSD" > sysmk + } + NR == 1 { + gsub("[$]FreeBSD: ", "", $0) + gsub(" [$]", "", $0) + + printf " * created from%s\n */\n\n", $0 > syssw + + printf "\n/* The casts are bogus but will do for now. */\n" > sysent + printf "struct sysent %s[] = {\n",switchname > sysent + + printf " * created from%s\n */\n\n", $0 > sysarg + printf "#ifndef %s\n", sysproto_h > sysarg + printf "#define\t%s\n\n", sysproto_h > sysarg + printf "#include <sys/signal.h>\n\n" > sysarg + printf "#include <sys/acl.h>\n\n" > sysarg + printf "struct proc;\n\n" > sysarg + printf "struct thread;\n\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? 
\\\n" > sysarg + printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg + printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg + printf "#define\tPADL_(t)\t0\n" > sysarg + printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg + printf "#else\n" > sysarg + printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg + printf "#define\tPADR_(t)\t0\n" > sysarg + printf "#endif\n\n" > sysarg + + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames + + printf " * created from%s\n */\n\n", $0 > syshdr + + printf "# created from%s\nMIASM = ", $0 > sysmk + + next + } + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function align_sysent_comment(column) { + printf("\t") > sysent + column = column + 8 - column % 8 + while (column < 56) { + printf("\t") > sysent + column = column + 8 + } + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # toss number and type + argc= 0; + argssize = "0" + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + argssize = "AS(" $6 ")" + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ + + if ($f != "(") + parserr($f, ")") + f++ + + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } + + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + argname[argc]=$f; + f += 2; # skip name, and any comma + } + if (argc != 0) + argssize = "AS(" argalias ")" + } + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + + # The 'M' type prefix + # + { + mpsafe = "SYF_MPSAFE | "; + if ($2 == "MSTD") { + $2 = "STD"; + } else if ($2 == "MNODEF") { + $2 = "NODEF"; + } else if ($2 == "MNOARGS") { + $2 = "NOARGS"; + } else if ($2 == "MNOPROTO") { + $2 = "NOPROTO"; + } else if ($2 == "MNOIMPL") { + $2 = "NOIMPL"; + } else if ($2 == "MNOSTD") { + $2 = "NOSTD"; + } else if ($2 == "MCOMPAT") { + $2 = "COMPAT"; + } else if ($2 == "MCPT_NOA") { + $2 = "CPT_NOA"; + } else if ($2 == "MLIBCOMPAT") { + $2 = "LIBCOMPAT"; + } else if ($2 == "MOBSOL") { + $2 = "OBSOL"; + } else if ($2 == "MUNIMPL") { + $2 = "UNIMPL"; + } else { + mpsafe = ""; + } + } + $2 == "STD" || $2 == "NODEF" || $2 == 
"NOARGS" || $2 == "NOPROTO" \ + || $2 == "NOIMPL" || $2 == "NOSTD" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys") && (funcname != "lkmressys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct %s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\tchar %s_l_[PADL_(%s)]; " \ + "%s %s; char %s_r_[PADR_(%s)];\n", + argname[i], argtype[i], + argtype[i], argname[i], + argname[i], argtype[i]) > sysarg + printf("};\n") > sysarg + } + else if ($2 != "NOARGS" && $2 != "NOPROTO" && \ + $2 != "NODEF") + printf("struct %s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + } + if (($2 != "NOPROTO" && $2 != "NODEF" && \ + (funcname != "nosys" || !nosys)) || \ + (funcname == "lkmnosys" && !lkmnosys) || \ + funcname == "lkmressys") { + printf("%s\t%s(struct thread *, struct %s *)", + rettype, funcname, argalias) > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + printf("\t{ %s%s, (sy_call_t *)", mpsafe, argssize) > sysent + column = 8 + 2 + length(mpsafe) + length(argssize) + 15 + if ($2 == "NOIMPL") { + printf("%s },", "nosys") > sysent + column = column + length("nosys") + 3 + } else if ($2 == "NOSTD") { + printf("%s },", "lkmressys") > sysent + column = column + length("lkmressys") + 3 + } else { + printf("%s },", funcname) > sysent + column = column + length(funcname) + 3 + } + align_sysent_comment(column) + printf("/* %d = %s */\n", syscall, funcalias) > sysent + printf("\t\"%s\",\t\t\t/* %d = %s */\n", + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") { + printf("#define\t%s%s\t%d\n", syscallprefix, + funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + } + syscall++ + next + } + $2 == "COMPAT" || $2 == "CPT_NOA" { + ncompat++ + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct %s {\n", argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \ + "char %s_r_[PADR_(%s)];\n", + argname[i], argtype[i], + argtype[i], argname[i], + argname[i], argtype[i]) > syscompat + printf("};\n") > syscompat + } + else if($2 != "CPT_NOA") + printf("struct %s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + printf("%s\to%s(struct thread *, struct %s *);\n", + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%s%s,%s) },", + mpsafe, argssize, funcname) > sysent + align_sysent_comment(8 + 9 + length(mpsafe) + \ + length(argssize) + 1 + length(funcname) + 4) + printf("/* %d = old %s */\n", syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", + syscall, funcalias) > syshdr + syscall++ + next + } + $2 == "LIBCOMPAT" { + ncompat++ + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%s%s,%s) },", + mpsafe, argssize, funcname) > sysent + align_sysent_comment(8 + 9 + length(mpsafe) + \ + length(argssize) + 1 + length(funcname) + 4) + printf("/* %d = old %s */\n", syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", + syscallprefix, funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },") > sysent + align_sysent_comment(34) + printf("/* %d = obsolete %s */\n", syscall, comment) > 
sysent + printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", + syscall, comment) > syshdr + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", + syscall, syscall, comment) > sysnames + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc + if (ncompat != 0) { + printf "#include \"opt_compat.h\"\n\n" > syssw + printf "\n#ifdef %s\n", compat > sysinc + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc + printf "#else\n" > sysinc + printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc + printf "#endif\n" > sysinc + } + + printf("\n#endif /* %s */\n\n", compat) > syscompatdcl + printf("#undef PAD_\n") > syscompatdcl + printf("#undef PADL_\n") > syscompatdcl + printf("#undef PADR_\n") > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("\n") > sysmk + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >> $syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c new file mode 100644 index 0000000..e3a0bfa --- /dev/null +++ b/sys/kern/md4c.c @@ -0,0 +1,285 @@ +/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm + * $FreeBSD$ + */ + +/* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved. + + License to copy and use this software is granted provided that it + is identified as the "RSA Data Security, Inc. MD4 Message-Digest + Algorithm" in all material mentioning or referencing this software + or this function. + + License is also granted to make and use derivative works provided + that such works are identified as "derived from the RSA Data + Security, Inc. MD4 Message-Digest Algorithm" in all material + mentioning or referencing the derived work. + + RSA Data Security, Inc. makes no representations concerning either + the merchantability of this software or the suitability of this + software for any particular purpose. It is provided "as is" + without express or implied warranty of any kind. + + These notices must be retained in any copies of any part of this + documentation and/or software. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/md4.h> + +typedef unsigned char *POINTER; +typedef u_int16_t UINT2; +typedef u_int32_t UINT4; + +#define PROTO_LIST(list) list + +/* Constants for MD4Transform routine. + */ +#define S11 3 +#define S12 7 +#define S13 11 +#define S14 19 +#define S21 3 +#define S22 5 +#define S23 9 +#define S24 13 +#define S31 3 +#define S32 9 +#define S33 11 +#define S34 15 + +static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); +static void Encode PROTO_LIST + ((unsigned char *, UINT4 *, unsigned int)); +static void Decode PROTO_LIST + ((UINT4 *, const unsigned char *, unsigned int)); + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G and H are basic MD4 functions. 
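+ * F is a bitwise conditional (y where x is 1, z where x is 0), G is the
+ * majority function, and H is parity (three-way XOR).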
+ */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG and HH are transformations for rounds 1, 2 and 3 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s) { \ + (a) += F ((b), (c), (d)) + (x); \ + (a) = ROTATE_LEFT ((a), (s)); \ + } +#define GG(a, b, c, d, x, s) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \ + (a) = ROTATE_LEFT ((a), (s)); \ + } +#define HH(a, b, c, d, x, s) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \ + (a) = ROTATE_LEFT ((a), (s)); \ + } + +/* MD4 initialization. Begins an MD4 operation, writing a new context. + */ +void MD4Init (context) +MD4_CTX *context; /* context */ +{ + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. + */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* MD4 block update operation. Continues an MD4 message-digest + operation, processing another message block, and updating the + context. + */ +void MD4Update (context, input, inputLen) +MD4_CTX *context; /* context */ +const unsigned char *input; /* input block */ +unsigned int inputLen; /* length of input block */ +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + /* Update number of bits */ + if ((context->count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context->count[1]++; + context->count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + /* Transform as many times as possible. + */ + if (inputLen >= partLen) { + bcopy(input, &context->buffer[index], partLen); + MD4Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD4Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + bcopy(&input[i], &context->buffer[index], inputLen-i); +} + +/* MD4 padding. */ +void MD4Pad (context) +MD4_CTX *context; /* context */ +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. + */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD4Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD4Update (context, bits, 8); +} + +/* MD4 finalization. Ends an MD4 message-digest operation, writing the + the message digest and zeroizing the context. + */ +void MD4Final (digest, context) +unsigned char digest[16]; /* message digest */ +MD4_CTX *context; /* context */ +{ + /* Do padding */ + MD4Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. + */ + bzero((POINTER)context, sizeof (*context)); +} + +/* MD4 basic transformation. Transforms state based on block. 
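A minimal caller of the MD4Init/MD4Update/MD4Final interface implemented above might look like the following sketch; it assumes the userland libmd-style <md4.h> declarations rather than the kernel <sys/md4.h>, and linking with -lmd:

#include <stdio.h>
#include <string.h>
#include <md4.h>	/* assumed libmd header declaring MD4_CTX and friends */

int
main(void)
{
	MD4_CTX ctx;
	unsigned char digest[16];
	const char *msg = "abc";
	int i;

	MD4Init(&ctx);				/* load the magic initial state */
	MD4Update(&ctx, (const unsigned char *)msg, strlen(msg));
	MD4Final(digest, &ctx);			/* pads, appends length, zeroizes ctx */

	for (i = 0; i < 16; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}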
+ */ +static void MD4Transform (state, block) +UINT4 state[4]; +const unsigned char block[64]; +{ + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11); /* 1 */ + FF (d, a, b, c, x[ 1], S12); /* 2 */ + FF (c, d, a, b, x[ 2], S13); /* 3 */ + FF (b, c, d, a, x[ 3], S14); /* 4 */ + FF (a, b, c, d, x[ 4], S11); /* 5 */ + FF (d, a, b, c, x[ 5], S12); /* 6 */ + FF (c, d, a, b, x[ 6], S13); /* 7 */ + FF (b, c, d, a, x[ 7], S14); /* 8 */ + FF (a, b, c, d, x[ 8], S11); /* 9 */ + FF (d, a, b, c, x[ 9], S12); /* 10 */ + FF (c, d, a, b, x[10], S13); /* 11 */ + FF (b, c, d, a, x[11], S14); /* 12 */ + FF (a, b, c, d, x[12], S11); /* 13 */ + FF (d, a, b, c, x[13], S12); /* 14 */ + FF (c, d, a, b, x[14], S13); /* 15 */ + FF (b, c, d, a, x[15], S14); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 0], S21); /* 17 */ + GG (d, a, b, c, x[ 4], S22); /* 18 */ + GG (c, d, a, b, x[ 8], S23); /* 19 */ + GG (b, c, d, a, x[12], S24); /* 20 */ + GG (a, b, c, d, x[ 1], S21); /* 21 */ + GG (d, a, b, c, x[ 5], S22); /* 22 */ + GG (c, d, a, b, x[ 9], S23); /* 23 */ + GG (b, c, d, a, x[13], S24); /* 24 */ + GG (a, b, c, d, x[ 2], S21); /* 25 */ + GG (d, a, b, c, x[ 6], S22); /* 26 */ + GG (c, d, a, b, x[10], S23); /* 27 */ + GG (b, c, d, a, x[14], S24); /* 28 */ + GG (a, b, c, d, x[ 3], S21); /* 29 */ + GG (d, a, b, c, x[ 7], S22); /* 30 */ + GG (c, d, a, b, x[11], S23); /* 31 */ + GG (b, c, d, a, x[15], S24); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 0], S31); /* 33 */ + HH (d, a, b, c, x[ 8], S32); /* 34 */ + HH (c, d, a, b, x[ 4], S33); /* 35 */ + HH (b, c, d, a, x[12], S34); /* 36 */ + HH (a, b, c, d, x[ 2], S31); /* 37 */ + HH (d, a, b, c, x[10], S32); /* 38 */ + HH (c, d, a, b, x[ 6], S33); /* 39 */ + HH (b, c, d, a, x[14], S34); /* 40 */ + HH (a, b, c, d, x[ 1], S31); /* 41 */ + HH (d, a, b, c, x[ 9], S32); /* 42 */ + HH (c, d, a, b, x[ 5], S33); /* 43 */ + HH (b, c, d, a, x[13], S34); /* 44 */ + HH (a, b, c, d, x[ 3], S31); /* 45 */ + HH (d, a, b, c, x[11], S32); /* 46 */ + HH (c, d, a, b, x[ 7], S33); /* 47 */ + HH (b, c, d, a, x[15], S34); /* 48 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. + */ + bzero((POINTER)x, sizeof (x)); +} + +/* Encodes input (UINT4) into output (unsigned char). Assumes len is + a multiple of 4. + */ +static void Encode (output, input, len) +unsigned char *output; +UINT4 *input; +unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* Decodes input (unsigned char) into output (UINT4). Assumes len is + a multiple of 4. + */ +static void Decode (output, input, len) + +UINT4 *output; +const unsigned char *input; +unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..72c970b --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,339 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. 
+ * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +/* + * This file should be kept in sync with src/lib/libmd/md5c.c + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <machine/endian.h> +#include <sys/endian.h> +#include <sys/md5.h> + +static void MD5Transform(u_int32_t [4], const unsigned char [64]); + +#ifdef _KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define Encode memcpy +#define Decode memcpy +#else + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +static void +Encode (unsigned char *output, u_int32_t *input, unsigned int len) +{ + unsigned int i; + u_int32_t *op = (u_int32_t *)output; + + for (i = 0; i < len / 4; i++) + op[i] = htole32(input[i]); +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (u_int32_t *output, const unsigned char *input, unsigned int len) +{ + unsigned int i; + const u_int32_t *ip = (const u_int32_t *)input; + + for (i = 0; i < len / 4; i++) + output[i] = le32toh(ip[i]); +} +#endif + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. 
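Whichever branch of the #if above is compiled, the effect is the same: MD5, like MD4 before it, consumes its input as little-endian 32-bit words, so little-endian hosts can alias Encode/Decode to memcpy while big-endian hosts byte-swap through htole32/le32toh. A small self-contained illustration of the word layout:

#include <stdio.h>
#include <stdint.h>

/* Portable equivalent of Decode() for a single word: the byte stream is
 * interpreted least-significant byte first on every host. */
static uint32_t
decode_one(const unsigned char b[4])
{
	return ((uint32_t)b[0]) | ((uint32_t)b[1] << 8) |
	    ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int
main(void)
{
	const unsigned char b[4] = { 0x01, 0x02, 0x03, 0x04 };

	printf("0x%08x\n", decode_one(b));	/* prints 0x04030201 */
	return (0);
}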
Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. + */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +void +MD5Pad (context) + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + /* Do padding. */ + MD5Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
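The padLen arithmetic in MD5Pad (and MD4Pad earlier) brings the buffered length up to 56 mod 64, so that the 8-byte bit count appended afterwards completes a 64-byte block, and it always inserts at least one byte of padding. A quick standalone check of both branches:

#include <stdio.h>

/* Verify that index + padLen is always congruent to 56 (mod 64) and that
 * padLen stays within [1, 64], for every possible buffer offset. */
int
main(void)
{
	unsigned int index, padLen;

	for (index = 0; index < 64; index++) {
		padLen = (index < 56) ? (56 - index) : (120 - index);
		if ((index + padLen) % 64 != 56 || padLen < 1 || padLen > 64)
			printf("broken at index %u\n", index);
	}
	printf("padLen stays in [1,64] and always lands on 56 mod 64\n");
	return (0);
}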
*/ + +static void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, 
x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. */ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c new file mode 100644 index 0000000..9e6fdca --- /dev/null +++ b/sys/kern/p1003_1b.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 1996, 1997, 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* p1003_1b: Real Time common code. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> + +#include <posix4/posix4.h> + +MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); + +/* The system calls return ENOSYS if an entry is called that is + * not run-time supported. I am also logging since some programs + * start to use this when they shouldn't. That will be removed if annoying. + */ +int +syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap) +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + td->td_proc->p_comm, td->td_proc->p_pid, s); + + /* a " return nosys(p, uap); " here causes a core dump. 
+ */ + + return ENOSYS; +} + +#if !defined(_KPOSIX_PRIORITY_SCHEDULING) + +/* Not configured but loadable via a module: + */ + +static int sched_attach(void) +{ + return 0; +} + +SYSCALL_NOT_PRESENT_GEN(sched_setparam) +SYSCALL_NOT_PRESENT_GEN(sched_getparam) +SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_yield) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) +SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) + +#else + +/* Configured in kernel version: + */ +static struct ksched *ksched; + +static int sched_attach(void) +{ + int ret = ksched_attach(&ksched); + + if (ret == 0) + p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); + + return ret; +} + +/* + * MPSAFE + */ +int sched_setparam(struct thread *td, + struct sched_setparam_args *uap) +{ + struct thread *targettd; + struct proc *targetp; + int e; + struct sched_param sched_param; + + e = copyin(uap->param, &sched_param, sizeof(sched_param)); + if (e) + return (e); + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansched(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_setparam(&td->td_retval[0], ksched, targettd, + (const struct sched_param *)&sched_param); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_getparam(struct thread *td, + struct sched_getparam_args *uap) +{ + int e; + struct sched_param sched_param; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + PROC_UNLOCK(targetp); + if (e) + goto done2; + + e = ksched_getparam(&td->td_retval[0], ksched, targettd, &sched_param); + if (e == 0) + e = copyout(&sched_param, uap->param, sizeof(sched_param)); +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_setscheduler(struct thread *td, + struct sched_setscheduler_args *uap) +{ + int e; + struct sched_param sched_param; + struct thread *targettd; + struct proc *targetp; + + e = copyin(uap->param, &sched_param, sizeof(sched_param)); + if (e) + return (e); + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansched(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_setscheduler(&td->td_retval[0], ksched, targettd, + uap->policy, (const struct sched_param *)&sched_param); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_getscheduler(struct thread *td, + struct sched_getscheduler_args *uap) +{ + int e; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + 
PROC_UNLOCK(targetp); + if (e == 0) + e = ksched_getscheduler(&td->td_retval[0], ksched, targettd); + +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_yield(struct thread *td, + struct sched_yield_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_yield(&td->td_retval[0], ksched); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_get_priority_max(struct thread *td, + struct sched_get_priority_max_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_get_priority_max(&td->td_retval[0], ksched, uap->policy); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_get_priority_min(struct thread *td, + struct sched_get_priority_min_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_get_priority_min(&td->td_retval[0], ksched, uap->policy); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_rr_get_interval(struct thread *td, + struct sched_rr_get_interval_args *uap) +{ + int e; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targettd = td; + targetp = td->td_proc; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_rr_get_interval(&td->td_retval[0], ksched, targettd, + uap->interval); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +#endif + +static void p31binit(void *notused) +{ + (void) sched_attach(); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c new file mode 100644 index 0000000..09af27d --- /dev/null +++ b/sys/kern/posix4_mib.c @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
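These handlers sit behind the standard POSIX <sched.h> interface. A rough userland sketch of how they are reached; SCHED_FIFO support depends on _KPOSIX_PRIORITY_SCHEDULING being configured or the module being loaded, otherwise the calls fail with ENOSYS as logged by syscall_not_present above:

#include <sched.h>
#include <stdio.h>

int
main(void)
{
	struct sched_param sp;
	int maxpri;

	maxpri = sched_get_priority_max(SCHED_FIFO);
	if (maxpri == -1) {
		perror("sched_get_priority_max");	/* ENOSYS if not configured */
		return (1);
	}
	sp.sched_priority = maxpri;
	/* pid 0 means "the calling process", matching the uap->pid == 0 case above. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");
	return (0);
}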
IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <posix4/posix4.h> + +static int facility[CTL_P1003_1B_MAXID - 1]; + +/* OID_AUTO isn't working with sysconf(3). I guess I'd have to + * modify it to do a lookup by name from the index. + * For now I've left it a top-level sysctl. + */ + +#if 1 + +SYSCTL_DECL(_p1003_1b); + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_p1003_1b, num, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); + +#else + +SYSCTL_DECL(_kern_p1003_1b); + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); +SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B"); + +#endif + +P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io); +P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range); +P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection); +P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing); +P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io); +P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling); +P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals); +P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores); +P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync); +P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects); +P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io); +P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers); +P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max); +P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max); +P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max); +P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize); +P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max); +P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max); +P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max); + +/* p31b_setcfg: Set the configuration + */ +void p31b_setcfg(int num, int value) +{ + if (num >= 1 && num < CTL_P1003_1B_MAXID) + facility[num - 1] = value; +} + +/* + * Turn on indications for standard (non-configurable) kernel features. + */ +static void +p31b_set_standard(void *dummy) +{ + /* ??? p31b_setcfg(CTL_P1003_1B_FSYNC, 1); */ + p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 1); + p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 1); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard, + 0); + diff --git a/sys/kern/subr_acl_posix1e.c b/sys/kern/subr_acl_posix1e.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/subr_acl_posix1e.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. 
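Userland can read the facility values through the sysctl interface. A sketch using sysctlbyname; the node name p1003_1b.pagesize is assumed from the top-level _p1003_1b tree declared above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pagesize;
	size_t len = sizeof(pagesize);

	/* "p1003_1b.pagesize" is assumed from the P1B_SYSCTL table above. */
	if (sysctlbyname("p1003_1b.pagesize", &pagesize, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("pagesize as reported by the kernel: %d\n", pagesize);
	return (0);
}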
+ * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. 
+ */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. 
+ */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. + */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. 
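To make the ACL_MASK clipping concrete, consider a hypothetical access ACL on a file owned by uid 100, gid 200:

	user::rw-
	user:300:rw-
	group::r--
	mask::r--
	other::---

A VREAD request from uid 300 succeeds through the ACL_USER entry. A VWRITE request from uid 300 fails without privilege: the entry itself grants VWRITE, but dac_granted is ANDed with acl_mask_granted (VREAD only) before the acc_mode comparison, so the request falls through to the error label and returns EACCES. The owner (uid 100) is evaluated against ACL_USER_OBJ, which is never clipped by the mask and additionally carries VADMIN.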
+ */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. + */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. 
+ * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. + */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. 
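As a worked example of the mode/ACL conversion helpers above: acl_posix1e_mode_to_perm(ACL_USER_OBJ, 0640) yields ACL_READ | ACL_WRITE, the ACL_GROUP_OBJ tag on the same mode yields ACL_READ, and ACL_OTHER yields no permission bits; feeding those three entries back through acl_posix1e_perms_to_mode() reconstructs 0640. Those three entries are also exactly the minimum that acl_posix1e_check() accepts: one ACL_USER_OBJ, one ACL_GROUP_OBJ and one ACL_OTHER, with an ACL_MASK entry becoming mandatory as soon as any ACL_USER or ACL_GROUP entry is added.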
+ */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. + */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
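User programs do not normally issue these __acl_* system calls directly; they go through the POSIX.1e draft routines in libc, roughly as in the sketch below, where acl_get_file(3) is expected to wrap __acl_get_file underneath:

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
	acl_t acl;
	char *text;

	if (argc != 2) {
		fprintf(stderr, "usage: %s path\n", argv[0]);
		return (1);
	}
	/* Fetch the access ACL for the given path and print its text form. */
	acl = acl_get_file(argv[1], ACL_TYPE_ACCESS);
	if (acl == NULL) {
		perror("acl_get_file");
		return (1);
	}
	text = acl_to_text(acl, NULL);
	if (text != NULL) {
		printf("%s", text);
		acl_free(text);
	}
	acl_free(acl);
	return (0);
}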
+ * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c new file mode 100644 index 0000000..5132e02 --- /dev/null +++ b/sys/kern/subr_autoconf.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratories. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> + +/* + * Autoconfiguration subroutines. + */ + +/* + * "Interrupt driven config" functions. + */ +static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list = + TAILQ_HEAD_INITIALIZER(intr_config_hook_list); + + +/* ARGSUSED */ +static void run_interrupt_driven_config_hooks(void *dummy); +static void +run_interrupt_driven_config_hooks(dummy) + void *dummy; +{ + struct intr_config_hook *hook_entry, *next_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = next_entry) { + next_entry = TAILQ_NEXT(hook_entry, ich_links); + (*hook_entry->ich_func)(hook_entry->ich_arg); + } + + while (!TAILQ_EMPTY(&intr_config_hook_list)) { + tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0); + } +} +SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, + run_interrupt_driven_config_hooks, NULL) + +/* + * Register a hook that will be called after "cold" + * autoconfiguration is complete and interrupts can + * be used to complete initialization. + */ +int +config_intrhook_establish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = TAILQ_NEXT(hook_entry, ich_links)) + if (hook_entry == hook) + break; + if (hook_entry != NULL) { + printf("config_intrhook_establish: establishing an " + "already established hook.\n"); + return (1); + } + TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links); + if (cold == 0) + /* XXX Sufficient for modules loaded after initial config??? */ + run_interrupt_driven_config_hooks(NULL); + return (0); +} + +void +config_intrhook_disestablish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = TAILQ_NEXT(hook_entry, ich_links)) + if (hook_entry == hook) + break; + if (hook_entry == NULL) + panic("config_intrhook_disestablish: disestablishing an " + "unestablished hook"); + + TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links); + /* Wakeup anyone watching the list */ + wakeup(&intr_config_hook_list); +} diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c new file mode 100644 index 0000000..eeeb7d9 --- /dev/null +++ b/sys/kern/subr_blist.c @@ -0,0 +1,929 @@ + +/* + * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting + * + * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution + * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT. 
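A driver would typically use the establish/disestablish pair above along the following lines; the mydev_* names are hypothetical and only sketch the intended calling pattern of deferring the tail of attach until interrupts work:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>

static void mydev_finish_attach(void *arg);
static struct intr_config_hook mydev_hook;

static int
mydev_attach(void *sc)
{
	mydev_hook.ich_func = mydev_finish_attach;
	mydev_hook.ich_arg = sc;
	/* Defer the interrupt-driven part of attach until after cold boot. */
	if (config_intrhook_establish(&mydev_hook) != 0)
		return (ENXIO);
	return (0);
}

static void
mydev_finish_attach(void *arg)
{
	/* ... interrupt-driven probing of the hardware using arg ... */
	config_intrhook_disestablish(&mydev_hook);	/* lets boot proceed */
}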
+ * + * This module implements a general bitmap allocator/deallocator. The + * allocator eats around 2 bits per 'block'. The module does not + * try to interpret the meaning of a 'block' other then to return + * SWAPBLK_NONE on an allocation failure. + * + * A radix tree is used to maintain the bitmap. Two radix constants are + * involved: One for the bitmaps contained in the leaf nodes (typically + * 32), and one for the meta nodes (typically 16). Both meta and leaf + * nodes have a hint field. This field gives us a hint as to the largest + * free contiguous range of blocks under the node. It may contain a + * value that is too high, but will never contain a value that is too + * low. When the radix tree is searched, allocation failures in subtrees + * update the hint. + * + * The radix tree also implements two collapsed states for meta nodes: + * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is + * in either of these two states, all information contained underneath + * the node is considered stale. These states are used to optimize + * allocation and freeing operations. + * + * The hinting greatly increases code efficiency for allocations while + * the general radix structure optimizes both allocations and frees. The + * radix tree should be able to operate well no matter how much + * fragmentation there is and no matter how large a bitmap is used. + * + * Unlike the rlist code, the blist code wires all necessary memory at + * creation time. Neither allocations nor frees require interaction with + * the memory subsystem. In contrast, the rlist code may allocate memory + * on an rlist_free() call. The non-blocking features of the blist code + * are used to great advantage in the swap code (vm/nswap_pager.c). The + * rlist code uses a little less overall memory then the blist code (but + * due to swap interleaving not all that much less), but the blist code + * scales much, much better. + * + * LAYOUT: The radix tree is layed out recursively using a + * linear array. Each meta node is immediately followed (layed out + * sequentially in memory) by BLIST_META_RADIX lower level nodes. This + * is a recursive structure but one that can be easily scanned through + * a very simple 'skip' calculation. In order to support large radixes, + * portions of the tree may reside outside our memory allocation. We + * handle this with an early-termination optimization (when bighint is + * set to -1) on the scan. The memory allocation is only large enough + * to cover the number of blocks requested at creation time even if it + * must be encompassed in larger root-node radix. + * + * NOTE: the allocator cannot currently allocate more then + * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too + * large' if you try. This is an area that could use improvement. The + * radix is large enough that this restriction does not effect the swap + * system, though. Currently only the allocation code is effected by + * this algorithmic unfeature. The freeing code can handle arbitrary + * ranges. + * + * This code can be compiled stand-alone for debugging. 
+ * + * $FreeBSD$ + */ + +#ifdef _KERNEL + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/blist.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/mutex.h> +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#else + +#ifndef BLIST_NO_DEBUG +#define BLIST_DEBUG +#endif + +#define SWAPBLK_NONE ((daddr_t)-1) + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> + +#define malloc(a,b,c) malloc(a) +#define free(a,b) free(a) + +typedef unsigned int u_daddr_t; + +#include <sys/blist.h> + +void panic(const char *ctl, ...); + +#endif + +/* + * static support functions + */ + +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, + daddr_t count, daddr_t radix, int skip); +static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); +static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, + daddr_t radix, int skip, daddr_t blk); +static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, blist_t dest, daddr_t count); +static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, + int skip, daddr_t count); +#ifndef _KERNEL +static void blst_radix_print(blmeta_t *scan, daddr_t blk, + daddr_t radix, int skip, int tab); +#endif + +#ifdef _KERNEL +static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); +#endif + +/* + * blist_create() - create a blist capable of handling up to the specified + * number of blocks + * + * blocks must be greater then 0 + * + * The smallest blist consists of a single leaf node capable of + * managing BLIST_BMAP_RADIX blocks. + */ + +blist_t +blist_create(daddr_t blocks) +{ + blist_t bl; + int radix; + int skip = 0; + + /* + * Calculate radix and skip field used for scanning. + */ + radix = BLIST_BMAP_RADIX; + + while (radix < blocks) { + radix <<= BLIST_META_RADIX_SHIFT; + skip = (skip + 1) << BLIST_META_RADIX_SHIFT; + } + + bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO); + + bl->bl_blocks = blocks; + bl->bl_radix = radix; + bl->bl_skip = skip; + bl->bl_rootblks = 1 + + blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); + bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK); + +#if defined(BLIST_DEBUG) + printf( + "BLIST representing %d blocks (%d MB of swap)" + ", requiring %dK of ram\n", + bl->bl_blocks, + bl->bl_blocks * 4 / 1024, + (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + ); + printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks); +#endif + blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); + + return(bl); +} + +void +blist_destroy(blist_t bl) +{ + free(bl->bl_root, M_SWAP); + free(bl, M_SWAP); +} + +/* + * blist_alloc() - reserve space in the block bitmap. Return the base + * of a contiguous region or SWAPBLK_NONE if space could + * not be allocated. + */ + +daddr_t +blist_alloc(blist_t bl, daddr_t count) +{ + daddr_t blk = SWAPBLK_NONE; + + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blk = blst_leaf_alloc(bl->bl_root, 0, count); + else + blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); + if (blk != SWAPBLK_NONE) + bl->bl_free -= count; + } + return(blk); +} + +/* + * blist_free() - free up space in the block bitmap. Return the base + * of a contiguous region. Panic if an inconsistancy is + * found. 
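+ *
+ *			A minimal usage sketch (swapblk and npages are
+ *			hypothetical): callers free exactly the range they
+ *			previously reserved with blist_alloc(), e.g.
+ *
+ *				swapblk = blist_alloc(bl, npages);
+ *				...
+ *				blist_free(bl, swapblk, npages);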
+ */ + +void +blist_free(blist_t bl, daddr_t blkno, daddr_t count) +{ + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blst_leaf_free(bl->bl_root, blkno, count); + else + blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); + bl->bl_free += count; + } +} + +/* + * blist_resize() - resize an existing radix tree to handle the + * specified number of blocks. This will reallocate + * the tree and transfer the previous bitmap to the new + * one. When extending the tree you can specify whether + * the new blocks are to left allocated or freed. + */ + +void +blist_resize(blist_t *pbl, daddr_t count, int freenew) +{ + blist_t newbl = blist_create(count); + blist_t save = *pbl; + + *pbl = newbl; + if (count > save->bl_blocks) + count = save->bl_blocks; + blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); + + /* + * If resizing upwards, should we free the new space or not? + */ + if (freenew && count < newbl->bl_blocks) { + blist_free(newbl, count, newbl->bl_blocks - count); + } + blist_destroy(save); +} + +#ifdef BLIST_DEBUG + +/* + * blist_print() - dump radix tree + */ + +void +blist_print(blist_t bl) +{ + printf("BLIST {\n"); + blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); + printf("}\n"); +} + +#endif + +/************************************************************************ + * ALLOCATION SUPPORT FUNCTIONS * + ************************************************************************ + * + * These support functions do all the actual work. They may seem + * rather longish, but that's because I've commented them up. The + * actual code is straight forward. + * + */ + +/* + * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). + * + * This is the core of the allocator and is optimized for the 1 block + * and the BLIST_BMAP_RADIX block allocation cases. Other cases are + * somewhat slower. The 1 block allocation case is log2 and extremely + * quick. + */ + +static daddr_t +blst_leaf_alloc( + blmeta_t *scan, + daddr_t blk, + int count +) { + u_daddr_t orig = scan->u.bmu_bitmap; + + if (orig == 0) { + /* + * Optimize bitmap all-allocated case. Also, count = 1 + * case assumes at least 1 bit is free in the bitmap, so + * we have to take care of this case here. + */ + scan->bm_bighint = 0; + return(SWAPBLK_NONE); + } + if (count == 1) { + /* + * Optimized code to allocate one bit out of the bitmap + */ + u_daddr_t mask; + int j = BLIST_BMAP_RADIX/2; + int r = 0; + + mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); + + while (j) { + if ((orig & mask) == 0) { + r += j; + orig >>= j; + } + j >>= 1; + mask >>= j; + } + scan->u.bmu_bitmap &= ~(1 << r); + return(blk + r); + } + if (count <= BLIST_BMAP_RADIX) { + /* + * non-optimized code to allocate N bits out of the bitmap. + * The more bits, the faster the code runs. It will run + * the slowest allocating 2 bits, but since there aren't any + * memory ops in the core loop (or shouldn't be, anyway), + * you probably won't notice the difference. + */ + int j; + int n = BLIST_BMAP_RADIX - count; + u_daddr_t mask; + + mask = (u_daddr_t)-1 >> n; + + for (j = 0; j <= n; ++j) { + if ((orig & mask) == mask) { + scan->u.bmu_bitmap &= ~mask; + return(blk + j); + } + mask = (mask << 1); + } + } + /* + * We couldn't allocate count in this subtree, update bighint. + */ + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * blist_meta_alloc() - allocate at a meta in the radix tree. + * + * Attempt to allocate at a meta node. 
If we can't, we update + * bighint and return a failure. Updating bighint optimize future + * calls that hit this node. We have to check for our collapse cases + * and we have a few optimizations strewn in as well. + */ + +static daddr_t +blst_meta_alloc( + blmeta_t *scan, + daddr_t blk, + daddr_t count, + daddr_t radix, + int skip +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + scan->bm_bighint = count; + return(SWAPBLK_NONE); + } + + if (scan->u.bmu_avail == radix) { + radix >>= BLIST_META_RADIX_SHIFT; + + /* + * ALL-FREE special case, initialize uninitialize + * sublevel. + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (u_daddr_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = radix; + scan[i].u.bmu_avail = radix; + } + } + } else { + radix >>= BLIST_META_RADIX_SHIFT; + } + + for (i = 1; i <= skip; i += next_skip) { + if (count <= scan[i].bm_bighint) { + /* + * count fits in object + */ + daddr_t r; + if (next_skip == 1) { + r = blst_leaf_alloc(&scan[i], blk, count); + } else { + r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); + } + if (r != SWAPBLK_NONE) { + scan->u.bmu_avail -= count; + if (scan->bm_bighint > scan->u.bmu_avail) + scan->bm_bighint = scan->u.bmu_avail; + return(r); + } + } else if (scan[i].bm_bighint == (daddr_t)-1) { + /* + * Terminator + */ + break; + } else if (count > radix) { + /* + * count does not fit in object even if it were + * complete free. + */ + panic("blist_meta_alloc: allocation too large"); + } + blk += radix; + } + + /* + * We couldn't allocate count in this subtree, update bighint. + */ + if (scan->bm_bighint >= count) + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * BLST_LEAF_FREE() - free allocated block from leaf bitmap + * + */ + +static void +blst_leaf_free( + blmeta_t *scan, + daddr_t blk, + int count +) { + /* + * free some data in this bitmap + * + * e.g. + * 0000111111111110000 + * \_________/\__/ + * v n + */ + int n = blk & (BLIST_BMAP_RADIX - 1); + u_daddr_t mask; + + mask = ((u_daddr_t)-1 << n) & + ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + if (scan->u.bmu_bitmap & mask) + panic("blst_radix_free: freeing free block"); + scan->u.bmu_bitmap |= mask; + + /* + * We could probably do a better job here. We are required to make + * bighint at least as large as the biggest contiguous block of + * data. If we just shoehorn it, a little extra overhead will + * be incured on the next allocation (but only that one typically). + */ + scan->bm_bighint = BLIST_BMAP_RADIX; +} + +/* + * BLST_META_FREE() - free allocated blocks from radix tree meta info + * + * This support routine frees a range of blocks from the bitmap. + * The range must be entirely enclosed by this radix node. If a + * meta node, we break the range down recursively to free blocks + * in subnodes (which means that this code can free an arbitrary + * range whereas the allocation code cannot allocate an arbitrary + * range). 
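+ *
+ *	For example (illustrative numbers, assuming the usual 32-block
+ *	leaves): freeing blocks 30-95 is broken into a partial free of
+ *	blocks 30-31 in the first leaf and full frees of the leaves
+ *	covering blocks 32-63 and 64-95.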
+ */ + +static void +blst_meta_free( + blmeta_t *scan, + daddr_t freeBlk, + daddr_t count, + daddr_t radix, + int skip, + daddr_t blk +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + +#if 0 + printf("FREE (%x,%d) FROM (%x,%d)\n", + freeBlk, count, + blk, radix + ); +#endif + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case, with possible + * shortcut to ALL-FREE special case. + */ + scan->u.bmu_avail = count; + scan->bm_bighint = count; + + if (count != radix) { + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + scan[i].bm_bighint = 0; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = 0; + } else { + scan[i].u.bmu_avail = 0; + } + } + /* fall through */ + } + } else { + scan->u.bmu_avail += count; + /* scan->bm_bighint = radix; */ + } + + /* + * ALL-FREE special case. + */ + + if (scan->u.bmu_avail == radix) + return; + if (scan->u.bmu_avail > radix) + panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld", + (long long)count, (long long)scan->u.bmu_avail, + (long long)radix); + + /* + * Break the free down into its components + */ + + radix >>= BLIST_META_RADIX_SHIFT; + + i = (freeBlk - blk) / radix; + blk += i * radix; + i = i * next_skip + 1; + + while (i <= skip && blk < freeBlk + count) { + daddr_t v; + + v = blk + radix - freeBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (daddr_t)-1) + panic("blst_meta_free: freeing unexpected range"); + + if (next_skip == 1) { + blst_leaf_free(&scan[i], freeBlk, v); + } else { + blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); + } + if (scan->bm_bighint < scan[i].bm_bighint) + scan->bm_bighint = scan[i].bm_bighint; + count -= v; + freeBlk += v; + blk += radix; + i += next_skip; + } +} + +/* + * BLIST_RADIX_COPY() - copy one radix tree to another + * + * Locates free space in the source tree and frees it in the destination + * tree. The space may not already be free in the destination. + */ + +static void blst_copy( + blmeta_t *scan, + daddr_t blk, + daddr_t radix, + daddr_t skip, + blist_t dest, + daddr_t count +) { + int next_skip; + int i; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + u_daddr_t v = scan->u.bmu_bitmap; + + if (v == (u_daddr_t)-1) { + blist_free(dest, blk, count); + } else if (v != 0) { + int i; + + for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { + if (v & (1 << i)) + blist_free(dest, blk + i, 1); + } + } + return; + } + + /* + * Meta node + */ + + if (scan->u.bmu_avail == 0) { + /* + * Source all allocated, leave dest allocated + */ + return; + } + if (scan->u.bmu_avail == radix) { + /* + * Source all free, free entire dest + */ + if (count < radix) + blist_free(dest, blk, count); + else + blist_free(dest, blk, radix); + return; + } + + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; count && i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + + if (count >= radix) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + radix + ); + count -= radix; + } else { + if (count) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + count + ); + } + count = 0; + } + blk += radix; + } +} + +/* + * BLST_RADIX_INIT() - initialize radix tree + * + * Initialize our meta structures and bitmaps and calculate the exact + * amount of space required to manage 'count' blocks - this space may + * be considerably less then the calculated radix due to the large + * RADIX values we use. 
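+ *
+ *	For example (illustrative, using the 2048-block layout sketched in
+ *	the file header comment): a full radix-8192 tree would take
+ *	1 + 16 * 17 = 273 blmeta_t records, but only 70 are needed here:
+ *	the root, four fully populated subtrees of 17 records each, and
+ *	one terminator record.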
+ */ + +static daddr_t +blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) +{ + int i; + int next_skip; + daddr_t memindex = 0; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_bitmap = 0; + } + return(memindex); + } + + /* + * Meta node. If allocating the entire object we can special + * case it. However, we need to figure out how much memory + * is required to manage 'count' blocks, so we continue on anyway. + */ + + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_avail = 0; + } + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; i <= skip; i += next_skip) { + if (count >= radix) { + /* + * Allocate the entire object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + radix + ); + count -= radix; + } else if (count > 0) { + /* + * Allocate a partial object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + count + ); + count = 0; + } else { + /* + * Add terminator and break out + */ + if (scan) + scan[i].bm_bighint = (daddr_t)-1; + break; + } + } + if (memindex < i) + memindex = i; + return(memindex); +} + +#ifdef BLIST_DEBUG + +static void +blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) +{ + int i; + int next_skip; + int lastState = 0; + + if (radix == BLIST_BMAP_RADIX) { + printf( + "%*.*s(%04x,%d): bitmap %08x big=%d\n", + tab, tab, "", + blk, radix, + scan->u.bmu_bitmap, + scan->bm_bighint + ); + return; + } + + if (scan->u.bmu_avail == 0) { + printf( + "%*.*s(%04x,%d) ALL ALLOCATED\n", + tab, tab, "", + blk, + radix + ); + return; + } + if (scan->u.bmu_avail == radix) { + printf( + "%*.*s(%04x,%d) ALL FREE\n", + tab, tab, "", + blk, + radix + ); + return; + } + + printf( + "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n", + tab, tab, "", + blk, radix, + scan->u.bmu_avail, + radix, + scan->bm_bighint + ); + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + tab += 4; + + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) { + printf( + "%*.*s(%04x,%d): Terminator\n", + tab, tab, "", + blk, radix + ); + lastState = 0; + break; + } + blst_radix_print( + &scan[i], + blk, + radix, + next_skip - 1, + tab + ); + blk += radix; + } + tab -= 4; + + printf( + "%*.*s}\n", + tab, tab, "" + ); +} + +#endif + +#ifdef BLIST_DEBUG + +int +main(int ac, char **av) +{ + int size = 1024; + int i; + blist_t bl; + + for (i = 1; i < ac; ++i) { + const char *ptr = av[i]; + if (*ptr != '-') { + size = strtol(ptr, NULL, 0); + continue; + } + ptr += 2; + fprintf(stderr, "Bad option: %s\n", ptr - 2); + exit(1); + } + bl = blist_create(size); + blist_free(bl, 0, size); + + for (;;) { + char buf[1024]; + daddr_t da = 0; + daddr_t count = 0; + + + printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix); + fflush(stdout); + if (fgets(buf, sizeof(buf), stdin) == NULL) + break; + switch(buf[0]) { + case 'r': + if (sscanf(buf + 1, "%d", &count) == 1) { + blist_resize(&bl, count, 1); + } else { + printf("?\n"); + } + case 'p': + blist_print(bl); + break; + case 'a': + if (sscanf(buf + 1, "%d", &count) == 1) { + daddr_t blk = blist_alloc(bl, count); + printf(" R=%04x\n", blk); + } else { + printf("?\n"); + } + break; + case 'f': + if (sscanf(buf + 1, "%x %d", &da, &count) == 2) { + blist_free(bl, da, count); + } else { + printf("?\n"); + } + break; + case '?': + case 'h': + puts( + "p -print\n" + "a %d 
-allocate\n" + "f %x %d -free\n" + "r %d -resize\n" + "h/? -help" + ); + break; + default: + printf("?\n"); + break; + } + } + return(0); +} + +void +panic(const char *ctl, ...) +{ + va_list va; + + va_start(va, ctl); + vfprintf(stderr, ctl, va); + fprintf(stderr, "\n"); + va_end(va); + exit(1); +} + +#endif + diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c new file mode 100644 index 0000000..7281051 --- /dev/null +++ b/sys/kern/subr_bus.c @@ -0,0 +1,2179 @@ +/*- + * Copyright (c) 1997,1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_bus.h" + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/kobj.h> +#include <sys/bus_private.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/stdarg.h> /* for device_printf() */ + +static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); + +#ifdef BUS_DEBUG + +static int bus_debug = 1; +SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0, + "Debug bus code"); + +#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a, printf("\n");} +#define DEVICENAME(d) ((d)? device_get_name(d): "no device") +#define DRIVERNAME(d) ((d)? d->name : "no driver") +#define DEVCLANAME(d) ((d)? d->name : "no devclass") + +/* Produce the indenting, indent*2 spaces plus a '.' 
ahead of that to + * prevent syslog from deleting initial spaces + */ +#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0) + +static void print_device_short(device_t dev, int indent); +static void print_device(device_t dev, int indent); +void print_device_tree_short(device_t dev, int indent); +void print_device_tree(device_t dev, int indent); +static void print_driver_short(driver_t *driver, int indent); +static void print_driver(driver_t *driver, int indent); +static void print_driver_list(driver_list_t drivers, int indent); +static void print_devclass_short(devclass_t dc, int indent); +static void print_devclass(devclass_t dc, int indent); +void print_devclass_list_short(void); +void print_devclass_list(void); + +#else +/* Make the compiler ignore the function calls */ +#define PDEBUG(a) /* nop */ +#define DEVICENAME(d) /* nop */ +#define DRIVERNAME(d) /* nop */ +#define DEVCLANAME(d) /* nop */ + +#define print_device_short(d,i) /* nop */ +#define print_device(d,i) /* nop */ +#define print_device_tree_short(d,i) /* nop */ +#define print_device_tree(d,i) /* nop */ +#define print_driver_short(d,i) /* nop */ +#define print_driver(d,i) /* nop */ +#define print_driver_list(d,i) /* nop */ +#define print_devclass_short(d,i) /* nop */ +#define print_devclass(d,i) /* nop */ +#define print_devclass_list_short() /* nop */ +#define print_devclass_list() /* nop */ +#endif + +TAILQ_HEAD(,device) bus_data_devices; +static int bus_data_generation = 1; + +kobj_method_t null_methods[] = { + { 0, 0 } +}; + +DEFINE_CLASS(null, null_methods, 0); + +/* + * Devclass implementation + */ + +static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); + +static devclass_t +devclass_find_internal(const char *classname, int create) +{ + devclass_t dc; + + PDEBUG(("looking for %s", classname)); + if (!classname) + return (NULL); + + TAILQ_FOREACH(dc, &devclasses, link) { + if (!strcmp(dc->name, classname)) + return (dc); + } + + PDEBUG(("%s not found%s", classname, (create? ", creating": ""))); + if (create) { + dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, + M_BUS, M_NOWAIT|M_ZERO); + if (!dc) + return (NULL); + dc->name = (char*) (dc + 1); + strcpy(dc->name, classname); + TAILQ_INIT(&dc->drivers); + TAILQ_INSERT_TAIL(&devclasses, dc, link); + + bus_data_generation_update(); + } + + return (dc); +} + +devclass_t +devclass_create(const char *classname) +{ + return (devclass_find_internal(classname, TRUE)); +} + +devclass_t +devclass_find(const char *classname) +{ + return (devclass_find_internal(classname, FALSE)); +} + +int +devclass_add_driver(devclass_t dc, driver_t *driver) +{ + driverlink_t dl; + int i; + + PDEBUG(("%s", DRIVERNAME(driver))); + + dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); + if (!dl) + return (ENOMEM); + + /* + * Compile the driver's methods. Also increase the reference count + * so that the class doesn't get freed when the last instance + * goes. This means we can safely use static methods and avoids a + * double-free in devclass_delete_driver. + */ + kobj_class_compile((kobj_class_t) driver); + + /* + * Make sure the devclass which the driver is implementing exists. + */ + devclass_find_internal(driver->name, TRUE); + + dl->driver = driver; + TAILQ_INSERT_TAIL(&dc->drivers, dl, link); + driver->refs++; + + /* + * Call BUS_DRIVER_ADDED for any existing busses in this class. 
+ */ + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + BUS_DRIVER_ADDED(dc->devices[i], driver); + + bus_data_generation_update(); + return (0); +} + +int +devclass_delete_driver(devclass_t busclass, driver_t *driver) +{ + devclass_t dc = devclass_find(driver->name); + driverlink_t dl; + device_t dev; + int i; + int error; + + PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); + + if (!dc) + return (0); + + /* + * Find the link structure in the bus' list of drivers. + */ + TAILQ_FOREACH(dl, &busclass->drivers, link) { + if (dl->driver == driver) + break; + } + + if (!dl) { + PDEBUG(("%s not found in %s list", driver->name, + busclass->name)); + return (ENOENT); + } + + /* + * Disassociate from any devices. We iterate through all the + * devices in the devclass of the driver and detach any which are + * using the driver and which have a parent in the devclass which + * we are deleting from. + * + * Note that since a driver can be in multiple devclasses, we + * should not detach devices which are not children of devices in + * the affected devclass. + */ + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + dev = dc->devices[i]; + if (dev->driver == driver && dev->parent && + dev->parent->devclass == busclass) { + if ((error = device_detach(dev)) != 0) + return (error); + device_set_driver(dev, NULL); + } + } + } + + TAILQ_REMOVE(&busclass->drivers, dl, link); + free(dl, M_BUS); + + driver->refs--; + if (driver->refs == 0) + kobj_class_free((kobj_class_t) driver); + + bus_data_generation_update(); + return (0); +} + +static driverlink_t +devclass_find_driver_internal(devclass_t dc, const char *classname) +{ + driverlink_t dl; + + PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); + + TAILQ_FOREACH(dl, &dc->drivers, link) { + if (!strcmp(dl->driver->name, classname)) + return (dl); + } + + PDEBUG(("not found")); + return (NULL); +} + +driver_t * +devclass_find_driver(devclass_t dc, const char *classname) +{ + driverlink_t dl; + + dl = devclass_find_driver_internal(dc, classname); + if (dl) + return (dl->driver); + return (NULL); +} + +const char * +devclass_get_name(devclass_t dc) +{ + return (dc->name); +} + +device_t +devclass_get_device(devclass_t dc, int unit) +{ + if (dc == NULL || unit < 0 || unit >= dc->maxunit) + return (NULL); + return (dc->devices[unit]); +} + +void * +devclass_get_softc(devclass_t dc, int unit) +{ + device_t dev; + + dev = devclass_get_device(dc, unit); + if (!dev) + return (NULL); + + return (device_get_softc(dev)); +} + +int +devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) +{ + int i; + int count; + device_t *list; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); + if (!list) + return (ENOMEM); + + count = 0; + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + list[count] = dc->devices[i]; + count++; + } + } + + *devlistp = list; + *devcountp = count; + + return (0); +} + +int +devclass_get_maxunit(devclass_t dc) +{ + return (dc->maxunit); +} + +int +devclass_find_free_unit(devclass_t dc, int unit) +{ + if (dc == NULL) + return (unit); + while (unit < dc->maxunit && dc->devices[unit] != NULL) + unit++; + return (unit); +} + +static int +devclass_alloc_unit(devclass_t dc, int *unitp) +{ + int unit = *unitp; + + PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); + + /* If we were given a wired unit number, check for existing device */ + /* XXX imp XXX */ + if (unit != -1) { 
+ if (unit >= 0 && unit < dc->maxunit && + dc->devices[unit] != NULL) { + if (bootverbose) + printf("%s: %s%d already exists; skipping it\n", + dc->name, dc->name, *unitp); + return (EEXIST); + } + } else { + /* Unwired device, find the next available slot for it */ + unit = 0; + while (unit < dc->maxunit && dc->devices[unit] != NULL) + unit++; + } + + /* + * We've selected a unit beyond the length of the table, so let's + * extend the table to make room for all units up to and including + * this one. + */ + if (unit >= dc->maxunit) { + device_t *newlist; + int newsize; + + newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); + newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); + if (!newlist) + return (ENOMEM); + bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit); + bzero(newlist + dc->maxunit, + sizeof(device_t) * (newsize - dc->maxunit)); + if (dc->devices) + free(dc->devices, M_BUS); + dc->devices = newlist; + dc->maxunit = newsize; + } + PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); + + *unitp = unit; + return (0); +} + +static int +devclass_add_device(devclass_t dc, device_t dev) +{ + int buflen, error; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + buflen = snprintf(NULL, 0, "%s%d$", dc->name, dev->unit); + if (buflen < 0) + return (ENOMEM); + dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); + if (!dev->nameunit) + return (ENOMEM); + + if ((error = devclass_alloc_unit(dc, &dev->unit)) != 0) { + free(dev->nameunit, M_BUS); + dev->nameunit = NULL; + return (error); + } + dc->devices[dev->unit] = dev; + dev->devclass = dc; + snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); + + return (0); +} + +static int +devclass_delete_device(devclass_t dc, device_t dev) +{ + if (!dc || !dev) + return (0); + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (dev->devclass != dc || dc->devices[dev->unit] != dev) + panic("devclass_delete_device: inconsistent device class"); + dc->devices[dev->unit] = NULL; + if (dev->flags & DF_WILDCARD) + dev->unit = -1; + dev->devclass = NULL; + free(dev->nameunit, M_BUS); + dev->nameunit = NULL; + + return (0); +} + +static device_t +make_device(device_t parent, const char *name, int unit) +{ + device_t dev; + devclass_t dc; + + PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); + + if (name) { + dc = devclass_find_internal(name, TRUE); + if (!dc) { + printf("make_device: can't find device class %s\n", + name); + return (NULL); + } + } else { + dc = NULL; + } + + dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO); + if (!dev) + return (NULL); + + dev->parent = parent; + TAILQ_INIT(&dev->children); + kobj_init((kobj_t) dev, &null_class); + dev->driver = NULL; + dev->devclass = NULL; + dev->unit = unit; + dev->nameunit = NULL; + dev->desc = NULL; + dev->busy = 0; + dev->devflags = 0; + dev->flags = DF_ENABLED; + dev->order = 0; + if (unit == -1) + dev->flags |= DF_WILDCARD; + if (name) { + dev->flags |= DF_FIXEDCLASS; + if (devclass_add_device(dc, dev)) { + kobj_delete((kobj_t) dev, M_BUS); + return (NULL); + } + } + dev->ivars = NULL; + dev->softc = NULL; + + dev->state = DS_NOTPRESENT; + + TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); + bus_data_generation_update(); + + return (dev); +} + +static int +device_print_child(device_t dev, device_t child) +{ + int retval = 0; + + if (device_is_alive(child)) + retval += BUS_PRINT_CHILD(dev, child); + else + retval += device_printf(child, " not found\n"); + + return (retval); +} 
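+
+/*
+ * Illustrative sketch (the foo driver and foo_identify() are hypothetical):
+ * bus drivers normally create their children from an identify (or attach)
+ * method using device_add_child() below; the child is then probed and
+ * attached later via device_probe_and_attach().
+ *
+ *	static void
+ *	foo_identify(driver_t *driver, device_t parent)
+ *	{
+ *		if (device_find_child(parent, "foo", 0) == NULL)
+ *			device_add_child(parent, "foo", -1);
+ *	}
+ *
+ * Passing unit -1 asks for the next free unit number (the DF_WILDCARD
+ * case handled by make_device() above and devclass_alloc_unit()).
+ */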
+ +device_t +device_add_child(device_t dev, const char *name, int unit) +{ + return (device_add_child_ordered(dev, 0, name, unit)); +} + +device_t +device_add_child_ordered(device_t dev, int order, const char *name, int unit) +{ + device_t child; + device_t place; + + PDEBUG(("%s at %s with order %d as unit %d", + name, DEVICENAME(dev), order, unit)); + + child = make_device(dev, name, unit); + if (child == NULL) + return (child); + child->order = order; + + TAILQ_FOREACH(place, &dev->children, link) { + if (place->order > order) + break; + } + + if (place) { + /* + * The device 'place' is the first device whose order is + * greater than the new child. + */ + TAILQ_INSERT_BEFORE(place, child, link); + } else { + /* + * The new child's order is greater or equal to the order of + * any existing device. Add the child to the tail of the list. + */ + TAILQ_INSERT_TAIL(&dev->children, child, link); + } + + bus_data_generation_update(); + return (child); +} + +int +device_delete_child(device_t dev, device_t child) +{ + int error; + device_t grandchild; + + PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); + + /* remove children first */ + while ( (grandchild = TAILQ_FIRST(&child->children)) ) { + error = device_delete_child(child, grandchild); + if (error) + return (error); + } + + if ((error = device_detach(child)) != 0) + return (error); + if (child->devclass) + devclass_delete_device(child->devclass, child); + TAILQ_REMOVE(&dev->children, child, link); + TAILQ_REMOVE(&bus_data_devices, child, devlink); + device_set_desc(child, NULL); + free(child, M_BUS); + + bus_data_generation_update(); + return (0); +} + +/* + * Find only devices attached to this bus. + */ +device_t +device_find_child(device_t dev, const char *classname, int unit) +{ + devclass_t dc; + device_t child; + + dc = devclass_find(classname); + if (!dc) + return (NULL); + + child = devclass_get_device(dc, unit); + if (child && child->parent == dev) + return (child); + return (NULL); +} + +static driverlink_t +first_matching_driver(devclass_t dc, device_t dev) +{ + if (dev->devclass) + return (devclass_find_driver_internal(dc, dev->devclass->name)); + return (TAILQ_FIRST(&dc->drivers)); +} + +static driverlink_t +next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) +{ + if (dev->devclass) { + driverlink_t dl; + for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) + if (!strcmp(dev->devclass->name, dl->driver->name)) + return (dl); + return (NULL); + } + return (TAILQ_NEXT(last, link)); +} + +static int +device_probe_child(device_t dev, device_t child) +{ + devclass_t dc; + driverlink_t best = 0; + driverlink_t dl; + int result, pri = 0; + int hasclass = (child->devclass != 0); + + dc = dev->devclass; + if (!dc) + panic("device_probe_child: parent device has no devclass"); + + if (child->state == DS_ALIVE) + return (0); + + for (dl = first_matching_driver(dc, child); + dl; + dl = next_matching_driver(dc, child, dl)) { + PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); + device_set_driver(child, dl->driver); + if (!hasclass) + device_set_devclass(child, dl->driver->name); + result = DEVICE_PROBE(child); + if (!hasclass) + device_set_devclass(child, 0); + + /* + * If the driver returns SUCCESS, there can be no higher match + * for this device. + */ + if (result == 0) { + best = dl; + pri = 0; + break; + } + + /* + * The driver returned an error so it certainly doesn't match. 
+ */ + if (result > 0) { + device_set_driver(child, 0); + continue; + } + + /* + * A priority lower than SUCCESS, remember the best matching + * driver. Initialise the value of pri for the first match. + */ + if (best == 0 || result > pri) { + best = dl; + pri = result; + continue; + } + } + + /* + * If we found a driver, change state and initialise the devclass. + */ + if (best) { + if (!child->devclass) + device_set_devclass(child, best->driver->name); + device_set_driver(child, best->driver); + if (pri < 0) { + /* + * A bit bogus. Call the probe method again to make + * sure that we have the right description. + */ + DEVICE_PROBE(child); + } + child->state = DS_ALIVE; + + bus_data_generation_update(); + return (0); + } + + return (ENXIO); +} + +device_t +device_get_parent(device_t dev) +{ + return (dev->parent); +} + +int +device_get_children(device_t dev, device_t **devlistp, int *devcountp) +{ + int count; + device_t child; + device_t *list; + + count = 0; + TAILQ_FOREACH(child, &dev->children, link) { + count++; + } + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); + if (!list) + return (ENOMEM); + + count = 0; + TAILQ_FOREACH(child, &dev->children, link) { + list[count] = child; + count++; + } + + *devlistp = list; + *devcountp = count; + + return (0); +} + +driver_t * +device_get_driver(device_t dev) +{ + return (dev->driver); +} + +devclass_t +device_get_devclass(device_t dev) +{ + return (dev->devclass); +} + +const char * +device_get_name(device_t dev) +{ + if (dev->devclass) + return (devclass_get_name(dev->devclass)); + return (NULL); +} + +const char * +device_get_nameunit(device_t dev) +{ + return (dev->nameunit); +} + +int +device_get_unit(device_t dev) +{ + return (dev->unit); +} + +const char * +device_get_desc(device_t dev) +{ + return (dev->desc); +} + +u_int32_t +device_get_flags(device_t dev) +{ + return (dev->devflags); +} + +int +device_print_prettyname(device_t dev) +{ + const char *name = device_get_name(dev); + + if (name == 0) + return (printf("unknown: ")); + return (printf("%s%d: ", name, device_get_unit(dev))); +} + +int +device_printf(device_t dev, const char * fmt, ...) 
+{ + va_list ap; + int retval; + + retval = device_print_prettyname(dev); + va_start(ap, fmt); + retval += vprintf(fmt, ap); + va_end(ap); + return (retval); +} + +static void +device_set_desc_internal(device_t dev, const char* desc, int copy) +{ + if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { + free(dev->desc, M_BUS); + dev->flags &= ~DF_DESCMALLOCED; + dev->desc = NULL; + } + + if (copy && desc) { + dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); + if (dev->desc) { + strcpy(dev->desc, desc); + dev->flags |= DF_DESCMALLOCED; + } + } else { + /* Avoid a -Wcast-qual warning */ + dev->desc = (char *)(uintptr_t) desc; + } + + bus_data_generation_update(); +} + +void +device_set_desc(device_t dev, const char* desc) +{ + device_set_desc_internal(dev, desc, FALSE); +} + +void +device_set_desc_copy(device_t dev, const char* desc) +{ + device_set_desc_internal(dev, desc, TRUE); +} + +void +device_set_flags(device_t dev, u_int32_t flags) +{ + dev->devflags = flags; +} + +void * +device_get_softc(device_t dev) +{ + return (dev->softc); +} + +void +device_set_softc(device_t dev, void *softc) +{ + if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) + free(dev->softc, M_BUS); + dev->softc = softc; + if (dev->softc) + dev->flags |= DF_EXTERNALSOFTC; + else + dev->flags &= ~DF_EXTERNALSOFTC; +} + +void * +device_get_ivars(device_t dev) +{ + return (dev->ivars); +} + +void +device_set_ivars(device_t dev, void * ivars) +{ + if (!dev) + return; + + dev->ivars = ivars; + + return; +} + +device_state_t +device_get_state(device_t dev) +{ + return (dev->state); +} + +void +device_enable(device_t dev) +{ + dev->flags |= DF_ENABLED; +} + +void +device_disable(device_t dev) +{ + dev->flags &= ~DF_ENABLED; +} + +void +device_busy(device_t dev) +{ + if (dev->state < DS_ATTACHED) + panic("device_busy: called for unattached device"); + if (dev->busy == 0 && dev->parent) + device_busy(dev->parent); + dev->busy++; + dev->state = DS_BUSY; +} + +void +device_unbusy(device_t dev) +{ + if (dev->state != DS_BUSY) + panic("device_unbusy: called for non-busy device"); + dev->busy--; + if (dev->busy == 0) { + if (dev->parent) + device_unbusy(dev->parent); + dev->state = DS_ATTACHED; + } +} + +void +device_quiet(device_t dev) +{ + dev->flags |= DF_QUIET; +} + +void +device_verbose(device_t dev) +{ + dev->flags &= ~DF_QUIET; +} + +int +device_is_quiet(device_t dev) +{ + return ((dev->flags & DF_QUIET) != 0); +} + +int +device_is_enabled(device_t dev) +{ + return ((dev->flags & DF_ENABLED) != 0); +} + +int +device_is_alive(device_t dev) +{ + return (dev->state >= DS_ALIVE); +} + +int +device_set_devclass(device_t dev, const char *classname) +{ + devclass_t dc; + int error; + + if (!classname) { + if (dev->devclass) + devclass_delete_device(dev->devclass, dev); + return (0); + } + + if (dev->devclass) { + printf("device_set_devclass: device class already set\n"); + return (EINVAL); + } + + dc = devclass_find_internal(classname, TRUE); + if (!dc) + return (ENOMEM); + + error = devclass_add_device(dc, dev); + + bus_data_generation_update(); + return (error); +} + +int +device_set_driver(device_t dev, driver_t *driver) +{ + if (dev->state >= DS_ATTACHED) + return (EBUSY); + + if (dev->driver == driver) + return (0); + + if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { + free(dev->softc, M_BUS); + dev->softc = NULL; + } + kobj_delete((kobj_t) dev, 0); + dev->driver = driver; + if (driver) { + kobj_init((kobj_t) dev, (kobj_class_t) driver); + if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { + dev->softc = 
malloc(driver->size, M_BUS, + M_NOWAIT | M_ZERO); + if (!dev->softc) { + kobj_init((kobj_t) dev, &null_class); + dev->driver = NULL; + return (ENOMEM); + } + } + } else { + kobj_init((kobj_t) dev, &null_class); + } + + bus_data_generation_update(); + return (0); +} + +int +device_probe_and_attach(device_t dev) +{ + device_t bus = dev->parent; + int error = 0; + int hasclass = (dev->devclass != 0); + + if (dev->state >= DS_ALIVE) + return (0); + + if (dev->flags & DF_ENABLED) { + error = device_probe_child(bus, dev); + if (!error) { + if (!device_is_quiet(dev)) + device_print_child(bus, dev); + error = DEVICE_ATTACH(dev); + if (!error) + dev->state = DS_ATTACHED; + else { + printf("device_probe_and_attach: %s%d attach returned %d\n", + dev->driver->name, dev->unit, error); + /* Unset the class; set in device_probe_child */ + if (!hasclass) + device_set_devclass(dev, 0); + device_set_driver(dev, NULL); + dev->state = DS_NOTPRESENT; + } + } else { + if (!(dev->flags & DF_DONENOMATCH)) { + BUS_PROBE_NOMATCH(bus, dev); + dev->flags |= DF_DONENOMATCH; + } + } + } else { + if (bootverbose) { + device_print_prettyname(dev); + printf("not probed (disabled)\n"); + } + } + + return (error); +} + +int +device_detach(device_t dev) +{ + int error; + + PDEBUG(("%s", DEVICENAME(dev))); + if (dev->state == DS_BUSY) + return (EBUSY); + if (dev->state != DS_ATTACHED) + return (0); + + if ((error = DEVICE_DETACH(dev)) != 0) + return (error); + device_printf(dev, "detached\n"); + if (dev->parent) + BUS_CHILD_DETACHED(dev->parent, dev); + + if (!(dev->flags & DF_FIXEDCLASS)) + devclass_delete_device(dev->devclass, dev); + + dev->state = DS_NOTPRESENT; + device_set_driver(dev, NULL); + + return (0); +} + +int +device_shutdown(device_t dev) +{ + if (dev->state < DS_ATTACHED) + return (0); + return (DEVICE_SHUTDOWN(dev)); +} + +int +device_set_unit(device_t dev, int unit) +{ + devclass_t dc; + int err; + + dc = device_get_devclass(dev); + if (unit < dc->maxunit && dc->devices[unit]) + return (EBUSY); + err = devclass_delete_device(dc, dev); + if (err) + return (err); + dev->unit = unit; + err = devclass_add_device(dc, dev); + if (err) + return (err); + + bus_data_generation_update(); + return (0); +} + +/*======================================*/ +/* + * Some useful method implementations to make life easier for bus drivers. 
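+ *
+ * As a sketch (the foo bus and its per-child ivars layout are hypothetical),
+ * a bus that keeps a struct resource_list for each child can implement its
+ * resource methods almost entirely with the helpers below:
+ *
+ *	static struct resource *
+ *	foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
+ *	    u_long start, u_long end, u_long count, u_int flags)
+ *	{
+ *		struct foo_child *fc = device_get_ivars(child);
+ *
+ *		return (resource_list_alloc(&fc->fc_resources, bus, child,
+ *		    type, rid, start, end, count, flags));
+ *	}
+ *
+ * The bus_generic_rl_* wrappers further down do the same thing, fetching
+ * the list with BUS_GET_RESOURCE_LIST() instead of from the ivars.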
+ */ + +void +resource_list_init(struct resource_list *rl) +{ + SLIST_INIT(rl); +} + +void +resource_list_free(struct resource_list *rl) +{ + struct resource_list_entry *rle; + + while ((rle = SLIST_FIRST(rl)) != NULL) { + if (rle->res) + panic("resource_list_free: resource entry is busy"); + SLIST_REMOVE_HEAD(rl, link); + free(rle, M_BUS); + } +} + +int +resource_list_add_next(struct resource_list *rl, int type, + u_long start, u_long end, u_long count) +{ + int rid; + + rid = 0; + while (resource_list_find(rl, type, rid)) rid++; + resource_list_add(rl, type, rid, start, end, count); + + return (rid); +} + +void +resource_list_add(struct resource_list *rl, int type, int rid, + u_long start, u_long end, u_long count) +{ + struct resource_list_entry *rle; + + rle = resource_list_find(rl, type, rid); + if (!rle) { + rle = malloc(sizeof(struct resource_list_entry), M_BUS, + M_NOWAIT); + if (!rle) + panic("resource_list_add: can't record entry"); + SLIST_INSERT_HEAD(rl, rle, link); + rle->type = type; + rle->rid = rid; + rle->res = NULL; + } + + if (rle->res) + panic("resource_list_add: resource entry is busy"); + + rle->start = start; + rle->end = end; + rle->count = count; +} + +struct resource_list_entry * +resource_list_find(struct resource_list *rl, int type, int rid) +{ + struct resource_list_entry *rle; + + SLIST_FOREACH(rle, rl, link) { + if (rle->type == type && rle->rid == rid) + return (rle); + } + return (NULL); +} + +void +resource_list_delete(struct resource_list *rl, int type, int rid) +{ + struct resource_list_entry *rle = resource_list_find(rl, type, rid); + + if (rle) { + if (rle->res != NULL) + panic("resource_list_delete: resource has not been released"); + SLIST_REMOVE(rl, rle, resource_list_entry, link); + free(rle, M_BUS); + } +} + +struct resource * +resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, + int type, int *rid, u_long start, u_long end, u_long count, u_int flags) +{ + struct resource_list_entry *rle = 0; + int passthrough = (device_get_parent(child) != bus); + int isdefault = (start == 0UL && end == ~0UL); + + if (passthrough) { + return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, + type, rid, start, end, count, flags)); + } + + rle = resource_list_find(rl, type, *rid); + + if (!rle) + return (NULL); /* no resource of that type/rid */ + + if (rle->res) + panic("resource_list_alloc: resource entry is busy"); + + if (isdefault) { + start = rle->start; + count = ulmax(count, rle->count); + end = ulmax(rle->end, start + count - 1); + } + + rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, + type, rid, start, end, count, flags); + + /* + * Record the new range. 
+ */ + if (rle->res) { + rle->start = rman_get_start(rle->res); + rle->end = rman_get_end(rle->res); + rle->count = count; + } + + return (rle->res); +} + +int +resource_list_release(struct resource_list *rl, device_t bus, device_t child, + int type, int rid, struct resource *res) +{ + struct resource_list_entry *rle = 0; + int passthrough = (device_get_parent(child) != bus); + int error; + + if (passthrough) { + return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, + type, rid, res)); + } + + rle = resource_list_find(rl, type, rid); + + if (!rle) + panic("resource_list_release: can't find resource"); + if (!rle->res) + panic("resource_list_release: resource entry is not busy"); + + error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, + type, rid, res); + if (error) + return (error); + + rle->res = NULL; + return (0); +} + +int +resource_list_print_type(struct resource_list *rl, const char *name, int type, + const char *format) +{ + struct resource_list_entry *rle; + int printed, retval; + + printed = 0; + retval = 0; + /* Yes, this is kinda cheating */ + SLIST_FOREACH(rle, rl, link) { + if (rle->type == type) { + if (printed == 0) + retval += printf(" %s ", name); + else + retval += printf(","); + printed++; + retval += printf(format, rle->start); + if (rle->count > 1) { + retval += printf("-"); + retval += printf(format, rle->start + + rle->count - 1); + } + } + } + return (retval); +} + +/* + * Call DEVICE_IDENTIFY for each driver. + */ +int +bus_generic_probe(device_t dev) +{ + devclass_t dc = dev->devclass; + driverlink_t dl; + + TAILQ_FOREACH(dl, &dc->drivers, link) { + DEVICE_IDENTIFY(dl->driver, dev); + } + + return (0); +} + +int +bus_generic_attach(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + device_probe_and_attach(child); + } + + return (0); +} + +int +bus_generic_detach(device_t dev) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return (EBUSY); + + TAILQ_FOREACH(child, &dev->children, link) { + if ((error = device_detach(child)) != 0) + return (error); + } + + return (0); +} + +int +bus_generic_shutdown(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + device_shutdown(child); + } + + return (0); +} + +int +bus_generic_suspend(device_t dev) +{ + int error; + device_t child, child2; + + TAILQ_FOREACH(child, &dev->children, link) { + error = DEVICE_SUSPEND(child); + if (error) { + for (child2 = TAILQ_FIRST(&dev->children); + child2 && child2 != child; + child2 = TAILQ_NEXT(child2, link)) + DEVICE_RESUME(child2); + return (error); + } + } + return (0); +} + +int +bus_generic_resume(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + DEVICE_RESUME(child); + /* if resume fails, there's nothing we can usefully do... 
*/ + } + return (0); +} + +int +bus_print_child_header (device_t dev, device_t child) +{ + int retval = 0; + + if (device_get_desc(child)) { + retval += device_printf(child, "<%s>", device_get_desc(child)); + } else { + retval += printf("%s", device_get_nameunit(child)); + } + + return (retval); +} + +int +bus_print_child_footer (device_t dev, device_t child) +{ + return (printf(" on %s\n", device_get_nameunit(dev))); +} + +int +bus_generic_print_child(device_t dev, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(dev, child); + retval += bus_print_child_footer(dev, child); + + return (retval); +} + +int +bus_generic_read_ivar(device_t dev, device_t child, int index, + uintptr_t * result) +{ + return (ENOENT); +} + +int +bus_generic_write_ivar(device_t dev, device_t child, int index, + uintptr_t value) +{ + return (ENOENT); +} + +struct resource_list * +bus_generic_get_resource_list (device_t dev, device_t child) +{ + return (NULL); +} + +void +bus_generic_driver_added(device_t dev, driver_t *driver) +{ + device_t child; + + DEVICE_IDENTIFY(driver, dev); + TAILQ_FOREACH(child, &dev->children, link) { + if (child->state == DS_NOTPRESENT) + device_probe_and_attach(child); + } +} + +int +bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, + int flags, driver_intr_t *intr, void *arg, void **cookiep) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_SETUP_INTR(dev->parent, child, irq, flags, + intr, arg, cookiep)); + return (EINVAL); +} + +int +bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, + void *cookie) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); + return (EINVAL); +} + +struct resource * +bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, + start, end, count, flags)); + return (NULL); +} + +int +bus_generic_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_deactivate_resource(device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. 
*/ + if (dev->parent) + return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_rl_get_resource (device_t dev, device_t child, int type, int rid, + u_long *startp, u_long *countp) +{ + struct resource_list * rl = NULL; + struct resource_list_entry * rle = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + rle = resource_list_find(rl, type, rid); + if (!rle) + return (ENOENT); + + if (startp) + *startp = rle->start; + if (countp) + *countp = rle->count; + + return (0); +} + +int +bus_generic_rl_set_resource (device_t dev, device_t child, int type, int rid, + u_long start, u_long count) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + resource_list_add(rl, type, rid, start, (start + count - 1), count); + + return (0); +} + +void +bus_generic_rl_delete_resource (device_t dev, device_t child, int type, int rid) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return; + + resource_list_delete(rl, type, rid); + + return; +} + +int +bus_generic_rl_release_resource (device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + return (resource_list_release(rl, dev, child, type, rid, r)); +} + +struct resource * +bus_generic_rl_alloc_resource (device_t dev, device_t child, int type, + int *rid, u_long start, u_long end, u_long count, u_int flags) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (NULL); + + return (resource_list_alloc(rl, dev, child, type, rid, + start, end, count, flags)); +} + +/* + * Some convenience functions to make it easier for drivers to use the + * resource-management functions. All these really do is hide the + * indirection through the parent's method table, making for slightly + * less-wordy code. In the future, it might make sense for this code + * to maintain some sort of a list of resources allocated by each device. 
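+ *
+ * A typical consumer looks something like the sketch below (foo_intr()
+ * and the softc fields are hypothetical; SYS_RES_IRQ, RF_ACTIVE and
+ * INTR_TYPE_MISC are the usual resource/interrupt constants):
+ *
+ *	int rid = 0;
+ *
+ *	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid,
+ *	    0UL, ~0UL, 1, RF_ACTIVE);
+ *	if (sc->irq_res == NULL)
+ *		return (ENXIO);
+ *	error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC,
+ *	    foo_intr, sc, &sc->irq_cookie);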
+ */ +struct resource * +bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, + u_long count, u_int flags) +{ + if (dev->parent == 0) + return (0); + return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, + count, flags)); +} + +int +bus_activate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_release_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_setup_intr(device_t dev, struct resource *r, int flags, + driver_intr_t handler, void *arg, void **cookiep) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_SETUP_INTR(dev->parent, dev, r, flags, + handler, arg, cookiep)); +} + +int +bus_teardown_intr(device_t dev, struct resource *r, void *cookie) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); +} + +int +bus_set_resource(device_t dev, int type, int rid, + u_long start, u_long count) +{ + return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, + start, count)); +} + +int +bus_get_resource(device_t dev, int type, int rid, + u_long *startp, u_long *countp) +{ + return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + startp, countp)); +} + +u_long +bus_get_resource_start(device_t dev, int type, int rid) +{ + u_long start, count; + int error; + + error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + &start, &count); + if (error) + return (0); + return (start); +} + +u_long +bus_get_resource_count(device_t dev, int type, int rid) +{ + u_long start, count; + int error; + + error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + &start, &count); + if (error) + return (0); + return (count); +} + +void +bus_delete_resource(device_t dev, int type, int rid) +{ + BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); +} + +static int +root_print_child(device_t dev, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(dev, child); + retval += printf("\n"); + + return (retval); +} + +static int +root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg, + void **cookiep) +{ + /* + * If an interrupt mapping gets to here something bad has happened. 
+ */ + panic("root_setup_intr"); +} + +static kobj_method_t root_methods[] = { + /* Device interface */ + KOBJMETHOD(device_shutdown, bus_generic_shutdown), + KOBJMETHOD(device_suspend, bus_generic_suspend), + KOBJMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + KOBJMETHOD(bus_print_child, root_print_child), + KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), + KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), + KOBJMETHOD(bus_setup_intr, root_setup_intr), + + { 0, 0 } +}; + +static driver_t root_driver = { + "root", + root_methods, + 1, /* no softc */ +}; + +device_t root_bus; +devclass_t root_devclass; + +static int +root_bus_module_handler(module_t mod, int what, void* arg) +{ + switch (what) { + case MOD_LOAD: + TAILQ_INIT(&bus_data_devices); + kobj_class_compile((kobj_class_t) &root_driver); + root_bus = make_device(NULL, "root", 0); + root_bus->desc = "System root bus"; + kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); + root_bus->driver = &root_driver; + root_bus->state = DS_ATTACHED; + root_devclass = devclass_find_internal("root", FALSE); + return (0); + + case MOD_SHUTDOWN: + device_shutdown(root_bus); + return (0); + } + + return (0); +} + +static moduledata_t root_bus_mod = { + "rootbus", + root_bus_module_handler, + 0 +}; +DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); + +void +root_bus_configure(void) +{ + device_t dev; + + PDEBUG((".")); + + TAILQ_FOREACH(dev, &root_bus->children, link) { + device_probe_and_attach(dev); + } +} + +int +driver_module_handler(module_t mod, int what, void *arg) +{ + int error, i; + struct driver_module_data *dmd; + devclass_t bus_devclass; + + dmd = (struct driver_module_data *)arg; + bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE); + error = 0; + + switch (what) { + case MOD_LOAD: + if (dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); + + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Loading module: driver %s on bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), dmd->dmd_busname)); + error = devclass_add_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + if (error) + break; + + /* + * The drivers loaded in this way are assumed to all + * implement the same devclass. + */ + *dmd->dmd_devclass = + devclass_find_internal(dmd->dmd_drivers[0]->name, TRUE); + break; + + case MOD_UNLOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Unloading module: driver %s from bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_delete_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + + if (!error && dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); + break; + } + + return (error); +} + +#ifdef BUS_DEBUG + +/* the _short versions avoid iteration by not calling anything that prints + * more than oneliners. I love oneliners. + */ + +static void +print_device_short(device_t dev, int indent) +{ + if (!dev) + return; + + indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s,%sivars,%ssoftc,busy=%d\n", + dev->unit, dev->desc, + (dev->parent? "":"no "), + (TAILQ_EMPTY(&dev->children)? "no ":""), + (dev->flags&DF_ENABLED? "enabled,":"disabled,"), + (dev->flags&DF_FIXEDCLASS? "fixed,":""), + (dev->flags&DF_WILDCARD? "wildcard,":""), + (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), + (dev->ivars? "":"no "), + (dev->softc? 
"":"no "), + dev->busy)); +} + +static void +print_device(device_t dev, int indent) +{ + if (!dev) + return; + + print_device_short(dev, indent); + + indentprintf(("Parent:\n")); + print_device_short(dev->parent, indent+1); + indentprintf(("Driver:\n")); + print_driver_short(dev->driver, indent+1); + indentprintf(("Devclass:\n")); + print_devclass_short(dev->devclass, indent+1); +} + +void +print_device_tree_short(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device_short(dev, indent); + + TAILQ_FOREACH(child, &dev->children, link) { + print_device_tree_short(child, indent+1); + } +} + +void +print_device_tree(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device(dev, indent); + + TAILQ_FOREACH(child, &dev->children, link) { + print_device_tree(child, indent+1); + } +} + +static void +print_driver_short(driver_t *driver, int indent) +{ + if (!driver) + return; + + indentprintf(("driver %s: softc size = %d\n", + driver->name, driver->size)); +} + +static void +print_driver(driver_t *driver, int indent) +{ + if (!driver) + return; + + print_driver_short(driver, indent); +} + + +static void +print_driver_list(driver_list_t drivers, int indent) +{ + driverlink_t driver; + + TAILQ_FOREACH(driver, &drivers, link) { + print_driver(driver->driver, indent); + } +} + +static void +print_devclass_short(devclass_t dc, int indent) +{ + if ( !dc ) + return; + + indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); +} + +static void +print_devclass(devclass_t dc, int indent) +{ + int i; + + if ( !dc ) + return; + + print_devclass_short(dc, indent); + indentprintf(("Drivers:\n")); + print_driver_list(dc->drivers, indent+1); + + indentprintf(("Devices:\n")); + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + print_device(dc->devices[i], indent+1); +} + +void +print_devclass_list_short(void) +{ + devclass_t dc; + + printf("Short listing of devclasses, drivers & devices:\n"); + TAILQ_FOREACH(dc, &devclasses, link) { + print_devclass_short(dc, 0); + } +} + +void +print_devclass_list(void) +{ + devclass_t dc; + + printf("Full listing of devclasses, drivers & devices:\n"); + TAILQ_FOREACH(dc, &devclasses, link) { + print_devclass(dc, 0); + } +} + +#endif + +/* + * User-space access to the device tree. + * + * We implement a small set of nodes: + * + * hw.bus Single integer read method to obtain the + * current generation count. + * hw.bus.devices Reads the entire device tree in flat space. + * hw.bus.rman Resource manager interface + * + * We might like to add the ability to scan devclasses and/or drivers to + * determine what else is currently loaded/available. 
+ */ +SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); + +static int +sysctl_bus(SYSCTL_HANDLER_ARGS) +{ + struct u_businfo ubus; + + ubus.ub_version = BUS_USER_VERSION; + ubus.ub_generation = bus_data_generation; + + return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); +} +SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, + "bus-related data"); + +static int +sysctl_devices(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1; + u_int namelen = arg2; + int index; + struct device *dev; + struct u_device udev; /* XXX this is a bit big */ + int error; + + if (namelen != 2) + return (EINVAL); + + if (bus_data_generation_check(name[0])) + return (EINVAL); + + index = name[1]; + + /* + * Scan the list of devices, looking for the requested index. + */ + TAILQ_FOREACH(dev, &bus_data_devices, devlink) { + if (index-- == 0) + break; + } + if (dev == NULL) + return (ENOENT); + + /* + * Populate the return array. + */ + udev.dv_handle = (uintptr_t)dev; + udev.dv_parent = (uintptr_t)dev->parent; + if (dev->nameunit == NULL) { + udev.dv_name[0] = 0; + } else { + snprintf(udev.dv_name, 32, "%s", dev->nameunit); + } + if (dev->desc == NULL) { + udev.dv_desc[0] = 0; + } else { + snprintf(udev.dv_desc, 32, "%s", dev->desc); + } + if ((dev->driver == NULL) || (dev->driver->name == NULL)) { + udev.dv_drivername[0] = 0; + } else { + snprintf(udev.dv_drivername, 32, "%s", dev->driver->name); + } + error = SYSCTL_OUT(req, &udev, sizeof(udev)); + return (error); +} + +SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, + "system device tree"); + +/* + * Sysctl interface for scanning the resource lists. + * + * We take two input parameters; the index into the list of resource + * managers, and the resource offset into the list. + */ +static int +sysctl_rman(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1; + u_int namelen = arg2; + int rman_idx, res_idx; + struct rman *rm; + struct resource *res; + struct u_rman urm; + struct u_resource ures; + int error; + + if (namelen != 3) + return (EINVAL); + + if (bus_data_generation_check(name[0])) + return (EINVAL); + rman_idx = name[1]; + res_idx = name[2]; + + /* + * Find the indexed resource manager + */ + TAILQ_FOREACH(rm, &rman_head, rm_link) { + if (rman_idx-- == 0) + break; + } + if (rm == NULL) + return (ENOENT); + + /* + * If the resource index is -1, we want details on the + * resource manager. + */ + if (res_idx == -1) { + urm.rm_handle = (uintptr_t)rm; + snprintf(urm.rm_descr, RM_TEXTLEN, "%s", rm->rm_descr); + urm.rm_descr[RM_TEXTLEN - 1] = '\0'; + urm.rm_start = rm->rm_start; + urm.rm_size = rm->rm_end - rm->rm_start + 1; + urm.rm_type = rm->rm_type; + + error = SYSCTL_OUT(req, &urm, sizeof(urm)); + return (error); + } + + /* + * Find the indexed resource and return it. 
+ */ + TAILQ_FOREACH(res, &rm->rm_list, r_link) { + if (res_idx-- == 0) { + ures.r_handle = (uintptr_t)res; + ures.r_parent = (uintptr_t)res->r_rm; + ures.r_device = (uintptr_t)res->r_dev; + if (res->r_dev != NULL) { + if (device_get_name(res->r_dev) != NULL) { + snprintf(ures.r_devname, RM_TEXTLEN, + "%s%d", + device_get_name(res->r_dev), + device_get_unit(res->r_dev)); + } else { + snprintf(ures.r_devname, RM_TEXTLEN, + "nomatch"); + } + } else { + ures.r_devname[0] = 0; + } + ures.r_start = res->r_start; + ures.r_size = res->r_end - res->r_start + 1; + ures.r_flags = res->r_flags; + + error = SYSCTL_OUT(req, &ures, sizeof(ures)); + return (error); + } + } + return (ENOENT); +} + +SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman, + "kernel resource manager"); + +int +bus_data_generation_check(int generation) +{ + if (generation != bus_data_generation) + return (1); + + /* XXX generate optimised lists here? */ + return (0); +} + +void +bus_data_generation_update(void) +{ + bus_data_generation++; +} diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..78bb231 --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init(void *); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc(void); +static void cblock_alloc_cblocks(int number); +static void cblock_free(struct cblock *cblockp); +static void cblock_free_cblocks(int number); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + int cbsize = CBSIZE; + + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount, + cfreecount - cslushcount * cbsize, cslushcount * cbsize); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. 
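The comments above describe the reservation scheme: a small slush pool is created at boot, and each consumer reserves its own cblocks while it is open. A hedged sketch of that per-open pattern as a hypothetical tty driver might code it; the queue fields and the TTYHOG/512 sizing are illustrative assumptions, not taken from this file:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/clist.h>

/* Reserve queue space at open time (process context, at spltty()). */
static void
mydev_setup_queues(struct tty *tp)
{
	int s;

	s = spltty();
	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_outq, TTYHOG + 200, 512);
	splx(s);
}

/* Give the reservation back on last close. */
static void
mydev_release_queues(struct tty *tp)
{
	int s;

	s = spltty();
	clist_free_cblocks(&tp->t_rawq);
	clist_free_cblocks(&tp->t_canq);
	clist_free_cblocks(&tp->t_outq);
	splx(s);
}

The body of clist_alloc_cblocks() continues below.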
+ */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
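getc() and q_to_b() above are the two ways of consuming a clist: one character at a time (with TTY_QUOTE possibly or'ed in), or in bulk into a linear buffer. A hedged consumer-side sketch; the device_* helpers are hypothetical placeholders:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/clist.h>

static void device_transmit(char *buf, int n);	/* hypothetical helper */
static void device_transmit_char(int c);	/* hypothetical helper */

static void
drain_queue(struct clist *q)
{
	char buf[64];
	int n, c;

	/* Bulk: q_to_b() returns how many characters it actually moved. */
	while ((n = q_to_b(q, buf, sizeof(buf))) > 0)
		device_transmit(buf, n);

	/* Or one at a time: getc() returns -1 once the clist is empty. */
	while ((c = getc(q)) != -1)
		device_transmit_char(c & 0xff);	/* mask off TTY_QUOTE */
}

ndflush(), next, discards characters the same way q_to_b() does, just without copying them anywhere.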
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. 
+ */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a separate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
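On the producer side, b_to_q() above returns the number of characters it could not queue (so a non-zero return means the clist hit its cbmax/reserved limit), and putc() returns -1 under the same condition. A hedged sketch; the error value chosen is illustrative:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/tty.h>
#include <sys/clist.h>

static int
queue_input(struct clist *q, char *buf, int len)
{
	int left;

	left = b_to_q(buf, len, q);
	if (left != 0)
		printf("input overflow, dropped %d characters\n", left);

	/* A quoted character is added one at a time with TTY_QUOTE set;
	 * putc() records the quote bit in the cblock's quote bitmap. */
	if (putc(0xff | TTY_QUOTE, q) == -1)
		return (ENOSPC);
	return (0);
}

unputc(), whose body continues below, undoes the most recently queued character and recovers its quote information the same way.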
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_clock.c b/sys/kern/subr_clock.c new file mode 100644 index 0000000..a79e331 --- /dev/null +++ b/sys/kern/subr_clock.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + * + * $FreeBSD$ + */ + +/* + * Helpers for time-of-day clocks. This is useful for architectures that need + * support multiple models of such clocks, and generally serves to make the + * code more machine-independent. + * If the clock in question can also be used as a time counter, the driver + * needs to initiate this. + * This code is not yet used by all architectures. + */ + +/* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/sysctl.h> +#include <sys/timetc.h> + +#include "clock_if.h" + +static __inline int leapyear(int year); +static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS); + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +static device_t clock_dev = NULL; +static long clock_res; + +int adjkerntz; /* local offset from GMT in seconds */ +int disable_rtc_set; /* disable resettodr() if != 0 */ +int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ + +/* + * These have traditionally been in machdep, but should probably be moved to + * kern. + */ +SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, + &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); + +SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, + CTLFLAG_RW, &disable_rtc_set, 0, ""); + +SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, + CTLFLAG_RW, &wall_cmos_clock, 0, ""); + +static int +sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, + req); + if (!error && req->newptr) + resettodr(); + return (error); +} + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static __inline int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + time_t secs; + int i, year, days; + + year = ct->year; + + /* Sanity checks. 
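The branch-based leapyear() above is claimed to be equivalent to the usual modulo expression while doing less work in the common case. A stand-alone sketch that checks the claim over the range this file cares about (the conversion code rejects years past 2037); purely illustrative:

#include <stdio.h>

static int
leapyear(int year)
{
	int rv = 0;

	if ((year & 3) == 0) {
		rv = 1;
		if ((year % 100) == 0) {
			rv = 0;
			if ((year % 400) == 0)
				rv = 1;
		}
	}
	return (rv);
}

int
main(void)
{
	int year, classic;

	for (year = 1970; year <= 2037; year++) {
		classic = ((year % 4) == 0 && (year % 100) != 0) ||
		    ((year % 400) == 0);
		if (classic != leapyear(year))
			printf("mismatch at %d\n", year);
	}
	printf("1900:%d 1996:%d 2000:%d\n",
	    leapyear(1900), leapyear(1996), leapyear(2000));	/* 0 1 1 */
	return (0);
}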
*/ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + ct->year > 2037) /* time_t overflow */ + return (EINVAL); + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + /* Another sanity check. */ + if (ct->dow != -1 && ct->dow != day_of_week(days)) + return (EINVAL); + + /* Add hours, minutes, seconds. */ + secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec; + + ts->tv_sec = secs; + ts->tv_nsec = ct->nsec; + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +} + +void +clock_register(device_t dev, long res) +{ + + if (clock_dev != NULL) { + if (clock_res > res) { + if (bootverbose) { + device_printf(dev, "not installed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(clock_dev)); + } + return; + } else { + if (bootverbose) { + device_printf(clock_dev, "removed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(dev)); + } + } + } + clock_dev = dev; + clock_res = res; + if (bootverbose) { + device_printf(dev, "registered as a time-of-day clock " + "(resolution %ldus)\n", res); + } +} + +/* + * inittodr and settodr derived from the i386 versions written + * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and + * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94 + */ + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + struct timespec diff, ref, ts; + int error; + + if (base) { + ref.tv_sec = base; + ref.tv_nsec = 0; + tc_setclock(&ref); + } + + if (clock_dev == NULL) { + printf("warning: no time-of-day clock registered, system time " + "will not be set accurately\n"); + return; + } + error = CLOCK_GETTIME(clock_dev, &ts); + if (error != 0 && error != EINVAL) { + printf("warning: clock_gettime failed (%d), the system time " + "will not be set accurately\n", error); + return; + } + if (error == EINVAL || ts.tv_sec < 0) { + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); + } + + ts.tv_sec += tz.tz_minuteswest * 60 + + (wall_cmos_clock ? 
adjkerntz : 0); + + if (timespeccmp(&ref, &ts, >)) { + diff = ref; + timespecsub(&ref, &ts); + } else { + diff = ts; + timespecsub(&diff, &ref); + } + if (ts.tv_sec >= 2) { + /* badly off, adjust it */ + tc_setclock(&ts); + } +} + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + struct timespec ts; + int error; + + if (disable_rtc_set || clock_dev == NULL) + return; + + getnanotime(&ts); + ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) { + printf("warning: clock_settime failed (%d), time-of-day clock " + "not adjusted to system time\n", error); + return; + } +} diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c new file mode 100644 index 0000000..dabdf9d --- /dev/null +++ b/sys/kern/subr_devstat.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/sysctl.h> + +#include <sys/devicestat.h> + +static int devstat_num_devs; +static long devstat_generation; +static int devstat_version = DEVSTAT_VERSION; +static int devstat_current_devnumber; + +static struct devstatlist device_statq; + +/* + * Take a malloced and zeroed devstat structure given to us, fill it in + * and add it to the queue of devices. + */ +void +devstat_add_entry(struct devstat *ds, const char *dev_name, + int unit_number, u_int32_t block_size, + devstat_support_flags flags, + devstat_type_flags device_type, + devstat_priority priority) +{ + struct devstatlist *devstat_head; + struct devstat *ds_tmp; + + if (ds == NULL) + return; + + if (devstat_num_devs == 0) + STAILQ_INIT(&device_statq); + + devstat_generation++; + devstat_num_devs++; + + devstat_head = &device_statq; + + /* + * Priority sort. Each driver passes in its priority when it adds + * its devstat entry. Drivers are sorted first by priority, and + * then by probe order. 
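(A brief aside before the devstat sorting description resumes.) The time-of-day helpers above expect a hardware RTC driver to register itself and to supply clock_gettime/clock_settime methods through clock_if; inittodr()/resettodr() then talk to whichever clock was registered. A hedged sketch of the driver side; the myrtc_* names, the method wiring, and the placeholder date are assumptions, not code from this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/clock.h>

#include "clock_if.h"

static int
myrtc_attach(device_t dev)
{
	/* ...program the hardware... */

	/* Resolution is reported in microseconds; clock_register() keeps
	 * whichever registered clock it judges to have the better
	 * resolution. */
	clock_register(dev, 1000000);
	return (0);
}

static int
myrtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* A real driver reads these from the RTC registers; fixed
	 * placeholder values are used here. */
	ct.year = 2002;
	ct.mon = 6;
	ct.day = 1;
	ct.hour = 12;
	ct.min = 0;
	ct.sec = 0;
	ct.dow = -1;		/* -1 skips the day-of-week cross-check */
	ct.nsec = 0;
	return (clock_ct_to_ts(&ct, ts));
}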
+ * + * For the first device, we just insert it, since the priority + * doesn't really matter yet. Subsequent devices are inserted into + * the list using the order outlined above. + */ + if (devstat_num_devs == 1) + STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); + else { + STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { + struct devstat *ds_next; + + ds_next = STAILQ_NEXT(ds_tmp, dev_links); + + /* + * If we find a break between higher and lower + * priority items, and if this item fits in the + * break, insert it. This also applies if the + * "lower priority item" is the end of the list. + */ + if ((priority <= ds_tmp->priority) + && ((ds_next == NULL) + || (priority > ds_next->priority))) { + STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, + dev_links); + break; + } else if (priority > ds_tmp->priority) { + /* + * If this is the case, we should be able + * to insert ourselves at the head of the + * list. If we can't, something is wrong. + */ + if (ds_tmp == STAILQ_FIRST(devstat_head)) { + STAILQ_INSERT_HEAD(devstat_head, + ds, dev_links); + break; + } else { + STAILQ_INSERT_TAIL(devstat_head, + ds, dev_links); + printf("devstat_add_entry: HELP! " + "sorting problem detected " + "for %s%d\n", dev_name, + unit_number); + break; + } + } + } + } + + ds->device_number = devstat_current_devnumber++; + ds->unit_number = unit_number; + strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); + ds->device_name[DEVSTAT_NAME_LEN - 1] = '\0'; + ds->block_size = block_size; + ds->flags = flags; + ds->device_type = device_type; + ds->priority = priority; + getmicrotime(&ds->dev_creation_time); +} + +/* + * Remove a devstat structure from the list of devices. + */ +void +devstat_remove_entry(struct devstat *ds) +{ + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + devstat_generation++; + devstat_num_devs--; + + devstat_head = &device_statq; + + /* Remove this entry from the devstat queue */ + STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); +} + +/* + * Record a transaction start. + */ +void +devstat_start_transaction(struct devstat *ds) +{ + /* sanity check */ + if (ds == NULL) + return; + + /* + * We only want to set the start time when we are going from idle + * to busy. The start time is really the start of the latest busy + * period. + */ + if (ds->busy_count == 0) + getmicrouptime(&ds->start_time); + ds->busy_count++; +} + +/* + * Record the ending of a transaction, and incrment the various counters. + */ +void +devstat_end_transaction(struct devstat *ds, u_int32_t bytes, + devstat_tag_type tag_type, devstat_trans_flags flags) +{ + struct timeval busy_time; + + /* sanity check */ + if (ds == NULL) + return; + + getmicrouptime(&ds->last_comp_time); + ds->busy_count--; + + /* + * There might be some transactions (DEVSTAT_NO_DATA) that don't + * transfer any data. + */ + if (flags == DEVSTAT_READ) { + ds->bytes_read += bytes; + ds->num_reads++; + } else if (flags == DEVSTAT_WRITE) { + ds->bytes_written += bytes; + ds->num_writes++; + } else if (flags == DEVSTAT_FREE) { + ds->bytes_freed += bytes; + ds->num_frees++; + } else + ds->num_other++; + + /* + * Keep a count of the various tag types sent. + */ + if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && + tag_type != DEVSTAT_TAG_NONE) + ds->tag_types[tag_type]++; + + /* + * We only update the busy time when we go idle. Otherwise, this + * calculation would require many more clock cycles. 
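Taken together, devstat_add_entry(), devstat_start_transaction() and devstat_end_transaction() above give a block driver the whole accounting life cycle. A hedged sketch of typical usage; the softc layout and the flag/type/priority constants are assumptions drawn from <sys/devicestat.h>, not from this file:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>

struct mydisk_softc {
	struct devstat	stats;
	/* ... */
};

static void
mydisk_attach_stats(struct mydisk_softc *sc, int unit)
{
	devstat_add_entry(&sc->stats, "mydisk", unit, DEV_BSIZE,
	    DEVSTAT_NO_ORDERED_TAGS,
	    DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
	    DEVSTAT_PRIORITY_DISK);
}

static void
mydisk_start_bio(struct mydisk_softc *sc, struct bio *bp)
{
	/* Marks the start of a busy period if the device was idle. */
	devstat_start_transaction(&sc->stats);
	/* ...hand bp to the hardware... */
}

static void
mydisk_bio_done(struct mydisk_softc *sc, struct bio *bp)
{
	/* Accounts bytes moved, the transaction type and the busy time. */
	devstat_end_transaction_bio(&sc->stats, bp);
	biodone(bp);
}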
+ */ + if (ds->busy_count == 0) { + /* Calculate how long we were busy */ + busy_time = ds->last_comp_time; + timevalsub(&busy_time, &ds->start_time); + + /* Add our busy time to the total busy time. */ + timevaladd(&ds->busy_time, &busy_time); + } else if (ds->busy_count < 0) + printf("devstat_end_transaction: HELP!! busy_count " + "for %s%d is < 0 (%d)!\n", ds->device_name, + ds->unit_number, ds->busy_count); +} + +void +devstat_end_transaction_bio(struct devstat *ds, struct bio *bp) +{ + devstat_trans_flags flg; + + if (bp->bio_cmd == BIO_DELETE) + flg = DEVSTAT_FREE; + else if (bp->bio_cmd == BIO_READ) + flg = DEVSTAT_READ; + else + flg = DEVSTAT_WRITE; + + devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, + DEVSTAT_TAG_SIMPLE, flg); +} + +/* + * This is the sysctl handler for the devstat package. The data pushed out + * on the kern.devstat.all sysctl variable consists of the current devstat + * generation number, and then an array of devstat structures, one for each + * device in the system. + * + * I'm really not too fond of this method of doing things, but there really + * aren't that many alternatives. We must have some method of making sure + * that the generation number the user gets corresponds with the data the + * user gets. If the user makes a separate sysctl call to get the + * generation, and then a sysctl call to get the device statistics, the + * device list could have changed in that brief period of time. By + * supplying the generation number along with the statistics output, we can + * guarantee that the generation number and the statistics match up. + */ +static int +sysctl_devstat(SYSCTL_HANDLER_ARGS) +{ + int error, i; + struct devstat *nds; + struct devstatlist *devstat_head; + + if (devstat_num_devs == 0) + return(EINVAL); + + error = 0; + devstat_head = &device_statq; + + /* + * First push out the generation number. + */ + error = SYSCTL_OUT(req, &devstat_generation, sizeof(long)); + + /* + * Now push out all the devices. + */ + for (i = 0, nds = STAILQ_FIRST(devstat_head); + (nds != NULL) && (i < devstat_num_devs) && (error == 0); + nds = STAILQ_NEXT(nds, dev_links), i++) + error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); + + return(error); +} + +/* + * Sysctl entries for devstat. The first one is a node that all the rest + * hang off of. + */ +SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics"); + +SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, + 0, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); +/* + * Export the number of devices in the system so that userland utilities + * can determine how much memory to allocate to hold all the devices. + */ +SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, + &devstat_num_devs, 0, "Number of devices in the devstat list"); +SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, + &devstat_generation, 0, "Devstat list generation"); +SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, + &devstat_version, 0, "Devstat list version number"); diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c new file mode 100644 index 0000000..1982e7f --- /dev/null +++ b/sys/kern/subr_disk.c @@ -0,0 +1,434 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. 
If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + */ + +#include "opt_geom.h" +#ifndef GEOM + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <machine/md_var.h> +#include <sys/ctype.h> + +static MALLOC_DEFINE(M_DISK, "disk", "disk data"); + +static d_strategy_t diskstrategy; +static d_open_t diskopen; +static d_close_t diskclose; +static d_ioctl_t diskioctl; +static d_psize_t diskpsize; + +static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist); + +void disk_dev_synth(dev_t dev); + +void +disk_dev_synth(dev_t dev) +{ + struct disk *dp; + int u, s, p; + dev_t pdev; + + if (dksparebits(dev)) + return; + LIST_FOREACH(dp, &disklist, d_list) { + if (major(dev) != dp->d_devsw->d_maj) + continue; + u = dkunit(dev); + p = RAW_PART; + s = WHOLE_DISK_SLICE; + pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p)); + if (pdev->si_devsw == NULL) + return; /* Probably a unit we don't have */ + s = dkslice(dev); + p = dkpart(dev); + if (s == WHOLE_DISK_SLICE && p == RAW_PART) { + /* XXX: actually should not happen */ + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d", + dp->d_devsw->d_name, u); + dev_depends(pdev, dev); + return; + } + if (s == COMPATIBILITY_SLICE) { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d%c", + dp->d_devsw->d_name, u, 'a' + p); + dev_depends(pdev, dev); + return; + } + if (p != RAW_PART) { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1, + 'a' + p); + } else { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1); + make_dev_alias(dev, "%s%ds%dc", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1); + } + dev_depends(pdev, dev); + return; + } +} + +static void +disk_clone(void *arg, char *name, int namelen, dev_t *dev) +{ + struct disk *dp; + char const *d; + char *e; + int j, u, s, p; + dev_t pdev; + + if (*dev != NODEV) + return; + + LIST_FOREACH(dp, &disklist, d_list) { + d = dp->d_devsw->d_name; + j = dev_stdclone(name, &e, d, &u); + if (j == 0) + continue; + if (u > DKMAXUNIT) + continue; + p = RAW_PART; + s = WHOLE_DISK_SLICE; + pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p)); + if (pdev->si_disk == NULL) + continue; + if (*e != '\0') { + j = dev_stdclone(e, &e, "s", &s); + if (j == 0) + s = COMPATIBILITY_SLICE; + else if (j == 1 || j == 2) + s += BASE_SLICE - 1; + if (!*e) + ; /* ad0s1 case */ + else if (e[1] != '\0') + return; /* can never be a disk name */ + else if (*e < 'a' || *e > 'h') + return; /* can never be a disk name */ + else + p = *e - 'a'; + } + if (s == WHOLE_DISK_SLICE && p == RAW_PART) { + return; + } else if (s >= BASE_SLICE && p != RAW_PART) { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c", + pdev->si_devsw->d_name, u, s - BASE_SLICE + 1, + p + 'a'); + } else if (s >= BASE_SLICE) { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d", + pdev->si_devsw->d_name, u, s - BASE_SLICE + 1); + make_dev_alias(*dev, "%s%ds%dc", + 
pdev->si_devsw->d_name, u, s - BASE_SLICE + 1); + } else { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d%c", + pdev->si_devsw->d_name, u, p + 'a'); + } + dev_depends(pdev, *dev); + return; + } +} + +static void +inherit_raw(dev_t pdev, dev_t dev) +{ + dev->si_disk = pdev->si_disk; + dev->si_drv1 = pdev->si_drv1; + dev->si_drv2 = pdev->si_drv2; + dev->si_iosize_max = pdev->si_iosize_max; + dev->si_bsize_phys = pdev->si_bsize_phys; + dev->si_bsize_best = pdev->si_bsize_best; +} + +dev_t +disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto) +{ + static int once; + dev_t dev; + + if (!once) { + EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000); + once++; + } + + bzero(dp, sizeof(*dp)); + + if (proto->d_open != diskopen) { + *proto = *cdevsw; + proto->d_open = diskopen; + proto->d_close = diskclose; + proto->d_ioctl = diskioctl; + proto->d_strategy = diskstrategy; + proto->d_psize = diskpsize; + } + + if (bootverbose) + printf("Creating DISK %s%d\n", cdevsw->d_name, unit); + dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART), + UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit); + + dev->si_disk = dp; + dp->d_dev = dev; + dp->d_dsflags = flags; + dp->d_devsw = cdevsw; + LIST_INSERT_HEAD(&disklist, dp, d_list); + + return (dev); +} + +static int +diskdumpconf(u_int onoff, dev_t dev, struct disk *dp) +{ + struct dumperinfo di; + struct disklabel *dl; + + if (!onoff) + return(set_dumper(NULL)); + dl = dsgetlabel(dev, dp->d_slice); + if (!dl) + return (ENXIO); + bzero(&di, sizeof di); + di.dumper = (dumper_t *)dp->d_devsw->d_dump; + di.priv = dp->d_dev; + di.blocksize = dl->d_secsize; + di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset + + dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE; + di.mediasize = + (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE; + return(set_dumper(&di)); +} + +void +disk_invalidate (struct disk *disk) +{ + if (disk->d_slice) + dsgone(&disk->d_slice); +} + +void +disk_destroy(dev_t dev) +{ + LIST_REMOVE(dev->si_disk, d_list); + bzero(dev->si_disk, sizeof(*dev->si_disk)); + dev->si_disk = NULL; + destroy_dev(dev); + return; +} + +struct disk * +disk_enumerate(struct disk *disk) +{ + if (!disk) + return (LIST_FIRST(&disklist)); + else + return (LIST_NEXT(disk, d_list)); +} + +static int +sysctl_disks(SYSCTL_HANDLER_ARGS) +{ + struct disk *disk; + int error, first; + + disk = NULL; + first = 1; + + while ((disk = disk_enumerate(disk))) { + if (!first) { + error = SYSCTL_OUT(req, " ", 1); + if (error) + return error; + } else { + first = 0; + } + error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name)); + if (error) + return error; + } + error = SYSCTL_OUT(req, "", 1); + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, NULL, + sysctl_disks, "A", "names of available disks"); + +/* + * The cdevsw functions + */ + +static int +diskopen(dev_t dev, int oflags, int devtype, struct thread *td) +{ + dev_t pdev; + struct disk *dp; + int error; + + error = 0; + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + + while (dp->d_flags & DISKFLAG_LOCK) { + dp->d_flags |= DISKFLAG_WANTED; + error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz); + if (error) + return (error); + } + dp->d_flags |= DISKFLAG_LOCK; + + if (!dsisopen(dp->d_slice)) { + if (!pdev->si_iosize_max) + pdev->si_iosize_max = dev->si_iosize_max; + 
error = dp->d_devsw->d_open(pdev, oflags, devtype, td); + } + + /* Inherit properties from the whole/raw dev_t */ + inherit_raw(pdev, dev); + + if (error) + goto out; + + error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, &dp->d_label); + + if (!dsisopen(dp->d_slice)) + dp->d_devsw->d_close(pdev, oflags, devtype, td); +out: + dp->d_flags &= ~DISKFLAG_LOCK; + if (dp->d_flags & DISKFLAG_WANTED) { + dp->d_flags &= ~DISKFLAG_WANTED; + wakeup(dp); + } + + return(error); +} + +static int +diskclose(dev_t dev, int fflag, int devtype, struct thread *td) +{ + struct disk *dp; + int error; + dev_t pdev; + + error = 0; + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + dsclose(dev, devtype, dp->d_slice); + if (!dsisopen(dp->d_slice)) + error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td); + return (error); +} + +static void +diskstrategy(struct bio *bp) +{ + dev_t pdev; + struct disk *dp; + + pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + bp->bio_resid = bp->bio_bcount; + if (dp != bp->bio_dev->si_disk) + inherit_raw(pdev, bp->bio_dev); + + if (!dp) { + biofinish(bp, NULL, ENXIO); + return; + } + + if (dscheck(bp, dp->d_slice) <= 0) { + biodone(bp); + return; + } + + if (bp->bio_bcount == 0) { + biodone(bp); + return; + } + + KASSERT(dp->d_devsw != NULL, ("NULL devsw")); + KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy")); + dp->d_devsw->d_strategy(bp); + return; + +} + +static int +diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + struct disk *dp; + int error; + u_int u; + dev_t pdev; + + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + if (cmd == DIOCSKERNELDUMP) { + u = *(u_int *)data; + return (diskdumpconf(u, dev, dp)); + } + if (cmd == DIOCGFRONTSTUFF) { + *(off_t *)data = 8192; /* XXX: crude but enough) */ + return (0); + } + error = dsioctl(dev, cmd, data, fflag, &dp->d_slice); + if (error == ENOIOCTL) + error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td); + return (error); +} + +static int +diskpsize(dev_t dev) +{ + struct disk *dp; + dev_t pdev; + + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (-1); + if (dp != dev->si_disk) { + dev->si_drv1 = pdev->si_drv1; + dev->si_drv2 = pdev->si_drv2; + /* XXX: don't set bp->b_dev->si_disk (?) */ + } + return (dssize(dev, &dp->d_slice)); +} + +SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD, + 0, sizeof(struct disklabel), "sizeof(struct disklabel)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD, + 0, sizeof(struct diskslices), "sizeof(struct diskslices)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD, + 0, sizeof(struct disk), "sizeof(struct disk)"); + +#endif diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..e149687 --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> +#include <machine/atomic.h> + +#ifdef notquite +/* + * Mutex to use when delaying niced I/O bound processes in bioqdisksort(). + */ +static struct mtx dksort_mtx; +static void +dksort_init(void) +{ + + mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF); +} +SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL) +#endif + +/* + * Seek sort for disks. + * + * The buf_queue keep two queues, sorted in ascending block order. The first + * queue holds those requests which are positioned after the current block + * (in the first request); the second, which starts at queue->switch_point, + * holds requests which came in after their block number was passed. Thus + * we implement a one way scan, retracting after reaching the end of the drive + * to the first request on the second queue, at which time it becomes the + * first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. 
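The bioqdisksort() comment above describes a one-way (elevator-style) scan: requests at or beyond the current position are served in ascending block order, then the scan retracts to the requests that arrived behind the head. A small user-level illustration of the resulting service order; it deliberately ignores the insert_point locking and arrival-time subtleties and is not the kernel insertion code:

#include <stdio.h>
#include <stdlib.h>

static int
cmp(const void *a, const void *b)
{
	return (*(const int *)a - *(const int *)b);
}

int
main(void)
{
	int head = 500;				/* current head position */
	int blocks[] = { 700, 100, 650, 300, 900 };
	int first[5], second[5];
	int i, nf = 0, ns = 0;

	/* Partition into the two queues the comment describes... */
	for (i = 0; i < 5; i++) {
		if (blocks[i] >= head)
			first[nf++] = blocks[i];
		else
			second[ns++] = blocks[i];
	}
	/* ...and keep each queue in ascending block order. */
	qsort(first, nf, sizeof(int), cmp);
	qsort(second, ns, sizeof(int), cmp);

	printf("service order:");
	for (i = 0; i < nf; i++)
		printf(" %d", first[i]);
	for (i = 0; i < ns; i++)
		printf(" %d", second[i]);
	printf("\n");		/* -> 650 700 900 100 300 */
	return (0);
}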
+ */ + +void +bioqdisksort(bioq, bp) + struct bio_queue_head *bioq; + struct bio *bp; +{ + struct bio *bq; + struct bio *bn; + struct bio *be; + +#ifdef notquite + struct thread *td = curthread; + + if (td && td->td_ksegrp->kg_nice > 0) { + TAILQ_FOREACH(bn, &bioq->queue, bio_queue) + if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp) + break; + if (bn != NULL) { + mtx_lock(&dksort_mtx); + msleep((caddr_t)&dksort_mtx, &dksort_mtx, + PPAUSE | PCATCH | PDROP, "ioslow", + td->td_ksegrp->kg_nice); + } + } +#endif + if (!atomic_cmpset_int(&bioq->busy, 0, 1)) + panic("Recursing in bioqdisksort()"); + be = TAILQ_LAST(&bioq->queue, bio_queue); + /* + * If the queue is empty or we are an + * ordered transaction, then it's easy. + */ + if ((bq = bioq_first(bioq)) == NULL) { + bioq_insert_tail(bioq, bp); + bioq->busy = 0; + return; + } else if (bioq->insert_point != NULL) { + + /* + * A certain portion of the list is + * "locked" to preserve ordering, so + * we can only insert after the insert + * point. + */ + bq = bioq->insert_point; + } else { + + /* + * If we lie before the last removed (currently active) + * request, and are not inserting ourselves into the + * "locked" portion of the list, then we must add ourselves + * to the second request list. + */ + if (bp->bio_pblkno < bioq->last_pblkno) { + + bq = bioq->switch_point; + /* + * If we are starting a new secondary list, + * then it's easy. + */ + if (bq == NULL) { + bioq->switch_point = bp; + bioq_insert_tail(bioq, bp); + bioq->busy = 0; + return; + } + /* + * If we lie ahead of the current switch point, + * insert us before the switch point and move + * the switch point. + */ + if (bp->bio_pblkno < bq->bio_pblkno) { + bioq->switch_point = bp; + TAILQ_INSERT_BEFORE(bq, bp, bio_queue); + bioq->busy = 0; + return; + } + } else { + if (bioq->switch_point != NULL) + be = TAILQ_PREV(bioq->switch_point, + bio_queue, bio_queue); + /* + * If we lie between last_pblkno and bq, + * insert before bq. + */ + if (bp->bio_pblkno < bq->bio_pblkno) { + TAILQ_INSERT_BEFORE(bq, bp, bio_queue); + bioq->busy = 0; + return; + } + } + } + + /* + * Request is at/after our current position in the list. + * Optimize for sequential I/O by seeing if we go at the tail. + */ + if (bp->bio_pblkno > be->bio_pblkno) { + TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue); + bioq->busy = 0; + return; + } + + /* Otherwise, insertion sort */ + while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) { + + /* + * We want to go after the current request if it is the end + * of the first request list, or if the next request is a + * larger cylinder than our request. + */ + if (bn == bioq->switch_point + || bp->bio_pblkno < bn->bio_pblkno) + break; + bq = bn; + } + TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue); + bioq->busy = 0; +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be filled in before calling us. + * Returns NULL on success and an error string on failure. 
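readdisklabel(), which follows, expects the caller to pre-fill the label fields its strategy path needs and treats any non-NULL return as a diagnostic string. A hedged sketch of a caller; the dummy geometry and the error value are illustrative assumptions:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/disklabel.h>

static int
mydisk_read_label(dev_t dev, struct disklabel *lp)
{
	char *msg;

	bzero(lp, sizeof(*lp));
	lp->d_secsize = DEV_BSIZE;
	lp->d_secpercyl = 1;		/* dummy geometry, per the comment above */
	lp->d_partitions[RAW_PART].p_size = 0x7fffffff;	/* dummy bounds */

	msg = readdisklabel(dev, lp);
	if (msg != NULL) {
		printf("%s: %s\n", dev->si_name, msg);
		return (EINVAL);
	}
	return (0);
}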
+ */ +char * +readdisklabel(dev, lp) + dev_t dev; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp)) + msg = "I/O error"; + else if (bp->b_resid != 0) + msg = "disk too small for a label"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register int i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... + */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, lp) + dev_t dev; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int error = 0; + + if (lp->d_partitions[RAW_PART].p_offset != 0) + return (EXDEV); /* not quite right */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, RAW_PART); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) 
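The label routines above all lean on dkcksum() to decide whether a label is valid. The traditional BSD checksum is the XOR of the 16-bit words of the label up to and including its partition table, so a label whose d_checksum field was stored correctly XORs to zero. A self-contained sketch of that idea over a generic buffer, not the real struct disklabel:

#include <stdint.h>
#include <stdio.h>

/* XOR of the 16-bit words in a region (len is rounded down to words). */
static uint16_t
xor16(const void *buf, size_t len)
{
        const uint16_t *p = buf;
        uint16_t sum = 0;

        for (len /= 2; len > 0; len--)
                sum ^= *p++;
        return (sum);
}

int
main(void)
{
        /* Toy "label": the last word plays the role of d_checksum. */
        uint16_t label[8] = { 0x8257, 0x1234, 0xabcd, 7, 63, 16, 255, 0 };

        label[7] = xor16(label, sizeof(label));         /* store checksum */
        printf("verify: %#x\n", xor16(label, sizeof(label)));   /* 0 */
        return (0);
}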
+ */ + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + error = bufwait(bp); + if (error) + goto done; + if (bp->b_resid != 0) { + error = ENOSPC; + goto done; + } + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~B_DONE; + bp->b_iocmd = BIO_WRITE; +#ifdef __alpha__ + alpha_fix_srm_checksum(bp); +#endif + DEV_STRATEGY(bp, 1); + error = bufwait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_WRITE; + DEV_STRATEGY(bp, 1); + error = bufwait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf. + * The message should be completed with at least a newline. There is no + * trailing space. + */ +void +diskerr(bp, what, blkdone, lp) + struct bio *bp; + char *what; + int blkdone; + register struct disklabel *lp; +{ + int part = dkpart(bp->bio_dev); + char partname[2]; + char *sname; + daddr_t sn; + + *partname = '\0'; + sname = bp->bio_dev->si_name; + printf("%s%s: %s %sing fsbn ", sname, partname, what, + bp->bio_cmd == BIO_READ ? "read" : "writ"); + sn = bp->bio_blkno; + if (bp->bio_bcount <= DEV_BSIZE) + printf("%jd", (intmax_t)sn); + else { + if (blkdone >= 0) { + sn += blkdone; + printf("%jd of ", (intmax_t)sn); + } + printf("%ld-%ld", (long)bp->bio_blkno, + (long)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE)); + } + if (lp && (blkdone >= 0 || bp->bio_bcount <= lp->d_secsize)) { + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + printf(" (%s bn %jd; cn %jd", sname, (intmax_t)sn, + (intmax_t)(sn / lp->d_secpercyl)); + sn %= lp->d_secpercyl; + printf(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors), + (long)(sn % lp->d_nsectors)); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..40d5b2d --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,544 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef PC98 +#define PC98_ATCOMPAT +#define dsinit atcompat_dsinit +#endif +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_EXTENDEDX 15 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +/* + * This is what we have embedded in every boot1 for supporting the bogus + * "Dangerously Dedicated" mode. However, the old table is broken because + * it has an illegal geometry in it - it specifies 256 heads (heads = end + * head + 1) which causes nasty stuff when that wraps to zero in bios code. + * eg: divide by zero etc. This caused the dead-thinkpad problem, numerous + * SCSI bios crashes, EFI to crash, etc. + * + * We still have to recognize the old table though, even though we stopped + * inflicting it apon the world. 
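check_part() below compares each partition entry's C/H/S start and end against its LBA start and size. The conversion it relies on is the usual one, sketched here with sectors 1-based and heads and cylinders 0-based; the names are illustrative only:

#include <stdio.h>

static unsigned long
chs_to_lba(unsigned cyl, unsigned head, unsigned sect,
    unsigned nsectors, unsigned ntracks)
{
        /* secpercyl = nsectors * ntracks, as in check_part(). */
        return (((unsigned long)cyl * ntracks + head) * nsectors + sect - 1);
}

int
main(void)
{
        /* 1023/254/63 with 63 sectors, 255 tracks: the usual pure-LBA cap. */
        printf("%lu\n", chs_to_lba(1023, 254, 63, 63, 255)); /* 16450559 */
        return (0);
}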
+ */ +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; +static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, }, +}; + +static int check_part(char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset); +static void mbr_extended(dev_t dev, struct disklabel *lp, + struct diskslices *ssp, u_long ext_offset, + u_long ext_size, u_long base_ext_offset, + int nsectors, int ntracks, u_long mbr_offset, + int level); +static int mbr_setslice(char *sname, struct disklabel *lp, + struct diskslice *sp, struct dos_partition *dp, + u_long br_offset); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. Always allow + * 1023/255/63, because this is the official way to represent + * pure-LBA for the starting position. + */ + if ((ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (secpercyl != 0 + && (ssector1 - ssector) % (1024 * secpercyl) == 0))) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* + * Allow certain bogus C/H/S values for esector, as above. However, + * heads == 255 isn't really legal and causes some BIOS crashes. The + * correct value to indicate a pure-LBA end is 1023/heads-1/sectors - + * usually 1023/254/63. "heads" is base 0, "sectors" is base 1. + */ + if ((esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (secpercyl != 0 + && (esector1 - esector) % (1024 * secpercyl) == 0))) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, + (u_long)dp->dp_size, error ? 
"" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dev, lp, sspp) + dev_t dev; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + struct dos_partition dpcopy[NDOSPART]; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp) != 0) { + diskerr(&bp->b_io, "reading primary partition table: error", + 0, (struct disklabel *)NULL); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. */ + cp = bp->b_data; + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + if (bootverbose) + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + + /* Make a copy of the partition table to avoid alignment problems. */ + memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy)); + + dp0 = &dpcopy[0]; + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0 || + bcmp(dp0, historical_bogus_partition_table_fixed, + sizeof historical_bogus_partition_table_fixed) == 0) { + if (bootverbose) + printf( + "%s: invalid primary partition table: Dangerously Dedicated (ignored)\n", + sname); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. + * Check against d_secperunit if the latter is reliable. 
+ */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * We are passed a pointer to a suitably initialized minimal + * slices "struct" with no dangling pointers in it. Replace it + * by a maximal one. This usually oversizes the "struct", but + * enlarging it while searching for logical drives would be + * inconvenient. + */ + free(*sspp, M_DEVBUF); + ssp = dsmakeslicestruct(MAX_SLICES, lp); + *sspp = ssp; + + /* Initialize normal slices. */ + sp = &ssp->dss_slices[BASE_SLICE]; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + (void)mbr_setslice(sname, lp, sp, dp, mbr_offset); + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED || + sp->ds_type == DOSPTYP_EXTENDEDX) + mbr_extended(bp->b_dev, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset, 1); + + /* + * mbr_extended() abuses ssp->dss_nslices for the number of slices + * that would be found if there were no limit on the number of slices + * in *ssp. Cut it back now. + */ + if (ssp->dss_nslices > MAX_SLICES) + ssp->dss_nslices = MAX_SLICES; + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +mbr_extended(dev, lp, ssp, ext_offset, ext_size, base_ext_offset, nsectors, + ntracks, mbr_offset, level) + dev_t dev; + struct disklabel *lp; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; + int level; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition dpcopy[NDOSPART]; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + if (level >= 16) { + printf( + "%s: excessive recursion in search for slices; aborting search\n", + devtoname(dev)); + return; + } + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp) != 0) { + diskerr(&bp->b_io, "reading extended partition table: error", + 0, (struct disklabel *)NULL); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_data; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (bootverbose) + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + /* Make a copy of the partition table to avoid alignment problems. */ + memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy)); + + slice = ssp->dss_nslices; + for (dospart = 0, dp = &dpcopy[0]; dospart < NDOSPART; + dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED || + dp->dp_typ == DOSPTYP_EXTENDEDX) { + static char buf[32]; + + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + snprintf(buf, sizeof(buf), "%s", sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dev, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp = &ssp->dss_slices[slice]; + if (mbr_setslice(sname, lp, sp, dp, ext_offset) != 0) + continue; + slice++; + } + } + ssp->dss_nslices = slice; + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + mbr_extended(dev, lp, ssp, ext_offsets[dospart], + ext_sizes[dospart], base_ext_offset, + nsectors, ntracks, mbr_offset, ++level); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} + +static int +mbr_setslice(sname, lp, sp, dp, br_offset) + char *sname; + struct disklabel *lp; + struct diskslice *sp; + struct dos_partition *dp; + u_long br_offset; +{ + u_long offset; + u_long size; + + offset = br_offset + dp->dp_start; + if (offset > lp->d_secperunit || offset < br_offset) { + printf( + "%s: slice starts beyond end of the disk: rejecting it\n", + sname); + return (1); + } + size = lp->d_secperunit - offset; + if (size >= dp->dp_size) + size = dp->dp_size; + else + printf( +"%s: slice extends beyond end of disk: truncating from %lu to %lu sectors\n", + sname, (u_long)dp->dp_size, size); + sp->ds_offset = offset; + sp->ds_size = size; + sp->ds_type = dp->dp_typ; +#ifdef PC98_ATCOMPAT + /* Fake FreeBSD(98). */ + if (sp->ds_type == DOSPTYP_386BSD) + sp->ds_type = 0x94; +#endif +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart | DSTYPE_INDOSPART; +#endif + return (0); +} + +#ifdef __alpha__ +void +alpha_fix_srm_checksum(bp) + struct buf *bp; +{ + u_int64_t *p; + u_int64_t sum; + int i; + + p = (u_int64_t *) bp->b_data; + sum = 0; + for (i = 0; i < 63; i++) + sum += p[i]; + p[63] = sum; +} +#endif diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..ec6099e --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,997 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/stdint.h> +#include <sys/syslog.h> +#include <sys/vnode.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static struct disklabel *clone_label(struct disklabel *lp); +static void dsiodone(struct bio *bp); +static char *fixlabel(char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag); +static void free_ds_label(struct diskslices *ssp, int slice); +static void partition_info(char *sname, int part, struct partition *pp); +static void slice_info(char *sname, struct diskslice *sp); +static void set_ds_label(struct diskslices *ssp, int slice, + struct disklabel *lp); +static void set_ds_labeldevs(dev_t dev, struct diskslices *ssp); +static void set_ds_wlabel(struct diskslices *ssp, int slice, + int wlabel); + +/* + * Duplicate a label for the whole disk, and initialize defaults in the + * copy for fields that are not already initialized. The caller only + * needs to initialize d_secsize and d_secperunit, and zero the fields + * that are to be defaulted. 
+ */ +static struct disklabel * +clone_label(lp) + struct disklabel *lp; +{ + struct disklabel *lp1; + + lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK); + *lp1 = *lp; + lp = NULL; + if (lp1->d_typename[0] == '\0') + strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename)); + if (lp1->d_packname[0] == '\0') + strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname)); + if (lp1->d_nsectors == 0) + lp1->d_nsectors = 32; + if (lp1->d_ntracks == 0) + lp1->d_ntracks = 64; + lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks; + lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl; + if (lp1->d_rpm == 0) + lp1->d_rpm = 3600; + if (lp1->d_interleave == 0) + lp1->d_interleave = 1; + if (lp1->d_npartitions < RAW_PART + 1) + lp1->d_npartitions = MAXPARTITIONS; + if (lp1->d_bbsize == 0) + lp1->d_bbsize = BBSIZE; + lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit; + lp1->d_magic = DISKMAGIC; + lp1->d_magic2 = DISKMAGIC; + lp1->d_checksum = dkcksum(lp1); + return (lp1); +} + +dev_t +dkmodpart(dev_t dev, int part) +{ + return (makedev(major(dev), (minor(dev) & ~7) | part)); +} + +dev_t +dkmodslice(dev_t dev, int slice) +{ + return (makedev(major(dev), (minor(dev) & ~0x1f0000) | (slice << 16))); +} + +u_int +dkunit(dev_t dev) +{ + return (((minor(dev) >> 16) & 0x1e0) | ((minor(dev) >> 3) & 0x1f)); +} + +/* + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + * + * XXX TODO: + * o Split buffers that are too big for the device. + * o Check for overflow. + * o Finish cleaning this up. + */ +int +dscheck(bp, ssp) + struct bio *bp; + struct diskslices *ssp; +{ + daddr_t blkno; + daddr_t endsecno; + daddr_t labelsect; + struct disklabel *lp; + char *msg; + long nsec; + struct partition *pp; + daddr_t secno; + daddr_t slicerel_secno; + struct diskslice *sp; + + blkno = bp->bio_blkno; + if (blkno < 0) { + printf("dscheck(%s): negative bio_blkno %ld\n", + devtoname(bp->bio_dev), (long)blkno); + bp->bio_error = EINVAL; + goto bad; + } + sp = &ssp->dss_slices[dkslice(bp->bio_dev)]; + lp = sp->ds_label; + if (ssp->dss_secmult == 1) { + if (bp->bio_bcount % (u_long)DEV_BSIZE) + goto bad_bcount; + secno = blkno; + nsec = bp->bio_bcount >> DEV_BSHIFT; + } else if (ssp->dss_secshift != -1) { + if (bp->bio_bcount & (ssp->dss_secsize - 1)) + goto bad_bcount; + if (blkno & (ssp->dss_secmult - 1)) + goto bad_blkno; + secno = blkno >> ssp->dss_secshift; + nsec = bp->bio_bcount >> (DEV_BSHIFT + ssp->dss_secshift); + } else { + if (bp->bio_bcount % ssp->dss_secsize) + goto bad_bcount; + if (blkno % ssp->dss_secmult) + goto bad_blkno; + secno = blkno / ssp->dss_secmult; + nsec = bp->bio_bcount / ssp->dss_secsize; + } + if (lp == NULL) { + labelsect = -LABELSECTOR - 1; + endsecno = sp->ds_size; + slicerel_secno = secno; + } else { + labelsect = lp->d_partitions[LABEL_PART].p_offset; +if (labelsect != 0) Debugger("labelsect != 0 in dscheck()"); + pp = &lp->d_partitions[dkpart(bp->bio_dev)]; + endsecno = pp->p_size; + slicerel_secno = pp->p_offset + secno; + } + + /* overwriting disk label ? */ + /* XXX should also protect bootstrap in first 8K */ + if (slicerel_secno <= LABELSECTOR + labelsect && +#if LABELSECTOR != 0 + slicerel_secno + nsec > LABELSECTOR + labelsect && +#endif + (bp->bio_cmd == BIO_WRITE) && sp->ds_wlabel == 0) { + bp->bio_error = EROFS; + goto bad; + } + +#if defined(DOSBBSECTOR) && defined(notyet) + /* overwriting master boot record? 
*/ + if (slicerel_secno <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && + sp->ds_wlabel == 0) { + bp->bio_error = EROFS; + goto bad; + } +#endif + + /* beyond partition? */ + if ((uintmax_t)secno + nsec > endsecno) { + /* if exactly at end of disk, return an EOF */ + if (secno == endsecno) { + bp->bio_resid = bp->bio_bcount; + return (0); + } + /* or truncate if part of it fits */ + if (secno > endsecno) { + bp->bio_error = EINVAL; + goto bad; + } + bp->bio_bcount = (endsecno - secno) * ssp->dss_secsize; + } + + bp->bio_pblkno = sp->ds_offset + slicerel_secno; + + /* + * Snoop on label accesses if the slice offset is nonzero. Fudge + * offsets in the label to keep the in-core label coherent with + * the on-disk one. + */ + if (slicerel_secno <= LABELSECTOR + labelsect +#if LABELSECTOR != 0 + && slicerel_secno + nsec > LABELSECTOR + labelsect +#endif + && sp->ds_offset != 0) { + struct iodone_chain *ic; + + ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK); + ic->ic_prev_flags = bp->bio_flags; + ic->ic_prev_iodone = bp->bio_done; + ic->ic_prev_iodone_chain = bp->bio_done_chain; + ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - + slicerel_secno) * ssp->dss_secsize; + ic->ic_args[1].ia_ptr = sp; + bp->bio_done = dsiodone; + bp->bio_done_chain = ic; + if (!(bp->bio_cmd == BIO_READ)) { + /* + * XXX even disklabel(8) writes directly so we need + * to adjust writes. Perhaps we should drop support + * for DIOCWLABEL (always write protect labels) and + * require the use of DIOCWDINFO. + * + * XXX probably need to copy the data to avoid even + * temporarily corrupting the in-core copy. + */ + /* XXX need name here. */ + msg = fixlabel((char *)NULL, sp, + (struct disklabel *) + (bp->bio_data + ic->ic_args[0].ia_long), + TRUE); + if (msg != NULL) { + printf("dscheck(%s): %s\n", + devtoname(bp->bio_dev), msg); + bp->bio_error = EROFS; + goto bad; + } + } + } + return (1); + +bad_bcount: + printf( + "dscheck(%s): bio_bcount %ld is not on a sector boundary (ssize %d)\n", + devtoname(bp->bio_dev), bp->bio_bcount, ssp->dss_secsize); + bp->bio_error = EINVAL; + goto bad; + +bad_blkno: + printf( + "dscheck(%s): bio_blkno %ld is not on a sector boundary (ssize %d)\n", + devtoname(bp->bio_dev), (long)blkno, ssp->dss_secsize); + bp->bio_error = EINVAL; + goto bad; + +bad: + bp->bio_resid = bp->bio_bcount; + bp->bio_flags |= BIO_ERROR; + return (-1); +} + +void +dsclose(dev, mode, ssp) + dev_t dev; + int mode; + struct diskslices *ssp; +{ + u_char mask; + struct diskslice *sp; + + sp = &ssp->dss_slices[dkslice(dev)]; + mask = 1 << dkpart(dev); + sp->ds_openmask &= ~mask; +} + +void +dsgone(sspp) + struct diskslices **sspp; +{ + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + free_ds_label(ssp, slice); + } + free(ssp, M_DEVBUF); + *sspp = NULL; +} + +/* + * For the "write" commands (DIOCSDINFO and DIOCWDINFO), this + * is subject to the same restriction as dsopen(). 
+ */ +int +dsioctl(dev, cmd, data, flags, sspp) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct diskslices **sspp; +{ + int error; + struct disklabel *lp; + int old_wlabel; + u_char openmask; + int part; + int slice; + struct diskslice *sp; + struct diskslices *ssp; + struct partition *pp; + + slice = dkslice(dev); + ssp = *sspp; + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + switch (cmd) { + + case DIOCGDVIRGIN: + lp = (struct disklabel *)data; + if (ssp->dss_slices[WHOLE_DISK_SLICE].ds_label) { + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + } else { + bzero(lp, sizeof(struct disklabel)); + } + + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + pp = &lp->d_partitions[RAW_PART]; + pp->p_offset = 0; + pp->p_size = sp->ds_size; + + lp->d_npartitions = MAXPARTITIONS; + if (lp->d_interleave == 0) + lp->d_interleave = 1; + if (lp->d_rpm == 0) + lp->d_rpm = 3600; + if (lp->d_nsectors == 0) + lp->d_nsectors = 32; + if (lp->d_ntracks == 0) + lp->d_ntracks = 64; + + lp->d_bbsize = BBSIZE; + lp->d_sbsize = 0; + lp->d_secpercyl = lp->d_nsectors * lp->d_ntracks; + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (0); + + case DIOCGDINFO: + if (lp == NULL) + return (EINVAL); + *(struct disklabel *)data = *lp; + return (0); + + case DIOCGSECTORSIZE: + if (lp == NULL) + return (EINVAL); + *(u_int *)data = lp->d_secsize; + return (0); + + case DIOCGMEDIASIZE: + if (lp == NULL) + return (EINVAL); + *(off_t *)data = (off_t)lp->d_partitions[dkpart(dev)].p_size * + lp->d_secsize; + return (0); + + case DIOCGSLICEINFO: + bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] - + (char *)ssp); + return (0); + + case DIOCSDINFO: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + if (sp->ds_label == NULL) + bzero(lp, sizeof *lp); + else + bcopy(sp->ds_label, lp, sizeof *lp); + if (sp->ds_label == NULL) + openmask = 0; + else { + openmask = sp->ds_openmask; + if (slice == COMPATIBILITY_SLICE) + openmask |= ssp->dss_slices[ + ssp->dss_first_bsd_slice].ds_openmask; + else if (slice == ssp->dss_first_bsd_slice) + openmask |= ssp->dss_slices[ + COMPATIBILITY_SLICE].ds_openmask; + } + error = setdisklabel(lp, (struct disklabel *)data, + (u_long)openmask); + /* XXX why doesn't setdisklabel() check this? */ + if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0) + error = EXDEV; + if (error == 0) { + if (lp->d_secperunit > sp->ds_size) + error = ENOSPC; + for (part = 0; part < lp->d_npartitions; part++) + if (lp->d_partitions[part].p_size > sp->ds_size) + error = ENOSPC; + } + if (error != 0) { + free(lp, M_DEVBUF); + return (error); + } + free_ds_label(ssp, slice); + set_ds_label(ssp, slice, lp); + set_ds_labeldevs(dev, ssp); + return (0); + + case DIOCSYNCSLICEINFO: + if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART) + return (EINVAL); + if (!*(int *)data) + for (slice = 0; slice < ssp->dss_nslices; slice++) { + openmask = ssp->dss_slices[slice].ds_openmask; + if (openmask + && (slice != WHOLE_DISK_SLICE + || openmask & ~(1 << RAW_PART))) + return (EBUSY); + } + + /* + * Temporarily forget the current slices struct and read + * the current one. + * XXX should wait for current accesses on this disk to + * complete, then lock out future accesses and opens. 
+ */ + *sspp = NULL; + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + error = dsopen(dev, S_IFCHR, ssp->dss_oflags, sspp, lp); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (error); + } + + /* + * Reopen everything. This is a no-op except in the "force" + * case and when the raw bdev and cdev are both open. Abort + * if anything fails. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + for (openmask = ssp->dss_slices[slice].ds_openmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dkmodslice(dkmodpart(dev, part), + slice), + S_IFCHR, ssp->dss_oflags, sspp, + lp); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (EBUSY); + } + } + } + + free(lp, M_DEVBUF); + dsgone(&ssp); + return (0); + + case DIOCWDINFO: + error = dsioctl(dev, DIOCSDINFO, data, flags, &ssp); + if (error != 0) + return (error); + /* + * XXX this used to hack on dk_openpart to fake opening + * partition 0 in case that is used instead of dkpart(dev). + */ + old_wlabel = sp->ds_wlabel; + set_ds_wlabel(ssp, slice, TRUE); + error = writedisklabel(dev, sp->ds_label); + /* XXX should invalidate in-core label if write failed. */ + set_ds_wlabel(ssp, slice, old_wlabel); + return (error); + + case DIOCWLABEL: +#ifndef __alpha__ + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); +#endif + if (!(flags & FWRITE)) + return (EBADF); + set_ds_wlabel(ssp, slice, *(int *)data != 0); + return (0); + + default: + return (ENOIOCTL); + } +} + +static void +dsiodone(bp) + struct bio *bp; +{ + struct iodone_chain *ic; + char *msg; + + ic = bp->bio_done_chain; + bp->bio_done = ic->ic_prev_iodone; + bp->bio_done_chain = ic->ic_prev_iodone_chain; + if (!(bp->bio_cmd == BIO_READ) + || (!(bp->bio_flags & BIO_ERROR) && bp->bio_error == 0)) { + msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr, + (struct disklabel *) + (bp->bio_data + ic->ic_args[0].ia_long), + FALSE); + if (msg != NULL) + printf("%s\n", msg); + } + free(ic, M_DEVBUF); + biodone(bp); +} + +int +dsisopen(ssp) + struct diskslices *ssp; +{ + int slice; + + if (ssp == NULL) + return (0); + for (slice = 0; slice < ssp->dss_nslices; slice++) + if (ssp->dss_slices[slice].ds_openmask) + return (1); + return (0); +} + +/* + * Allocate a slices "struct" and initialize it to contain only an empty + * compatibility slice (pointing to itself), a whole disk slice (covering + * the disk as described by the label), and (nslices - BASE_SLICES) empty + * slices beginning at BASE_SLICE. 
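dsmakeslicestruct() below sizes its allocation with offsetof() plus a variable number of trailing slice slots, a common header-plus-trailing-array idiom. A small standalone sketch of the pattern with stand-in types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot {
        unsigned long   offset;
        unsigned long   size;
};

struct table {
        int             nslots;
        struct slot     slots[1];       /* really nslots entries */
};

static struct table *
table_alloc(int nslots)
{
        struct table *tp;

        tp = malloc(offsetof(struct table, slots) +
            nslots * sizeof(struct slot));
        if (tp == NULL)
                return (NULL);
        tp->nslots = nslots;
        memset(tp->slots, 0, nslots * sizeof(struct slot));
        return (tp);
}

int
main(void)
{
        struct table *tp = table_alloc(8);

        if (tp != NULL) {
                printf("%d slots\n", tp->nslots);
                free(tp);
        }
        return (0);
}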
+ */ +struct diskslices * +dsmakeslicestruct(nslices, lp) + int nslices; + struct disklabel *lp; +{ + struct diskslice *sp; + struct diskslices *ssp; + + ssp = malloc(offsetof(struct diskslices, dss_slices) + + nslices * sizeof *sp, M_DEVBUF, M_WAITOK); + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = nslices; + ssp->dss_oflags = 0; + ssp->dss_secmult = lp->d_secsize / DEV_BSIZE; + if (ssp->dss_secmult & (ssp->dss_secmult - 1)) + ssp->dss_secshift = -1; + else + ssp->dss_secshift = ffs(ssp->dss_secmult) - 1; + ssp->dss_secsize = lp->d_secsize; + sp = &ssp->dss_slices[0]; + bzero(sp, nslices * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + return (ssp); +} + +char * +dsname(dev, unit, slice, part, partname) + dev_t dev; + int unit; + int slice; + int part; + char *partname; +{ + static char name[32]; + const char *dname; + + dname = devsw(dev)->d_name; + if (strlen(dname) > 16) + dname = "nametoolong"; + snprintf(name, sizeof(name), "%s%d", dname, unit); + partname[0] = '\0'; + if (slice != WHOLE_DISK_SLICE || part != RAW_PART) { + partname[0] = 'a' + part; + partname[1] = '\0'; + if (slice != COMPATIBILITY_SLICE) + snprintf(name + strlen(name), + sizeof(name) - strlen(name), "s%d", slice - 1); + } + return (name); +} + +/* + * This should only be called when the unit is inactive and the strategy + * routine should not allow it to become active unless we call it. Our + * strategy routine must be special to allow activity. + */ +int +dsopen(dev, mode, flags, sspp, lp) + dev_t dev; + int mode; + u_int flags; + struct diskslices **sspp; + struct disklabel *lp; +{ + dev_t dev1; + int error; + struct disklabel *lp1; + char *msg; + u_char mask; + int part; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + int unit; + + dev->si_bsize_phys = lp->d_secsize; + + unit = dkunit(dev); + if (lp->d_secsize % DEV_BSIZE) { + printf("%s: invalid sector size %lu\n", devtoname(dev), + (u_long)lp->d_secsize); + return (EINVAL); + } + + /* + * XXX reinitialize the slice table unless there is an open device + * on the unit. This should only be done if the media has changed. + */ + ssp = *sspp; + if (!dsisopen(ssp)) { + if (ssp != NULL) + dsgone(sspp); + /* + * Allocate a minimal slices "struct". This will become + * the final slices "struct" if we don't want real slices + * or if we can't find any real slices. + */ + *sspp = dsmakeslicestruct(BASE_SLICE, lp); + + if (!(flags & DSO_ONESLICE)) { + TRACE(("dsinit\n")); + error = dsinit(dev, lp, sspp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + ssp = *sspp; + ssp->dss_oflags = flags; + + /* + * If there are no real slices, then make the compatiblity + * slice cover the whole disk. + */ + if (ssp->dss_nslices == BASE_SLICE) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = lp->d_secperunit; + + /* Point the compatibility slice at the BSD slice, if any. */ + for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) { + ssp->dss_first_bsd_slice = slice; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset + = sp->ds_offset; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = sp->ds_size; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_type + = sp->ds_type; + break; + } + } + + ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp); + ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE; + } + + /* Initialize secondary info for all slices. 
*/ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_label != NULL +#ifdef __alpha__ + && slice != WHOLE_DISK_SLICE +#endif + ) + continue; + dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice); +#if 0 + sname = dsname(dev, unit, slice, RAW_PART, partname); +#else + *partname='\0'; + sname = dev1->si_name; +#endif + /* + * XXX this should probably only be done for the need_init + * case, but there may be a problem with DIOCSYNCSLICEINFO. + */ + set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */ + lp1 = clone_label(lp); + TRACE(("readdisklabel\n")); + if (flags & DSO_NOLABELS) + msg = NULL; + else { + msg = readdisklabel(dev1, lp1); + + /* + * readdisklabel() returns NULL for success, and an + * error string for failure. + * + * If there isn't a label on the disk, and if the + * DSO_COMPATLABEL is set, we want to use the + * faked-up label provided by the caller. + * + * So we set msg to NULL to indicate that there is + * no failure (since we have a faked-up label), + * free lp1, and then clone it again from lp. + * (In case readdisklabel() modified lp1.) + */ + if (msg != NULL && (flags & DSO_COMPATLABEL)) { + msg = NULL; + free(lp1, M_DEVBUF); + lp1 = clone_label(lp); + } + } + if (msg == NULL) + msg = fixlabel(sname, sp, lp1, FALSE); + if (msg == NULL && lp1->d_secsize != ssp->dss_secsize) + msg = "inconsistent sector size"; + if (msg != NULL) { + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) + log(LOG_WARNING, "%s: cannot find label (%s)\n", + sname, msg); + free(lp1, M_DEVBUF); + continue; + } + if (lp1->d_flags & D_BADSECT) { + log(LOG_ERR, "%s: bad sector table not supported\n", + sname); + free(lp1, M_DEVBUF); + continue; + } + set_ds_label(ssp, slice, lp1); + set_ds_labeldevs(dev1, ssp); + set_ds_wlabel(ssp, slice, FALSE); + } + + slice = dkslice(dev); + if (slice >= ssp->dss_nslices) + return (ENXIO); + sp = &ssp->dss_slices[slice]; + part = dkpart(dev); + if (part != RAW_PART + && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions)) + return (EINVAL); /* XXX needs translation */ + mask = 1 << part; + sp->ds_openmask |= mask; + return (0); +} + +int +dssize(dev, sspp) + dev_t dev; + struct diskslices **sspp; +{ + struct disklabel *lp; + int part; + int slice; + struct diskslices *ssp; + + slice = dkslice(dev); + part = dkpart(dev); + ssp = *sspp; + if (ssp == NULL || slice >= ssp->dss_nslices + || !(ssp->dss_slices[slice].ds_openmask & (1 << part))) { + if (devsw(dev)->d_open(dev, FREAD, S_IFCHR, + (struct thread *)NULL) != 0) + return (-1); + devsw(dev)->d_close(dev, FREAD, S_IFCHR, (struct thread *)NULL); + ssp = *sspp; + } + lp = ssp->dss_slices[slice].ds_label; + if (lp == NULL) + return (-1); + return ((int)lp->d_partitions[part].p_size); +} + +static void +free_ds_label(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; + free(lp, M_DEVBUF); + set_ds_label(ssp, slice, (struct disklabel *)NULL); +} + + +static char * +fixlabel(sname, sp, lp, writeflag) + char *sname; + struct diskslice *sp; + struct disklabel *lp; + int writeflag; +{ + u_long end; + u_long offset; + int part; + struct partition *pp; + u_long start; + bool_t warned; + + /* These errors "can't happen" so don't bother reporting details. 
*/ + if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC) + return ("fixlabel: invalid magic"); + if (dkcksum(lp) != 0) + return ("fixlabel: invalid checksum"); + + pp = &lp->d_partitions[RAW_PART]; + if (writeflag) { + start = 0; + offset = sp->ds_offset; + } else { + start = sp->ds_offset; + offset = -sp->ds_offset; + } + if (pp->p_offset != start) { + if (sname != NULL) { + printf( +"%s: rejecting BSD label: raw partition offset != slice offset\n", + sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + return ("fixlabel: raw partition offset != slice offset"); + } + if (pp->p_size != sp->ds_size) { + if (sname != NULL) { + printf("%s: raw partition size != slice size\n", sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + if (pp->p_size > sp->ds_size) { + if (sname == NULL) + return ("fixlabel: raw partition size > slice size"); + printf("%s: truncating raw partition\n", sname); + pp->p_size = sp->ds_size; + } + } + end = start + sp->ds_size; + if (start > end) + return ("fixlabel: slice wraps"); + if (lp->d_secpercyl <= 0) + return ("fixlabel: d_secpercyl <= 0"); + pp -= RAW_PART; + warned = FALSE; + for (part = 0; part < lp->d_npartitions; part++, pp++) { + if (pp->p_offset != 0 || pp->p_size != 0) { + if (pp->p_offset < start + || pp->p_offset + pp->p_size > end + || pp->p_offset + pp->p_size < pp->p_offset) { + if (sname != NULL) { + printf( +"%s: rejecting partition in BSD label: it isn't entirely within the slice\n", + sname); + if (!warned) { + slice_info(sname, sp); + warned = TRUE; + } + partition_info(sname, part, pp); + } + /* XXX else silently discard junk. */ + bzero(pp, sizeof *pp); + } else + pp->p_offset += offset; + } + } + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (NULL); +} + +static void +partition_info(sname, part, pp) + char *sname; + int part; + struct partition *pp; +{ + printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part, + (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1), + (u_long)pp->p_size); +} + +static void +slice_info(sname, sp) + char *sname; + struct diskslice *sp; +{ + printf("%s: start %lu, end %lu, size %lu\n", sname, + sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size); +} + +static void +set_ds_label(ssp, slice, lp) + struct diskslices *ssp; + int slice; + struct disklabel *lp; +{ + ssp->dss_slices[slice].ds_label = lp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp; +} + +static void +set_ds_labeldevs(dev, ssp) + dev_t dev; + struct diskslices *ssp; +{ +} + + +static void +set_ds_wlabel(ssp, slice, wlabel) + struct diskslices *ssp; + int slice; + int wlabel; +{ + ssp->dss_slices[slice].ds_wlabel = wlabel; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel; +} diff --git a/sys/kern/subr_eventhandler.c b/sys/kern/subr_eventhandler.c new file mode 100644 index 0000000..45b4302 --- /dev/null +++ b/sys/kern/subr_eventhandler.c @@ -0,0 +1,173 @@ +/*- + * Copyright (c) 1999 Michael Smith <msmith@freebsd.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/eventhandler.h> + +static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records"); + +/* List of 'slow' lists */ +static TAILQ_HEAD(, eventhandler_list) eventhandler_lists; +static int eventhandler_lists_initted = 0; +static struct mtx eventhandler_mutex; + +struct eventhandler_entry_generic +{ + struct eventhandler_entry ee; + void (* func)(void); +}; + +/* + * Initialize the eventhandler mutex and list. + */ +static void +eventhandler_init(void *dummy __unused) +{ + TAILQ_INIT(&eventhandler_lists); + mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF | MTX_RECURSE); + eventhandler_lists_initted = 1; +} +SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init, + NULL) + +/* + * Insertion is O(n) due to the priority scan, but optimises to O(1) + * if all priorities are identical. + */ +eventhandler_tag +eventhandler_register(struct eventhandler_list *list, char *name, + void *func, void *arg, int priority) +{ + struct eventhandler_entry_generic *eg; + struct eventhandler_entry *ep; + + KASSERT(eventhandler_lists_initted, ("eventhandler registered too early")); + + /* lock the eventhandler lists */ + mtx_lock(&eventhandler_mutex); + + /* Do we need to find/create the (slow) list? */ + if (list == NULL) { + /* look for a matching, existing list */ + list = eventhandler_find_list(name); + + /* Do we need to create the list? 
*/ + if (list == NULL) { + if ((list = malloc(sizeof(struct eventhandler_list) + strlen(name) + 1, + M_EVENTHANDLER, M_NOWAIT)) == NULL) { + mtx_unlock(&eventhandler_mutex); + return(NULL); + } + list->el_flags = 0; + bzero(&list->el_lock, sizeof(list->el_lock)); + list->el_name = (char *)list + sizeof(struct eventhandler_list); + strcpy(list->el_name, name); + TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link); + } + } + if (!(list->el_flags & EHE_INITTED)) { + TAILQ_INIT(&list->el_entries); + sx_init(&list->el_lock, name); + list->el_flags = EHE_INITTED; + } + mtx_unlock(&eventhandler_mutex); + + /* allocate an entry for this handler, populate it */ + if ((eg = malloc(sizeof(struct eventhandler_entry_generic), + M_EVENTHANDLER, M_NOWAIT)) == NULL) { + return(NULL); + } + eg->func = func; + eg->ee.ee_arg = arg; + eg->ee.ee_priority = priority; + + /* sort it into the list */ + EHE_LOCK(list); + for (ep = TAILQ_FIRST(&list->el_entries); + ep != NULL; + ep = TAILQ_NEXT(ep, ee_link)) { + if (eg->ee.ee_priority < ep->ee_priority) { + TAILQ_INSERT_BEFORE(ep, &eg->ee, ee_link); + break; + } + } + if (ep == NULL) + TAILQ_INSERT_TAIL(&list->el_entries, &eg->ee, ee_link); + EHE_UNLOCK(list); + return(&eg->ee); +} + +void +eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag) +{ + struct eventhandler_entry *ep = tag; + + /* XXX insert diagnostic check here? */ + EHE_LOCK(list); + if (ep != NULL) { + /* remove just this entry */ + TAILQ_REMOVE(&list->el_entries, ep, ee_link); + free(ep, M_EVENTHANDLER); + } else { + /* remove entire list */ + while (!TAILQ_EMPTY(&list->el_entries)) { + ep = TAILQ_FIRST(&list->el_entries); + TAILQ_REMOVE(&list->el_entries, ep, ee_link); + free(ep, M_EVENTHANDLER); + } + } + EHE_UNLOCK(list); +} + +struct eventhandler_list * +eventhandler_find_list(char *name) +{ + struct eventhandler_list *list; + + if (!eventhandler_lists_initted) + return(NULL); + + /* scan looking for the requested list */ + mtx_lock(&eventhandler_mutex); + for (list = TAILQ_FIRST(&eventhandler_lists); + list != NULL; + list = TAILQ_NEXT(list, el_link)) { + if (!strcmp(name, list->el_name)) + break; + } + mtx_unlock(&eventhandler_mutex); + + return(list); +} + diff --git a/sys/kern/subr_hints.c b/sys/kern/subr_hints.c new file mode 100644 index 0000000..c68d607 --- /dev/null +++ b/sys/kern/subr_hints.c @@ -0,0 +1,366 @@ +/*- + * Copyright (c) 2000,2001 Peter Wemm <peter@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <sys/bus.h> + +/* + * Access functions for device resources. + */ + +static int checkmethod = 1; +static int use_kenv; +static char *hintp; + +/* + * Evil wildcarding resource string lookup. + * This walks the supplied env string table and returns a match. + * The start point can be remembered for incremental searches. + */ +static int +res_find(int *line, int *startln, + const char *name, int *unit, const char *resname, const char *value, + const char **ret_name, int *ret_namelen, int *ret_unit, + const char **ret_resname, int *ret_resnamelen, const char **ret_value) +{ + int n = 0, hit, i = 0; + char r_name[32]; + int r_unit; + char r_resname[32]; + char r_value[128]; + const char *s, *cp; + char *p; + + if (checkmethod) { + switch (hintmode) { + case 0: /* loader hints in environment only */ + break; + case 1: /* static hints only */ + hintp = static_hints; + checkmethod = 0; + break; + case 2: /* fallback mode */ + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = kenvp[0]; + for (i = 0; cp != NULL; cp = kenvp[++i]) { + if (!strncmp(cp, "hint.", 5)) { + use_kenv = 1; + checkmethod = 0; + break; + } + } + sx_sunlock(&kenv_lock); + } else { + cp = kern_envp; + while (cp) { + if (strncmp(cp, "hint.", 5) == 0) { + cp = NULL; + hintp = kern_envp; + break; + } + while (*cp != '\0') + cp++; + cp++; + if (*cp == '\0') { + cp = NULL; + hintp = static_hints; + break; + } + } + } + break; + default: + break; + } + if (hintp == NULL) { + if (dynamic_kenv) { + use_kenv = 1; + checkmethod = 0; + } else + hintp = kern_envp; + } + } + + if (use_kenv) { + sx_slock(&kenv_lock); + i = 0; + cp = kenvp[0]; + if (cp == NULL) { + sx_sunlock(&kenv_lock); + return (ENOENT); + } + } else + cp = hintp; + while (cp) { + hit = 1; + (*line)++; + if (strncmp(cp, "hint.", 5) != 0) + hit = 0; + else + n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%128s", + r_name, &r_unit, r_resname, r_value); + if (hit && n != 4) { + printf("CONFIG: invalid hint '%s'\n", cp); + /* XXX: abuse bogus index() declaration */ + p = index(cp, 'h'); + *p = 'H'; + hit = 0; + } + if (hit && startln && *startln >= 0 && *line < *startln) + hit = 0; + if (hit && name && strcmp(name, r_name) != 0) + hit = 0; + if (hit && unit && *unit != r_unit) + hit = 0; + if (hit && resname && strcmp(resname, r_resname) != 0) + hit = 0; + if (hit && value && strcmp(value, r_value) != 0) + hit = 0; + if (hit) + break; + if (use_kenv) { + cp = kenvp[++i]; + if (cp == NULL) + break; + } else { + while (*cp != '\0') + cp++; + cp++; + if (*cp == '\0') { + cp = NULL; + break; + } + } + } + if (use_kenv) + sx_sunlock(&kenv_lock); + if (cp == NULL) + return ENOENT; + + s = cp; + /* This is a bit of a hack, but at least is reentrant */ + /* Note that it returns some !unterminated! strings. 
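Each hint has the textual form hint.<driver>.<unit>.<resource>=<value>, for example hint.fdc.0.irq=6, and res_find() above picks one apart with a single sscanf() call. A minimal userspace sketch of the same parse; the buffers are one byte larger than the scan field widths to leave room for the terminating NUL:

#include <stdio.h>

int
main(void)
{
        const char *line = "hint.fdc.0.irq=6";  /* illustrative hint */
        char name[33], resname[33], value[129];
        int unit;

        if (sscanf(line, "hint.%32[^.].%d.%32[^=]=%128s",
            name, &unit, resname, value) == 4)
                printf("%s unit %d: %s = %s\n", name, unit, resname, value);
        return (0);
}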
*/ + s = index(s, '.') + 1; /* start of device */ + if (ret_name) + *ret_name = s; + s = index(s, '.') + 1; /* start of unit */ + if (ret_namelen) + *ret_namelen = s - *ret_name - 1; /* device length */ + if (ret_unit) + *ret_unit = r_unit; + s = index(s, '.') + 1; /* start of resname */ + if (ret_resname) + *ret_resname = s; + s = index(s, '=') + 1; /* start of value */ + if (ret_resnamelen) + *ret_resnamelen = s - *ret_resname - 1; /* value len */ + if (ret_value) + *ret_value = s; + if (startln) /* line number for anchor */ + *startln = *line + 1; + return 0; +} + +/* + * Search all the data sources for matches to our query. We look for + * dynamic hints first as overrides for static or fallback hints. + */ +static int +resource_find(int *line, int *startln, + const char *name, int *unit, const char *resname, const char *value, + const char **ret_name, int *ret_namelen, int *ret_unit, + const char **ret_resname, int *ret_resnamelen, const char **ret_value) +{ + int i; + int un; + + *line = 0; + + /* Search for exact unit matches first */ + i = res_find(line, startln, name, unit, resname, value, + ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, + ret_value); + if (i == 0) + return 0; + if (unit == NULL) + return ENOENT; + /* If we are still here, search for wildcard matches */ + un = -1; + i = res_find(line, startln, name, &un, resname, value, + ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, + ret_value); + if (i == 0) + return 0; + return ENOENT; +} + +int +resource_int_value(const char *name, int unit, const char *resname, int *result) +{ + int error; + const char *str; + char *op; + unsigned long val; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + if (*str == '\0') + return EFTYPE; + val = strtoul(str, &op, 0); + if (*op != '\0') + return EFTYPE; + *result = val; + return 0; +} + +int +resource_long_value(const char *name, int unit, const char *resname, + long *result) +{ + int error; + const char *str; + char *op; + unsigned long val; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + if (*str == '\0') + return EFTYPE; + val = strtoul(str, &op, 0); + if (*op != '\0') + return EFTYPE; + *result = val; + return 0; +} + +int +resource_string_value(const char *name, int unit, const char *resname, + const char **result) +{ + int error; + const char *str; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + *result = str; + return 0; +} + +/* + * This is a bit nasty, but allows us to not modify the env strings. + */ +static const char * +resource_string_copy(const char *s, int len) +{ + static char stringbuf[256]; + static int offset = 0; + const char *ret; + + if (len == 0) + len = strlen(s); + if (len > 255) + return NULL; + if ((offset + len + 1) > 255) + offset = 0; + bcopy(s, &stringbuf[offset], len); + stringbuf[offset + len] = '\0'; + ret = &stringbuf[offset]; + offset += len + 1; + return ret; +} + +/* + * err = resource_find_at(&anchor, &name, &unit, resname, value) + * Iteratively fetch a list of devices wired "at" something + * res and value are restrictions. eg: "at", "scbus0". + * For practical purposes, res = required, value = optional. + * *name and *unit are set. + * set *anchor to zero before starting. 
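To make these accessors concrete, here is a small hypothetical sketch of a driver consulting the hint routines defined in this file; the device name "acme" and the hint strings are illustrative only.

/* Hypothetical consumer sketch; "acme" and the hints are made up. */
static void
acme_probe_hints(void)
{
	int anchor, irq, unit;

	/* Corresponds to a hint such as hint.acme.0.irq="5". */
	if (resource_int_value("acme", 0, "irq", &irq) == 0)
		printf("acme0: hinted irq %d\n", irq);

	/* Walk every "acme" unit wired "at" isa0. */
	anchor = 0;
	while (resource_find_dev(&anchor, "acme", &unit, "at", "isa0") == 0)
		printf("acme%d is wired at isa0\n", unit);
}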
+ */ +int +resource_find_match(int *anchor, const char **name, int *unit, + const char *resname, const char *value) +{ + const char *found_name; + int found_namelen; + int found_unit; + int ret; + int newln; + + newln = *anchor; + ret = resource_find(anchor, &newln, NULL, NULL, resname, value, + &found_name, &found_namelen, &found_unit, NULL, NULL, NULL); + if (ret == 0) { + *name = resource_string_copy(found_name, found_namelen); + *unit = found_unit; + } + *anchor = newln; + return ret; +} + + +/* + * err = resource_find_dev(&anchor, name, &unit, res, value); + * Iterate through a list of devices, returning their unit numbers. + * res and value are optional restrictions. eg: "at", "scbus0". + * *unit is set to the value. + * set *anchor to zero before starting. + */ +int +resource_find_dev(int *anchor, const char *name, int *unit, + const char *resname, const char *value) +{ + int found_unit; + int newln; + int ret; + + newln = *anchor; + ret = resource_find(anchor, &newln, name, NULL, resname, value, + NULL, NULL, &found_unit, NULL, NULL, NULL); + if (ret == 0) { + *unit = found_unit; + } + *anchor = newln; + return ret; +} diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c new file mode 100644 index 0000000..b5bfa1f --- /dev/null +++ b/sys/kern/subr_kobj.c @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/errno.h> +#ifndef TEST +#include <sys/systm.h> +#endif +#include <sys/kobj.h> + +#ifdef TEST +#include "usertest.h" +#endif + +static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures"); + +#ifdef KOBJ_STATS + +#include <sys/sysctl.h> + +u_int kobj_lookup_hits; +u_int kobj_lookup_misses; + +SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD, + &kobj_lookup_hits, 0, "") +SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD, + &kobj_lookup_misses, 0, "") + +#endif + +static int kobj_next_id = 1; + +static int +kobj_error_method(void) +{ + return ENXIO; +} + +static void +kobj_register_method(struct kobjop_desc *desc) +{ + if (desc->id == 0) + desc->id = kobj_next_id++; +} + +static void +kobj_unregister_method(struct kobjop_desc *desc) +{ +} + +static void +kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) +{ + kobj_method_t *m; + int i; + + /* + * Don't do anything if we are already compiled. + */ + if (cls->ops) + return; + + /* + * First register any methods which need it. + */ + for (i = 0, m = cls->methods; m->desc; i++, m++) + kobj_register_method(m->desc); + + /* + * Then initialise the ops table. + */ + bzero(ops, sizeof(struct kobj_ops)); + ops->cls = cls; + cls->ops = ops; +} + +void +kobj_class_compile(kobj_class_t cls) +{ + kobj_ops_t ops; + + /* + * Allocate space for the compiled ops table. + */ + ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); + if (!ops) + panic("kobj_compile_methods: out of memory"); + kobj_class_compile_common(cls, ops); +} + +void +kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops) +{ + /* + * Increment refs to make sure that the ops table is not freed. + */ + cls->refs++; + kobj_class_compile_common(cls, ops); +} + +void +kobj_lookup_method(kobj_method_t *methods, + kobj_method_t *ce, + kobjop_desc_t desc) +{ + ce->desc = desc; + for (; methods && methods->desc; methods++) { + if (methods->desc == desc) { + ce->func = methods->func; + return; + } + } + if (desc->deflt) + ce->func = desc->deflt; + else + ce->func = kobj_error_method; + return; +} + +void +kobj_class_free(kobj_class_t cls) +{ + int i; + kobj_method_t *m; + + /* + * Unregister any methods which are no longer used. + */ + for (i = 0, m = cls->methods; m->desc; i++, m++) + kobj_unregister_method(m->desc); + + /* + * Free memory and clean up. + */ + free(cls->ops, M_KOBJ); + cls->ops = 0; +} + +kobj_t +kobj_create(kobj_class_t cls, + struct malloc_type *mtype, + int mflags) +{ + kobj_t obj; + + /* + * Allocate and initialise the new object. + */ + obj = malloc(cls->size, mtype, mflags | M_ZERO); + if (!obj) + return 0; + kobj_init(obj, cls); + + return obj; +} + +void +kobj_init(kobj_t obj, kobj_class_t cls) +{ + /* + * Consider compiling the class' method table. + */ + if (!cls->ops) + kobj_class_compile(cls); + + obj->ops = cls->ops; + cls->refs++; +} + +void +kobj_delete(kobj_t obj, struct malloc_type *mtype) +{ + kobj_class_t cls = obj->ops->cls; + + /* + * Consider freeing the compiled method table for the class + * after its last instance is deleted. As an optimisation, we + * should defer this for a short while to avoid thrashing. 
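A short hypothetical sketch of the object life cycle implemented by this file; the "widget" class is illustrative, and its declaration (normally produced by the DEFINE_CLASS() macro in <sys/kobj.h>) is assumed rather than shown.

/* Hypothetical sketch; widget_class is assumed to be declared elsewhere. */
static void
widget_example(void)
{
	kobj_t w;

	/* kobj_create() zeroes the object and compiles the class on first use. */
	w = kobj_create(&widget_class, M_TEMP, M_NOWAIT);
	if (w == NULL)
		return;
	/* ... dispatch methods through the interface's generated macros ... */
	kobj_delete(w, M_TEMP);	/* drops the class reference taken by kobj_init() */
}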
+ */ + cls->refs--; + if (!cls->refs) + kobj_class_free(cls); + + obj->ops = 0; + if (mtype) + free(obj, mtype); +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 0000000..2c01568 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Error log buffer for kernel printf's. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/msgbuf.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/filedesc.h> +#include <sys/sysctl.h> + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_poll_t logpoll; + +static void logtimeout(void *arg); + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = { + /* open */ logopen, + /* close */ logclose, + /* read */ logread, + /* write */ nowrite, + /* ioctl */ logioctl, + /* poll */ logpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "log", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, +}; + +static struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + struct sigio *sc_sigio; /* information for async I/O */ + struct callout sc_callout; /* callout to wakeup syslog */ +} logsoftc; + +int log_open; /* also used in log() */ + +/* Times per second to check for a pending syslog wakeup. 
*/ +static int log_wakeups_per_second = 5; +SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW, + &log_wakeups_per_second, 0, ""); + +/*ARGSUSED*/ +static int +logopen(dev_t dev, int flags, int mode, struct thread *td) +{ + if (log_open) + return (EBUSY); + log_open = 1; + callout_init(&logsoftc.sc_callout, 0); + fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ + callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, + logtimeout, NULL); + return (0); +} + +/*ARGSUSED*/ +static int +logclose(dev_t dev, int flag, int mode, struct thread *td) +{ + + log_open = 0; + callout_stop(&logsoftc.sc_callout); + logsoftc.sc_state = 0; + funsetown(&logsoftc.sc_sigio); + return (0); +} + +/*ARGSUSED*/ +static int +logread(dev_t dev, struct uio *uio, int flag) +{ + struct msgbuf *mbp = msgbufp; + long l; + int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = mbp->msg_size - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr, + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +static int +logpoll(dev_t dev, int events, struct thread *td) +{ + int s; + int revents = 0; + + s = splhigh(); + + if (events & (POLLIN | POLLRDNORM)) { + if (msgbufp->msg_bufr != msgbufp->msg_bufx) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &logsoftc.sc_selp); + } + splx(s); + return (revents); +} + +static void +logtimeout(void *arg) +{ + + if (!log_open) + return; + if (msgbuftrigger == 0) { + callout_reset(&logsoftc.sc_callout, + hz / log_wakeups_per_second, logtimeout, NULL); + return; + } + msgbuftrigger = 0; + selwakeup(&logsoftc.sc_selp); + if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) + pgsigio(&logsoftc.sc_sigio, SIGIO, 0); + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } + callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, + logtimeout, NULL); +} + +/*ARGSUSED*/ +static int +logioctl(dev_t dev, u_long com, caddr_t data, int flag, struct thread *td) +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += msgbufp->msg_size; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case FIOSETOWN: + return (fsetown(*(int *)data, &logsoftc.sc_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(logsoftc.sc_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. 
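The character device implemented here is what syslogd(8) reads; a minimal, hypothetical userland consumer of the read/poll interface above might look like the following (error handling abbreviated).

/* Hypothetical userland sketch of a /dev/klog reader (cf. syslogd). */
#include <poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd;
	char buf[1024];
	ssize_t n;

	if ((pfd.fd = open("/dev/klog", O_RDONLY | O_NONBLOCK)) == -1)
		return (1);
	pfd.events = POLLIN;
	for (;;) {
		/* logpoll() reports POLLIN once the message buffer is non-empty. */
		if (poll(&pfd, 1, -1) <= 0)
			continue;
		if ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
	}
}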
*/ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead */ + case TIOCGPGRP: + *(int *)data = -fgetown(logsoftc.sc_sigio); + break; + + default: + return (ENOTTY); + } + return (0); +} + +static void +log_drvinit(void *unused) +{ + + make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog"); +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c new file mode 100644 index 0000000..74e1f56 --- /dev/null +++ b/sys/kern/subr_mbuf.c @@ -0,0 +1,1111 @@ +/*- + * Copyright (c) 2001 + * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +/* + * Maximum number of PCPU containers. If you know what you're doing you could + * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your + * system during compilation, and thus prevent kernel structure bloat. + * + * SMP and non-SMP kernels clearly have a different number of possible CPUs, + * but because we cannot assume a dense array of CPUs, we always allocate + * and traverse PCPU containers up to NCPU amount and merely check for + * CPU availability. + */ +#ifdef MBALLOC_NCPU +#define NCPU MBALLOC_NCPU +#else +#define NCPU MAXCPU +#endif + +/*- + * The mbuf allocator is heavily based on Alfred Perlstein's + * (alfred@FreeBSD.org) "memcache" allocator which is itself based + * on concepts from several per-CPU memory allocators. 
The difference + * between this allocator and memcache is that, among other things: + * + * (i) We don't free back to the map from the free() routine - we leave the + * option of implementing lazy freeing (from a kproc) in the future. + * + * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the + * maximum number of allocatable objects of a given type. Further, + * we handle blocking on a cv in the case that the map is starved and + * we have to rely solely on cached (circulating) objects. + * + * The mbuf allocator keeps all objects that it allocates in mb_buckets. + * The buckets keep a page worth of objects (an object can be an mbuf or an + * mbuf cluster) and facilitate moving larger sets of contiguous objects + * from the per-CPU lists to the main list for the given object. The buckets + * also have an added advantage in that after several moves from a per-CPU + * list to the main list and back to the per-CPU list, contiguous objects + * are kept together, thus trying to put the TLB cache to good use. + * + * The buckets are kept on singly-linked lists called "containers." A container + * is protected by a mutex lock in order to ensure consistency. The mutex lock + * itself is allocated separately and attached to the container at boot time, + * thus allowing for certain containers to share the same mutex lock. Per-CPU + * containers for mbufs and mbuf clusters all share the same per-CPU + * lock whereas the "general system" containers (i.e., the "main lists") for + * these objects share one global lock. + */ +struct mb_bucket { + SLIST_ENTRY(mb_bucket) mb_blist; + int mb_owner; + int mb_numfree; + void *mb_free[0]; +}; + +struct mb_container { + SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; + struct mtx *mc_lock; + int mc_numowner; + u_int mc_starved; + long *mc_types; + u_long *mc_objcount; + u_long *mc_numpgs; +}; + +struct mb_gen_list { + struct mb_container mb_cont; + struct cv mgl_mstarved; +}; + +struct mb_pcpu_list { + struct mb_container mb_cont; +}; + +/* + * Boot-time configurable object counts that will determine the maximum + * number of permitted objects in the mbuf and mcluster cases. In the + * ext counter (nmbcnt) case, it's just an indicator serving to scale + * kmem_map size properly - in other words, we may be allowed to allocate + * more than nmbcnt counters, whereas we will never be allowed to allocate + * more than nmbufs mbufs or nmbclusters mclusters. + * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be + * allocatable by the sfbuf allocator (found in uipc_syscalls.c) + */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (1024 + maxusers * 64) +#endif +#ifndef NMBUFS +#define NMBUFS (nmbclusters * 2) +#endif +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +#ifndef NMBCNTS +#define NMBCNTS (nmbclusters + nsfbufs) +#endif +int nmbufs; +int nmbclusters; +int nmbcnt; +int nsfbufs; + +/* + * Perform sanity checks of tunables declared above. + */ +static void +tunable_mbinit(void *dummy) +{ + + /* + * This has to be done before VM init. 
+ */ + nmbclusters = NMBCLUSTERS; + TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); + nmbufs = NMBUFS; + TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); + nmbcnt = NMBCNTS; + TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); + /* Sanity checks */ + if (nmbufs < nmbclusters * 2) + nmbufs = nmbclusters * 2; + if (nmbcnt < nmbclusters + nsfbufs) + nmbcnt = nmbclusters + nsfbufs; +} +SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); + +/* + * The freelist structures and mutex locks. The number statically declared + * here depends on the number of CPUs. + * + * We set up in such a way that all the objects (mbufs, clusters) + * share the same mutex lock. It has been established that we do not benefit + * from different locks for different objects, so we use the same lock, + * regardless of object type. + */ +struct mb_lstmngr { + struct mb_gen_list *ml_genlist; + struct mb_pcpu_list *ml_cntlst[NCPU]; + struct mb_bucket **ml_btable; + vm_map_t ml_map; + vm_offset_t ml_mapbase; + vm_offset_t ml_maptop; + int ml_mapfull; + u_int ml_objsize; + u_int *ml_wmhigh; +}; +static struct mb_lstmngr mb_list_mbuf, mb_list_clust; +static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; + +/* + * Local macros for internal allocator structure manipulations. + */ +#ifdef SMP +#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] +#else +#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] +#endif + +#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist + +#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) + +#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) + +#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ + (mb_lst)->ml_cntlst[(num)] + +#define MB_BUCKET_INDX(mb_obj, mb_lst) \ + (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE) + +#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ +{ \ + struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ + \ + (mb_bckt)->mb_numfree--; \ + (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ + (*((mb_lst)->mb_cont.mc_objcount))--; \ + if ((mb_bckt)->mb_numfree == 0) { \ + SLIST_REMOVE_HEAD(_mchd, mb_blist); \ + SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ + (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ + } \ +} + +#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ + (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ + (mb_bckt)->mb_numfree++; \ + (*((mb_lst)->mb_cont.mc_objcount))++; + +#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ + if ((mb_type) != MT_NOTMBUF) \ + (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) + +#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ + if ((mb_type) != MT_NOTMBUF) \ + (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) + +/* + * Ownership of buckets/containers is represented by integers. The PCPU + * lists range from 0 to NCPU-1. We need a free numerical id for the general + * list (we use NCPU). We also need a non-conflicting free bit to indicate + * that the bucket is free and removed from a container, while not losing + * the bucket's originating container id. We use the highest bit + * for the free marker. + */ +#define MB_GENLIST_OWNER (NCPU) +#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) + +/* Statistics structures for allocator (per-CPU and general). */ +static struct mbpstat mb_statpcpu[NCPU + 1]; +struct mbstat mbstat; + +/* Sleep time for wait code (in ticks). 
*/ +static int mbuf_wait = 64; + +static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */ +static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */ + +/* + * Objects exported by sysctl(8). + */ +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, + "Maximum number of mbuf clusters available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, + "Maximum number of mbufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0, + "Number used to scale kmem_map to ensure sufficient space for counters"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, + "Sleep time of mbuf subsystem wait allocations during exhaustion"); +SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0, + "Upper limit of number of mbufs allowed on each PCPU list"); +SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0, + "Upper limit of number of mbuf clusters allowed on each PCPU list"); +SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, + "Mbuf general information and statistics"); +SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, + sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); + +/* + * Prototypes of local allocator routines. + */ +static void *mb_alloc_wait(struct mb_lstmngr *, short); +static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, + struct mb_pcpu_list *); +static void mb_reclaim(void); +static void mbuf_init(void *); + +/* + * Initial allocation numbers. Each parameter represents the number of buckets + * of each object that will be placed initially in each PCPU container for + * said object. + */ +#define NMB_MBUF_INIT 4 +#define NMB_CLUST_INIT 16 + +/* + * Initialize the mbuf subsystem. + * + * We sub-divide the kmem_map into several submaps; this way, we don't have + * to worry about artificially limiting the number of mbuf or mbuf cluster + * allocations, due to fear of one type of allocation "stealing" address + * space initially reserved for another. + * + * Set up both the general containers and all the PCPU containers. Populate + * the PCPU containers with initial numbers. + */ +MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) +void +mbuf_init(void *dummy) +{ + struct mb_pcpu_list *pcpu_cnt; + vm_size_t mb_map_size; + int i, j; + + /* + * Set up all the submaps, for each type of object that we deal + * with in this allocator. 
+ */ + mb_map_size = (vm_size_t)(nmbufs * MSIZE); + mb_map_size = rounddown(mb_map_size, PAGE_SIZE); + mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * + sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); + if (mb_list_mbuf.ml_btable == NULL) + goto bad; + mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), + &(mb_list_mbuf.ml_maptop), mb_map_size); + mb_list_mbuf.ml_map->system_map = 1; + mb_list_mbuf.ml_mapfull = 0; + mb_list_mbuf.ml_objsize = MSIZE; + mb_list_mbuf.ml_wmhigh = &mbuf_limit; + + mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); + mb_map_size = rounddown(mb_map_size, PAGE_SIZE); + mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE + * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); + if (mb_list_clust.ml_btable == NULL) + goto bad; + mb_list_clust.ml_map = kmem_suballoc(kmem_map, + &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), + mb_map_size); + mb_list_clust.ml_map->system_map = 1; + mb_list_clust.ml_mapfull = 0; + mb_list_clust.ml_objsize = MCLBYTES; + mb_list_clust.ml_wmhigh = &clust_limit; + + /* + * Allocate required general (global) containers for each object type. + */ + mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, + M_NOWAIT); + mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, + M_NOWAIT); + if ((mb_list_mbuf.ml_genlist == NULL) || + (mb_list_clust.ml_genlist == NULL)) + goto bad; + + /* + * Initialize condition variables and general container mutex locks. + */ + mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0); + cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); + cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), + "mcluster pool starved"); + mb_list_mbuf.ml_genlist->mb_cont.mc_lock = + mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; + + /* + * Set up the general containers for each object. + */ + mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = + mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; + mb_list_mbuf.ml_genlist->mb_cont.mc_starved = + mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; + mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); + mb_list_clust.ml_genlist->mb_cont.mc_objcount = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); + mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs); + mb_list_clust.ml_genlist->mb_cont.mc_numpgs = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs); + mb_list_mbuf.ml_genlist->mb_cont.mc_types = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); + mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; + SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); + SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); + + /* + * Initialize general mbuf statistics. + */ + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + mbstat.m_numtypes = MT_NTYPES; + + /* + * Allocate and initialize PCPU containers. 
+ */ + for (i = 0; i < NCPU; i++) { + if (CPU_ABSENT(i)) + continue; + + mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), + M_MBUF, M_NOWAIT); + mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), + M_MBUF, M_NOWAIT); + if ((mb_list_mbuf.ml_cntlst[i] == NULL) || + (mb_list_clust.ml_cntlst[i] == NULL)) + goto bad; + + mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; + + mb_statpcpu[i].mb_active = 1; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = + &(mb_statpcpu[i].mb_mbfree); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = + &(mb_statpcpu[i].mb_clfree); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs = + &(mb_statpcpu[i].mb_mbpgs); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs = + &(mb_statpcpu[i].mb_clpgs); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = + &(mb_statpcpu[i].mb_mbtypes[0]); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; + + SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); + SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); + + /* + * Perform initial allocations. + */ + pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); + MB_LOCK_CONT(pcpu_cnt); + for (j = 0; j < NMB_MBUF_INIT; j++) { + if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) + == NULL) + goto bad; + } + MB_UNLOCK_CONT(pcpu_cnt); + + pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); + MB_LOCK_CONT(pcpu_cnt); + for (j = 0; j < NMB_CLUST_INIT; j++) { + if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) + == NULL) + goto bad; + } + MB_UNLOCK_CONT(pcpu_cnt); + } + + return; +bad: + panic("mbuf_init(): failed to initialize mbuf subsystem!"); +} + +/* + * Populate a given mbuf PCPU container with a bucket full of fresh new + * buffers. Return a pointer to the new bucket (already in the container if + * successful), or return NULL on failure. + * + * LOCKING NOTES: + * PCPU container lock must be held when this is called. + * The lock is dropped here so that we can cleanly call the underlying VM + * code. If we fail, we return with no locks held. If we succeed (i.e., return + * non-NULL), we return with the PCPU lock held, ready for allocation from + * the returned bucket. + */ +static struct mb_bucket * +mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) +{ + struct mb_bucket *bucket; + caddr_t p; + int i; + + MB_UNLOCK_CONT(cnt_lst); + /* + * If our object's (finite) map is starved now (i.e., no more address + * space), bail out now. + */ + if (mb_list->ml_mapfull) + return (NULL); + + bucket = malloc(sizeof(struct mb_bucket) + + PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF, + how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); + if (bucket == NULL) + return (NULL); + + p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE, + how == M_TRYWAIT ? 
M_WAITOK : M_NOWAIT); + if (p == NULL) { + free(bucket, M_MBUF); + if (how == M_TRYWAIT) + mb_list->ml_mapfull = 1; + return (NULL); + } + + bucket->mb_numfree = 0; + mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; + for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) { + bucket->mb_free[i] = p; + bucket->mb_numfree++; + p += mb_list->ml_objsize; + } + + MB_LOCK_CONT(cnt_lst); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); + (*(cnt_lst->mb_cont.mc_numpgs))++; + *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; + + return (bucket); +} + +/* + * Allocate an mbuf-subsystem type object. + * The general case is very easy. Complications only arise if our PCPU + * container is empty. Things get worse if the PCPU container is empty, + * the general container is empty, and we've run out of address space + * in our map; then we try to block if we're willing to (M_TRYWAIT). + */ +static __inline +void * +mb_alloc(struct mb_lstmngr *mb_list, int how, short type) +{ + static int last_report; + struct mb_pcpu_list *cnt_lst; + struct mb_bucket *bucket; + void *m; + + m = NULL; + cnt_lst = MB_GET_PCPU_LIST(mb_list); + MB_LOCK_CONT(cnt_lst); + + if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { + /* + * This is the easy allocation case. We just grab an object + * from a bucket in the PCPU container. At worst, we + * have just emptied the bucket and so we remove it + * from the container. + */ + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + struct mb_gen_list *gen_list; + + /* + * This is the less-common more difficult case. We must + * first verify if the general list has anything for us + * and if that also fails, we must allocate a page from + * the map and create a new bucket to place in our PCPU + * container (already locked). If the map is starved then + * we're really in for trouble, as we have to wait on + * the general container's condition variable. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + + if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) + != NULL) { + /* + * Give ownership of the bucket to our CPU's + * container, but only actually put the bucket + * in the container if it doesn't become free + * upon removing an mbuf from it. + */ + SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), + mb_blist); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + (*(gen_list->mb_cont.mc_numpgs))--; + (*(cnt_lst->mb_cont.mc_numpgs))++; + *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; + bucket->mb_numfree--; + m = bucket->mb_free[(bucket->mb_numfree)]; + if (bucket->mb_numfree == 0) { + SLIST_NEXT(bucket, mb_blist) = NULL; + bucket->mb_owner |= MB_BUCKET_FREE; + } else { + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), + bucket, mb_blist); + *(cnt_lst->mb_cont.mc_objcount) += + bucket->mb_numfree; + } + MB_UNLOCK_CONT(gen_list); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + /* + * We'll have to allocate a new page. + */ + MB_UNLOCK_CONT(gen_list); + bucket = mb_pop_cont(mb_list, how, cnt_lst); + if (bucket != NULL) { + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + if (how == M_TRYWAIT) { + /* + * Absolute worst-case scenario. + * We block if we're willing to, but + * only after trying to steal from + * other lists. + */ + m = mb_alloc_wait(mb_list, type); + } else { + /* XXX: No consistency. 
*/ + mbstat.m_drops++; + + if (ticks < last_report || + (ticks - last_report) >= hz) { + last_report = ticks; + printf( +"All mbufs exhausted, please see tuning(7).\n"); +/* XXX: Actually could be clusters, but it gets the point across. */ + } + + } + } + } + } + + return (m); +} + +/* + * This is the worst-case scenario called only if we're allocating with + * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf + * by looking in every PCPU container. If we're still unsuccesful, we + * try the general container one last time and possibly block on our + * starved cv. + */ +static void * +mb_alloc_wait(struct mb_lstmngr *mb_list, short type) +{ + struct mb_pcpu_list *cnt_lst; + struct mb_gen_list *gen_list; + struct mb_bucket *bucket; + void *m; + int i, cv_ret; + + /* + * Try to reclaim mbuf-related objects (mbufs, clusters). + */ + mb_reclaim(); + + /* + * Cycle all the PCPU containers. Increment starved counts if found + * empty. + */ + for (i = 0; i < NCPU; i++) { + if (CPU_ABSENT(i)) + continue; + cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); + MB_LOCK_CONT(cnt_lst); + + /* + * If container is non-empty, get a single object from it. + * If empty, increment starved count. + */ + if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != + NULL) { + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + mbstat.m_wait++; /* XXX: No consistency. */ + return (m); + } else + cnt_lst->mb_cont.mc_starved++; + + MB_UNLOCK_CONT(cnt_lst); + } + + /* + * We're still here, so that means it's time to get the general + * container lock, check it one more time (now that mb_reclaim() + * has been called) and if we still get nothing, block on the cv. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { + MB_GET_OBJECT(m, bucket, gen_list); + MB_MBTYPES_INC(gen_list, type, 1); + MB_UNLOCK_CONT(gen_list); + mbstat.m_wait++; /* XXX: No consistency. */ + return (m); + } + + gen_list->mb_cont.mc_starved++; + cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), + gen_list->mb_cont.mc_lock, mbuf_wait); + gen_list->mb_cont.mc_starved--; + + if ((cv_ret == 0) && + ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { + MB_GET_OBJECT(m, bucket, gen_list); + MB_MBTYPES_INC(gen_list, type, 1); + mbstat.m_wait++; /* XXX: No consistency. */ + } else { + mbstat.m_drops++; /* XXX: No consistency. */ + m = NULL; + } + + MB_UNLOCK_CONT(gen_list); + + return (m); +} + +/*- + * Free an object to its rightful container. + * In the very general case, this operation is really very easy. + * Complications arise primarily if: + * (a) We've hit the high limit on number of free objects allowed in + * our PCPU container. + * (b) We're in a critical situation where our container has been + * marked 'starved' and we need to issue wakeups on the starved + * condition variable. + * (c) Minor (odd) cases: our bucket has migrated while we were + * waiting for the lock; our bucket is in the general container; + * our bucket is empty. + */ +static __inline +void +mb_free(struct mb_lstmngr *mb_list, void *m, short type) +{ + struct mb_pcpu_list *cnt_lst; + struct mb_gen_list *gen_list; + struct mb_bucket *bucket; + u_int owner; + + bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; + + /* + * Make sure that if after we lock the bucket's present container the + * bucket has migrated, that we drop the lock and get the new one. 
+ */ +retry_lock: + owner = bucket->mb_owner & ~MB_BUCKET_FREE; + switch (owner) { + case MB_GENLIST_OWNER: + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { + MB_UNLOCK_CONT(gen_list); + goto retry_lock; + } + + /* + * If we're intended for the general container, this is + * real easy: no migrating required. The only `bogon' + * is that we're now contending with all the threads + * dealing with the general list, but this is expected. + */ + MB_PUT_OBJECT(m, bucket, gen_list); + MB_MBTYPES_DEC(gen_list, type, 1); + if (gen_list->mb_cont.mc_starved > 0) + cv_signal(&(gen_list->mgl_mstarved)); + MB_UNLOCK_CONT(gen_list); + break; + + default: + cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); + MB_LOCK_CONT(cnt_lst); + if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { + MB_UNLOCK_CONT(cnt_lst); + goto retry_lock; + } + + MB_PUT_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_DEC(cnt_lst, type, 1); + + if (cnt_lst->mb_cont.mc_starved > 0) { + /* + * This is a tough case. It means that we've + * been flagged at least once to indicate that + * we're empty, and that the system is in a critical + * situation, so we ought to migrate at least one + * bucket over to the general container. + * There may or may not be a thread blocking on + * the starved condition variable, but chances + * are that one will eventually come up soon so + * it's better to migrate now than never. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0, + ("mb_free: corrupt bucket %p\n", bucket)); + SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = MB_GENLIST_OWNER; + (*(cnt_lst->mb_cont.mc_objcount))--; + (*(gen_list->mb_cont.mc_objcount))++; + (*(cnt_lst->mb_cont.mc_numpgs))--; + (*(gen_list->mb_cont.mc_numpgs))++; + + /* + * Determine whether or not to keep transferring + * buckets to the general list or whether we've + * transferred enough already. + * We realize that although we may flag another + * bucket to be migrated to the general container + * that in the meantime, the thread that was + * blocked on the cv is already woken up and + * long gone. But in that case, the worst + * consequence is that we will end up migrating + * one bucket too many, which is really not a big + * deal, especially if we're close to a critical + * situation. + */ + if (gen_list->mb_cont.mc_starved > 0) { + cnt_lst->mb_cont.mc_starved--; + cv_signal(&(gen_list->mgl_mstarved)); + } else + cnt_lst->mb_cont.mc_starved = 0; + + MB_UNLOCK_CONT(gen_list); + MB_UNLOCK_CONT(cnt_lst); + break; + } + + if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) { + /* + * We've hit the high limit of allowed numbers of mbufs + * on this PCPU list. We must now migrate a bucket + * over to the general container. 
+ */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { + bucket = + SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); + SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), + mb_blist); + } + SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = MB_GENLIST_OWNER; + *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; + *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; + (*(cnt_lst->mb_cont.mc_numpgs))--; + (*(gen_list->mb_cont.mc_numpgs))++; + + /* + * While we're at it, transfer some of the mbtypes + * "count load" onto the general list's mbtypes + * array, seeing as how we're moving the bucket + * there now, meaning that the freeing of objects + * there will now decrement the _general list's_ + * mbtypes counters, and no longer our PCPU list's + * mbtypes counters. We do this for the type presently + * being freed in an effort to keep the mbtypes + * counters approximately balanced across all lists. + */ + MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE / + mb_list->ml_objsize) - bucket->mb_numfree); + MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE / + mb_list->ml_objsize) - bucket->mb_numfree); + + MB_UNLOCK_CONT(gen_list); + MB_UNLOCK_CONT(cnt_lst); + break; + } + + if (bucket->mb_owner & MB_BUCKET_FREE) { + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + } + + MB_UNLOCK_CONT(cnt_lst); + break; + } +} + +/* + * Drain protocols in hopes to free up some resources. + * + * LOCKING NOTES: + * No locks should be held when this is called. The drain routines have to + * presently acquire some locks which raises the possibility of lock order + * violation if we're holding any mutex if that mutex is acquired in reverse + * order relative to one of the locks in the drain routines. + */ +static void +mb_reclaim(void) +{ + struct domain *dp; + struct protosw *pr; + +/* + * XXX: Argh, we almost always trip here with witness turned on now-a-days + * XXX: because we often come in with Giant held. For now, there's no way + * XXX: to avoid this. + */ +#ifdef WITNESS + KASSERT(witness_list(curthread) == 0, + ("mb_reclaim() called with locks held")); +#endif + + mbstat.m_drain++; /* XXX: No consistency. */ + + for (dp = domains; dp != NULL; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain != NULL) + (*pr->pr_drain)(); +} + +/* + * Local mbuf & cluster alloc macros and routines. + * Local macro and function names begin with an underscore ("_"). + */ +static void _mclfree(struct mbuf *); + +#define _m_get(m, how, type) do { \ + (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \ + if ((m) != NULL) { \ + (m)->m_type = (type); \ + (m)->m_next = NULL; \ + (m)->m_nextpkt = NULL; \ + (m)->m_data = (m)->m_dat; \ + (m)->m_flags = 0; \ + } \ +} while (0) + +#define _m_gethdr(m, how, type) do { \ + (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \ + if ((m) != NULL) { \ + (m)->m_type = (type); \ + (m)->m_next = NULL; \ + (m)->m_nextpkt = NULL; \ + (m)->m_data = (m)->m_pktdat; \ + (m)->m_flags = M_PKTHDR; \ + (m)->m_pkthdr.rcvif = NULL; \ + (m)->m_pkthdr.csum_flags = 0; \ + (m)->m_pkthdr.aux = NULL; \ + } \ +} while (0) + +/* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME). 
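The exported m_gethdr()/m_clget()/m_free() wrappers that follow form the public allocation interface used by the rest of the kernel. A minimal, hypothetical sketch of typical consumer usage: grab a packet header mbuf, attach a cluster when the payload does not fit inline, and clean up on failure (m_freem() is provided elsewhere in the mbuf code).

/* Hypothetical sketch of consumer usage of the exported routines below. */
static struct mbuf *
example_alloc_packet(int len)
{
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	if (len > MHLEN) {
		m_clget(m, M_DONTWAIT);		/* attaches an MCLBYTES cluster */
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);		/* cluster allocation failed */
			return (NULL);
		}
	}
	m->m_len = m->m_pkthdr.len = len;
	return (m);
}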
*/ +#define _m_free(m, n) do { \ + (n) = (m)->m_next; \ + if ((m)->m_flags & M_EXT) \ + MEXTFREE((m)); \ + if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \ + m_freem((m)->m_pkthdr.aux); \ + (m)->m_pkthdr.aux = NULL; \ + } \ + mb_free(&mb_list_mbuf, (m), (m)->m_type); \ +} while (0) + +#define _mext_init_ref(m) do { \ + (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT); \ + if ((m)->m_ext.ref_cnt != NULL) { \ + *((m)->m_ext.ref_cnt) = 0; \ + MEXT_ADD_REF((m)); \ + } \ +} while (0) + +#define _mext_dealloc_ref(m) \ + free((m)->m_ext.ref_cnt, M_MBUF) + +void +_mext_free(struct mbuf *mb) +{ + + if (mb->m_ext.ext_type == EXT_CLUSTER) + mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF); + else + (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); + _mext_dealloc_ref(mb); +} + +/* + * We only include this here to avoid making m_clget() excessively large + * due to too much inlined code. + */ +static void +_mclfree(struct mbuf *mb) +{ + + mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF); + mb->m_ext.ext_buf = NULL; +} + +/* + * Exported space allocation and de-allocation routines. + */ +struct mbuf * +m_get(int how, int type) +{ + struct mbuf *mb; + + _m_get(mb, how, type); + return (mb); +} + +struct mbuf * +m_gethdr(int how, int type) +{ + struct mbuf *mb; + + _m_gethdr(mb, how, type); + return (mb); +} + +struct mbuf * +m_get_clrd(int how, int type) +{ + struct mbuf *mb; + + _m_get(mb, how, type); + if (mb != NULL) + bzero(mtod(mb, caddr_t), MLEN); + return (mb); +} + +struct mbuf * +m_gethdr_clrd(int how, int type) +{ + struct mbuf *mb; + + _m_gethdr(mb, how, type); + if (mb != NULL) + bzero(mtod(mb, caddr_t), MHLEN); + return (mb); +} + +struct mbuf * +m_free(struct mbuf *mb) +{ + struct mbuf *nb; + + _m_free(mb, nb); + return (nb); +} + +void +m_clget(struct mbuf *mb, int how) +{ + + mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF); + if (mb->m_ext.ext_buf != NULL) { + _mext_init_ref(mb); + if (mb->m_ext.ref_cnt == NULL) + _mclfree(mb); + else { + mb->m_data = mb->m_ext.ext_buf; + mb->m_flags |= M_EXT; + mb->m_ext.ext_free = NULL; + mb->m_ext.ext_args = NULL; + mb->m_ext.ext_size = MCLBYTES; + mb->m_ext.ext_type = EXT_CLUSTER; + } + } +} + +void +m_extadd(struct mbuf *mb, caddr_t buf, u_int size, + void (*freef)(void *, void *), void *args, short flags, int type) +{ + + _mext_init_ref(mb); + if (mb->m_ext.ref_cnt != NULL) { + mb->m_flags |= (M_EXT | flags); + mb->m_ext.ext_buf = buf; + mb->m_data = mb->m_ext.ext_buf; + mb->m_ext.ext_size = size; + mb->m_ext.ext_free = freef; + mb->m_ext.ext_args = args; + mb->m_ext.ext_type = type; + } +} + +/* + * Change type for mbuf `mb'; this is a relatively expensive operation and + * should be avoided. + */ +void +m_chtype(struct mbuf *mb, short new_type) +{ + struct mb_gen_list *gen_list; + + gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); + MB_LOCK_CONT(gen_list); + MB_MBTYPES_DEC(gen_list, mb->m_type, 1); + MB_MBTYPES_INC(gen_list, new_type, 1); + MB_UNLOCK_CONT(gen_list); + mb->m_type = new_type; +} diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c new file mode 100644 index 0000000..1a8c4bd --- /dev/null +++ b/sys/kern/subr_mchain.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2000, 2001 Boris Popov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Boris Popov. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/endian.h> +#include <sys/errno.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/uio.h> + +#include <sys/mchain.h> + +MODULE_VERSION(libmchain, 1); + +#define MBERROR(format, args...) printf("%s(%d): "format, __func__ , \ + __LINE__ ,## args) + +#define MBPANIC(format, args...) printf("%s(%d): "format, __func__ , \ + __LINE__ ,## args) + +/* + * Various helper functions + */ +int +m_fixhdr(struct mbuf *m0) +{ + struct mbuf *m = m0; + int len = 0; + + while (m) { + len += m->m_len; + m = m->m_next; + } + m0->m_pkthdr.len = len; + return len; +} + +int +mb_init(struct mbchain *mbp) +{ + struct mbuf *m; + + m = m_gethdr(M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + m->m_len = 0; + mb_initm(mbp, m); + return 0; +} + +void +mb_initm(struct mbchain *mbp, struct mbuf *m) +{ + bzero(mbp, sizeof(*mbp)); + mbp->mb_top = mbp->mb_cur = m; + mbp->mb_mleft = M_TRAILINGSPACE(m); +} + +void +mb_done(struct mbchain *mbp) +{ + if (mbp->mb_top) { + m_freem(mbp->mb_top); + mbp->mb_top = NULL; + } +} + +struct mbuf * +mb_detach(struct mbchain *mbp) +{ + struct mbuf *m; + + m = mbp->mb_top; + mbp->mb_top = NULL; + return m; +} + +int +mb_fixhdr(struct mbchain *mbp) +{ + return mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top); +} + +/* + * Check if object of size 'size' fit to the current position and + * allocate new mbuf if not. Advance pointers and increase length of mbuf(s). + * Return pointer to the object placeholder or NULL if any error occured. 
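As an illustration of the mbchain building interface, a hypothetical routine assembling a small little-endian request with the mb_put_*() routines defined just below; the opcode and message layout are made up.

/* Hypothetical sketch of composing a message with an mbchain. */
static int
example_build_request(struct mbuf **mp, c_caddr_t payload, int plen)
{
	struct mbchain mb;
	int error;

	if ((error = mb_init(&mb)) != 0)
		return (error);
	mb_put_uint16le(&mb, 0x0001);		/* hypothetical opcode */
	mb_put_uint32le(&mb, plen);		/* payload length */
	error = mb_put_mem(&mb, payload, plen, MB_MSYSTEM);
	if (error) {
		mb_done(&mb);			/* frees the partial chain */
		return (error);
	}
	mb_fixhdr(&mb);				/* recompute m_pkthdr.len */
	*mp = mb_detach(&mb);			/* caller now owns the chain */
	return (0);
}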
+ * Note: size should be <= MLEN + */ +caddr_t +mb_reserve(struct mbchain *mbp, int size) +{ + struct mbuf *m, *mn; + caddr_t bpos; + + if (size > MLEN) + panic("mb_reserve: size = %d\n", size); + m = mbp->mb_cur; + if (mbp->mb_mleft < size) { + mn = m_get(M_TRYWAIT, MT_DATA); + if (mn == NULL) + return NULL; + mbp->mb_cur = m->m_next = mn; + m = mn; + m->m_len = 0; + mbp->mb_mleft = M_TRAILINGSPACE(m); + } + mbp->mb_mleft -= size; + mbp->mb_count += size; + bpos = mtod(m, caddr_t) + m->m_len; + m->m_len += size; + return bpos; +} + +int +mb_put_uint8(struct mbchain *mbp, u_int8_t x) +{ + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint16be(struct mbchain *mbp, u_int16_t x) +{ + x = htobes(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint16le(struct mbchain *mbp, u_int16_t x) +{ + x = htoles(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint32be(struct mbchain *mbp, u_int32_t x) +{ + x = htobel(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint32le(struct mbchain *mbp, u_int32_t x) +{ + x = htolel(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_int64be(struct mbchain *mbp, int64_t x) +{ + x = htobeq(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_int64le(struct mbchain *mbp, int64_t x) +{ + x = htoleq(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type) +{ + struct mbuf *m; + caddr_t dst; + c_caddr_t src; + int cplen, error, mleft, count; + + m = mbp->mb_cur; + mleft = mbp->mb_mleft; + + while (size > 0) { + if (mleft == 0) { + if (m->m_next == NULL) { + m = m_getm(m, size, M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + } + m = m->m_next; + mleft = M_TRAILINGSPACE(m); + continue; + } + cplen = mleft > size ? size : mleft; + dst = mtod(m, caddr_t) + m->m_len; + switch (type) { + case MB_MCUSTOM: + error = mbp->mb_copy(mbp, source, dst, cplen); + if (error) + return error; + break; + case MB_MINLINE: + for (src = source, count = cplen; count; count--) + *dst++ = *src++; + break; + case MB_MSYSTEM: + bcopy(source, dst, cplen); + break; + case MB_MUSER: + error = copyin(source, dst, cplen); + if (error) + return error; + break; + case MB_MZERO: + bzero(dst, cplen); + break; + } + size -= cplen; + source += cplen; + m->m_len += cplen; + mleft -= cplen; + mbp->mb_count += cplen; + } + mbp->mb_cur = m; + mbp->mb_mleft = mleft; + return 0; +} + +int +mb_put_mbuf(struct mbchain *mbp, struct mbuf *m) +{ + mbp->mb_cur->m_next = m; + while (m) { + mbp->mb_count += m->m_len; + if (m->m_next == NULL) + break; + m = m->m_next; + } + mbp->mb_mleft = M_TRAILINGSPACE(m); + mbp->mb_cur = m; + return 0; +} + +/* + * copies a uio scatter/gather list to an mbuf chain. + */ +int +mb_put_uio(struct mbchain *mbp, struct uio *uiop, int size) +{ + long left; + int mtype, error; + + mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? 
MB_MSYSTEM : MB_MUSER; + + while (size > 0 && uiop->uio_resid) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return EFBIG; + left = uiop->uio_iov->iov_len; + if (left == 0) { + uiop->uio_iov++; + uiop->uio_iovcnt--; + continue; + } + if (left > size) + left = size; + error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype); + if (error) + return error; + uiop->uio_offset += left; + uiop->uio_resid -= left; + uiop->uio_iov->iov_base += left; + uiop->uio_iov->iov_len -= left; + size -= left; + } + return 0; +} + +/* + * Routines for fetching data from an mbuf chain + */ +int +md_init(struct mdchain *mdp) +{ + struct mbuf *m; + + m = m_gethdr(M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + m->m_len = 0; + md_initm(mdp, m); + return 0; +} + +void +md_initm(struct mdchain *mdp, struct mbuf *m) +{ + bzero(mdp, sizeof(*mdp)); + mdp->md_top = mdp->md_cur = m; + mdp->md_pos = mtod(m, u_char*); +} + +void +md_done(struct mdchain *mdp) +{ + if (mdp->md_top) { + m_freem(mdp->md_top); + mdp->md_top = NULL; + } +} + +/* + * Append a separate mbuf chain. It is caller responsibility to prevent + * multiple calls to fetch/record routines. + */ +void +md_append_record(struct mdchain *mdp, struct mbuf *top) +{ + struct mbuf *m; + + if (mdp->md_top == NULL) { + md_initm(mdp, top); + return; + } + m = mdp->md_top; + while (m->m_nextpkt) + m = m->m_nextpkt; + m->m_nextpkt = top; + top->m_nextpkt = NULL; + return; +} + +/* + * Put next record in place of existing + */ +int +md_next_record(struct mdchain *mdp) +{ + struct mbuf *m; + + if (mdp->md_top == NULL) + return ENOENT; + m = mdp->md_top->m_nextpkt; + md_done(mdp); + if (m == NULL) + return ENOENT; + md_initm(mdp, m); + return 0; +} + +int +md_get_uint8(struct mdchain *mdp, u_int8_t *x) +{ + return md_get_mem(mdp, x, 1, MB_MINLINE); +} + +int +md_get_uint16(struct mdchain *mdp, u_int16_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE); +} + +int +md_get_uint16le(struct mdchain *mdp, u_int16_t *x) +{ + u_int16_t v; + int error = md_get_uint16(mdp, &v); + + *x = letohs(v); + return error; +} + +int +md_get_uint16be(struct mdchain *mdp, u_int16_t *x) { + u_int16_t v; + int error = md_get_uint16(mdp, &v); + + *x = betohs(v); + return error; +} + +int +md_get_uint32(struct mdchain *mdp, u_int32_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE); +} + +int +md_get_uint32be(struct mdchain *mdp, u_int32_t *x) +{ + u_int32_t v; + int error; + + error = md_get_uint32(mdp, &v); + *x = betohl(v); + return error; +} + +int +md_get_uint32le(struct mdchain *mdp, u_int32_t *x) +{ + u_int32_t v; + int error; + + error = md_get_uint32(mdp, &v); + *x = letohl(v); + return error; +} + +int +md_get_int64(struct mdchain *mdp, int64_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE); +} + +int +md_get_int64be(struct mdchain *mdp, int64_t *x) +{ + int64_t v; + int error; + + error = md_get_int64(mdp, &v); + *x = betohq(v); + return error; +} + +int +md_get_int64le(struct mdchain *mdp, int64_t *x) +{ + int64_t v; + int error; + + error = md_get_int64(mdp, &v); + *x = letohq(v); + return error; +} + +int +md_get_mem(struct mdchain *mdp, caddr_t target, int size, int type) +{ + struct mbuf *m = mdp->md_cur; + int error; + u_int count; + u_char *s; + + while (size > 0) { + if (m == NULL) { + MBERROR("incomplete copy\n"); + return EBADRPC; + } + s = mdp->md_pos; + count = mtod(m, u_char*) + m->m_len - s; + if (count == 0) { + mdp->md_cur = m = m->m_next; + if (m) + s = mdp->md_pos = mtod(m, caddr_t); + continue; + } + if (count 
> size) + count = size; + size -= count; + mdp->md_pos += count; + if (target == NULL) + continue; + switch (type) { + case MB_MUSER: + error = copyout(s, target, count); + if (error) + return error; + break; + case MB_MSYSTEM: + bcopy(s, target, count); + break; + case MB_MINLINE: + while (count--) + *target++ = *s++; + continue; + } + target += count; + } + return 0; +} + +int +md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret) +{ + struct mbuf *m = mdp->md_cur, *rm; + + rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_TRYWAIT); + if (rm == NULL) + return EBADRPC; + md_get_mem(mdp, NULL, size, MB_MZERO); + *ret = rm; + return 0; +} + +int +md_get_uio(struct mdchain *mdp, struct uio *uiop, int size) +{ + char *uiocp; + long left; + int mtype, error; + + mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER; + while (size > 0 && uiop->uio_resid) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return EFBIG; + left = uiop->uio_iov->iov_len; + if (left == 0) { + uiop->uio_iov++; + uiop->uio_iovcnt--; + continue; + } + uiocp = uiop->uio_iov->iov_base; + if (left > size) + left = size; + error = md_get_mem(mdp, uiocp, left, mtype); + if (error) + return error; + uiop->uio_offset += left; + uiop->uio_resid -= left; + uiop->uio_iov->iov_base += left; + uiop->uio_iov->iov_len -= left; + size -= left; + } + return 0; +} diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c new file mode 100644 index 0000000..ce74eca --- /dev/null +++ b/sys/kern/subr_module.c @@ -0,0 +1,266 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/linker.h> + +/* + * Preloaded module support + */ + +caddr_t preload_metadata; + +/* + * Search for the preloaded module (name) + */ +caddr_t +preload_search_by_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if ((hdr[0] == MODINFO_NAME) && + !strcmp(name, curp + sizeof(u_int32_t) * 2)) + return(curp); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Search for the first preloaded module of (type) + */ +caddr_t +preload_search_by_type(const char *type) +{ + caddr_t curp, lname; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(u_int32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Walk through the preloaded module list + */ +caddr_t +preload_search_next_name(caddr_t base) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + /* Pick up where we left off last time */ + if (base) { + /* skip to next field */ + curp = base; + hdr = (u_int32_t *)curp; + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } else + curp = preload_metadata; + + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Found a new record? */ + if (hdr[0] == MODINFO_NAME) + return curp; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Given a preloaded module handle (mod), return a pointer + * to the data for the attribute (inf). + */ +caddr_t +preload_search_info(caddr_t mod, int inf) +{ + caddr_t curp; + u_int32_t *hdr; + u_int32_t type = 0; + int next; + + curp = mod; + for (;;) { + hdr = (u_int32_t *)curp; + /* end of module data? */ + if (hdr[0] == 0 && hdr[1] == 0) + break; + /* + * We give up once we've looped back to what we were looking at + * first - this should normally be a MODINFO_NAME field. + */ + if (type == 0) { + type = hdr[0]; + } else { + if (hdr[0] == type) + break; + } + + /* + * Attribute match? Return pointer to data. + * Consumer may safely assume that size value precedes + * data. + */ + if (hdr[0] == inf) + return(curp + (sizeof(u_int32_t) * 2)); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + return(NULL); +} + +/* + * Delete a preload record by name. 
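+ * Like the search routines above, this walks the metadata as a sequence
+ * of (type word, length word, data) records, each rounded up to a
+ * sizeof(u_long) boundary and terminated by a pair of zero words.
+ *
+ * A minimal lookup sketch (the type string and attribute chosen here are
+ * only illustrative):
+ *
+ *	caddr_t mod, attr;
+ *
+ *	mod = preload_search_by_type("elf kernel");
+ *	if (mod != NULL)
+ *		attr = preload_search_info(mod, MODINFO_ADDR);
+ *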
+ */ +void +preload_delete_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + int clearing; + + if (preload_metadata != NULL) { + + clearing = 0; + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if (hdr[0] == MODINFO_NAME) { + if (!strcmp(name, curp + sizeof(u_int32_t) * 2)) + clearing = 1; /* got it, start clearing */ + else if (clearing) + clearing = 0; /* at next one now.. better stop */ + } + if (clearing) + hdr[0] = MODINFO_EMPTY; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} + +/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */ +void +preload_bootstrap_relocate(vm_offset_t offset) +{ + caddr_t curp; + u_int32_t *hdr; + vm_offset_t *ptr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Deal with the ones that we know we have to fix */ + switch (hdr[0]) { + case MODINFO_ADDR: + case MODINFO_METADATA|MODINFOMD_SSYM: + case MODINFO_METADATA|MODINFOMD_ESYM: + ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2)); + *ptr += offset; + break; + } + /* The rest is beyond us for now */ + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..820fe0107 --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include "opt_maxusers.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> + +#include <machine/vmparam.h> + +/* + * System parameter formulae. + */ + +#ifndef HZ +#define HZ 100 +#endif +#define NPROC (20 + 16 * maxusers) +#ifndef NBUF +#define NBUF 0 +#endif +#ifndef MAXFILES +#define MAXFILES (maxproc * 2) +#endif + +int hz; +int tick; +int maxusers; /* base tunable */ +int maxproc; /* maximum # of processes */ +int maxprocperuid; /* max # of procs per user */ +int maxfiles; /* sys. wide open files limit */ +int maxfilesperproc; /* per-proc open files limit */ +int ncallout; /* maximum # of timer events */ +int nbuf; +int nswbuf; +int maxswzone; /* max swmeta KVA storage */ +int maxbcache; /* max buffer cache KVA storage */ +u_quad_t maxtsiz; /* max text size */ +u_quad_t dfldsiz; /* initial data size limit */ +u_quad_t maxdsiz; /* max data size */ +u_quad_t dflssiz; /* initial stack size limit */ +u_quad_t maxssiz; /* max stack size */ +u_quad_t sgrowsiz; /* amount to grow stack */ + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; + +/* + * Boot time overrides that are not scaled against main memory + */ +void +init_param1(void) +{ + + hz = HZ; + TUNABLE_INT_FETCH("kern.hz", &hz); + tick = 1000000 / hz; + +#ifdef VM_SWZONE_SIZE_MAX + maxswzone = VM_SWZONE_SIZE_MAX; +#endif + TUNABLE_INT_FETCH("kern.maxswzone", &maxswzone); +#ifdef VM_BCACHE_SIZE_MAX + maxbcache = VM_BCACHE_SIZE_MAX; +#endif + TUNABLE_INT_FETCH("kern.maxbcache", &maxbcache); + + maxtsiz = MAXTSIZ; + TUNABLE_QUAD_FETCH("kern.maxtsiz", &maxtsiz); + dfldsiz = DFLDSIZ; + TUNABLE_QUAD_FETCH("kern.dfldsiz", &dfldsiz); + maxdsiz = MAXDSIZ; + TUNABLE_QUAD_FETCH("kern.maxdsiz", &maxdsiz); + dflssiz = DFLSSIZ; + TUNABLE_QUAD_FETCH("kern.dflssiz", &dflssiz); + maxssiz = MAXSSIZ; + TUNABLE_QUAD_FETCH("kern.maxssiz", &maxssiz); + sgrowsiz = SGROWSIZ; + TUNABLE_QUAD_FETCH("kern.sgrowsiz", &sgrowsiz); +} + +/* + * Boot time overrides that are scaled against main memory + */ +void +init_param2(int physpages) +{ + + /* Base parameters */ + maxusers = MAXUSERS; + TUNABLE_INT_FETCH("kern.maxusers", &maxusers); + if (maxusers == 0) { + maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE); + if (maxusers < 32) + maxusers = 32; + if (maxusers > 384) + maxusers = 384; + } + + /* + * The following can be overridden after boot via sysctl. Note: + * unless overriden, these macros are ultimately based on maxusers. + */ + maxproc = NPROC; + TUNABLE_INT_FETCH("kern.maxproc", &maxproc); + /* + * Limit maxproc so that kmap entries cannot be exhausted by + * processes. 
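+	 *
+	 * As a rough worked example (assuming 4 KB pages): a machine with
+	 * 512 MB of RAM has physpages = 131072, so the auto-sized maxusers
+	 * above is 131072 / 512 = 256 and maxproc starts at
+	 * 20 + 16 * 256 = 4116; the physpages / 12 cap below (about 10922
+	 * here) does not bite in that case.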
+ */ + if (maxproc > (physpages / 12)) + maxproc = physpages / 12; + maxfiles = MAXFILES; + TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles); + maxprocperuid = (maxproc * 9) / 10; + maxfilesperproc = (maxfiles * 9) / 10; + + /* + * Cannot be changed after boot. + */ + nbuf = NBUF; + TUNABLE_INT_FETCH("kern.nbuf", &nbuf); + + ncallout = 16 + maxproc + maxfiles; + TUNABLE_INT_FETCH("kern.ncallout", &ncallout); +} diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c new file mode 100644 index 0000000..132e957 --- /dev/null +++ b/sys/kern/subr_pcpu.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2001 Wind River Systems, Inc. + * All rights reserved. + * Written by: John Baldwin <jhb@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This module provides MI support for per-cpu data. + * + * Each architecture determines the mapping of logical CPU IDs to physical + * CPUs. The requirements of this mapping are as follows: + * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. + * - The mapping is not required to be dense. That is, there may be + * gaps in the mappings. + * - The platform sets the value of MAXCPU in <machine/param.h>. + * - It is suggested, but not required, that in the non-SMP case, the + * platform define MAXCPU to be 1 and define the logical ID of the + * sole CPU as 0. + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/linker_set.h> +#include <sys/lock.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <ddb/ddb.h> + +static struct pcpu *cpuid_to_pcpu[MAXCPU]; +struct cpuhead cpuhead = SLIST_HEAD_INITIALIZER(cpuhead); + +/* + * Initialize the MI portions of a struct pcpu. + */ +void +pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) +{ + + bzero(pcpu, size); + KASSERT(cpuid >= 0 && cpuid < MAXCPU, + ("pcpu_init: invalid cpuid %d", cpuid)); + pcpu->pc_cpuid = cpuid; + pcpu->pc_cpumask = 1 << cpuid; + cpuid_to_pcpu[cpuid] = pcpu; + SLIST_INSERT_HEAD(&cpuhead, pcpu, pc_allcpu); + cpu_pcpu_init(pcpu, cpuid, size); +} + +/* + * Destroy a struct pcpu. 
+ */ +void +pcpu_destroy(struct pcpu *pcpu) +{ + + SLIST_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); + cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; +} + +/* + * Locate a struct pcpu by cpu id. + */ +struct pcpu * +pcpu_find(u_int cpuid) +{ + + return (cpuid_to_pcpu[cpuid]); +} + +#ifdef DDB +DB_SHOW_COMMAND(pcpu, db_show_pcpu) +{ + struct pcpu *pc; + struct thread *td; + int id; + + if (have_addr) + id = ((addr >> 4) % 16) * 10 + (addr % 16); + else + id = PCPU_GET(cpuid); + pc = pcpu_find(id); + if (pc == NULL) { + db_printf("CPU %d not found\n", id); + return; + } + db_printf("cpuid = %d\n", pc->pc_cpuid); + db_printf("curthread = "); + td = pc->pc_curthread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_printf("curpcb = %p\n", pc->pc_curpcb); + db_printf("fpcurthread = "); + td = pc->pc_fpcurthread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_printf("idlethread = "); + td = pc->pc_idlethread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_show_mdpcpu(pc); + +#ifdef WITNESS + db_printf("spin locks held:\n"); + witness_list_locks(&pc->pc_spinlocks); +#endif +} +#endif diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c new file mode 100644 index 0000000..7c96c9e --- /dev/null +++ b/sys/kern/subr_power.c @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2001 Mitsuru IWASAKI + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <sys/power.h> + +static u_int power_pm_type = POWER_PM_TYPE_NONE; +static power_pm_fn_t power_pm_fn = NULL; +static void *power_pm_arg = NULL; + +int +power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg) +{ + int error; + + if (power_pm_type == POWER_PM_TYPE_NONE || + power_pm_type == pm_type) { + power_pm_type = pm_type; + power_pm_fn = pm_fn; + power_pm_arg = pm_arg; + error = 0; + } else { + error = ENXIO; + } + + return (error); +} + +u_int +power_pm_get_type(void) +{ + + return (power_pm_type); +} + +void +power_pm_suspend(int state) +{ + if (power_pm_fn == NULL) + return; + + if (state != POWER_SLEEP_STATE_STANDBY && + state != POWER_SLEEP_STATE_SUSPEND && + state != POWER_SLEEP_STATE_HIBERNATE) + return; + + power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state); +} + +/* + * Power profile. + */ + +static int power_profile_state = POWER_PROFILE_PERFORMANCE; + +int +power_profile_get_state(void) +{ + return (power_profile_state); +} + +void +power_profile_set_state(int state) +{ + int changed; + + if (state != power_profile_state) { + power_profile_state = state; + changed = 1; + printf("system power profile changed to '%s'\n", + (state == POWER_PROFILE_PERFORMANCE) ? "performance" : "economy"); + } else { + changed = 0; + } + + if (changed) + EVENTHANDLER_INVOKE(power_profile_change); +} + diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c new file mode 100644 index 0000000..7f9b790 --- /dev/null +++ b/sys/kern/subr_prf.c @@ -0,0 +1,905 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/kernel.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/tty.h> +#include <sys/syslog.h> +#include <sys/cons.h> +#include <sys/uio.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ +#define MAXNBUF (sizeof(intmax_t) * NBBY + 1) + +struct putchar_arg { + int flags; + int pri; + struct tty *tty; +}; + +struct snprintf_arg { + char *str; + size_t remain; +}; + +extern int log_open; + +struct tty *constty; /* pointer to console "window" tty */ + +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void msglogchar(int c, int pri); +static void msgaddchar(int c, void *dummy); +static void putchar(int ch, void *arg); +static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len); +static void snprintf_func(int ch, void *arg); + +static int consintr = 1; /* Ok to handle console interrupts? */ +static int msgbufmapped; /* Set when safe to use msgbuf */ +int msgbuftrigger; + +static int log_console_output = 1; +SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RW, + &log_console_output, 0, ""); + +/* + * Warn that a system table is full. + */ +void +tablefull(const char *tab) +{ + + log(LOG_ERR, "%s: table is full\n", tab); +} + +/* + * Uprintf prints to the controlling terminal for the current process. + * It may block if the tty queue is overfull. No message is printed if + * the queue does not clear in a reasonable time. + */ +int +uprintf(const char *fmt, ...) +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + va_list ap; + struct putchar_arg pca; + int retval; + + if (td == NULL || td == PCPU_GET(idlethread)) + return (0); + + p = td->td_proc; + PROC_LOCK(p); + if ((p->p_flag & P_CONTROLT) == 0) { + PROC_UNLOCK(p); + return (0); + } + SESS_LOCK(p->p_session); + pca.tty = p->p_session->s_ttyp; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + if (pca.tty == NULL) + return (0); + pca.flags = TOTTY; + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + + return (retval); +} + +/* + * tprintf prints on the controlling terminal associated + * with the given session, possibly to the log as well. + */ +void +tprintf(struct proc *p, int pri, const char *fmt, ...) 
+{ + struct tty *tp = NULL; + int flags = 0, shld = 0; + va_list ap; + struct putchar_arg pca; + int retval; + + if (pri != -1) + flags |= TOLOG; + if (p != NULL) { + PROC_LOCK(p); + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + SESS_LOCK(p->p_session); + SESSHOLD(p->p_session); + tp = p->p_session->s_ttyp; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + shld++; + if (ttycheckoutq(tp, 0)) + flags |= TOTTY; + else + tp = NULL; + } else + PROC_UNLOCK(p); + } + pca.pri = pri; + pca.tty = tp; + pca.flags = flags; + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (shld) { + PROC_LOCK(p); + SESS_LOCK(p->p_session); + SESSRELE(p->p_session); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + } + msgbuftrigger = 1; +} + +/* + * Ttyprintf displays a message on a tty; it should be used only by + * the tty driver, or anything that knows the underlying tty will not + * be revoke(2)'d away. Other callers should use tprintf. + */ +int +ttyprintf(struct tty *tp, const char *fmt, ...) +{ + va_list ap; + struct putchar_arg pca; + int retval; + + va_start(ap, fmt); + pca.tty = tp; + pca.flags = TOTTY; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + return (retval); +} + +/* + * Log writes to the log buffer, and guarantees not to sleep (so can be + * called by interrupt routines). If there is no process reading the + * log yet, it writes to the console also. + */ +void +log(int level, const char *fmt, ...) +{ + va_list ap; + int retval; + struct putchar_arg pca; + + pca.tty = NULL; + pca.pri = level; + pca.flags = log_open ? TOLOG : TOCONS; + + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + + msgbuftrigger = 1; +} + +#define CONSCHUNK 128 + +void +log_console(struct uio *uio) +{ + int c, i, error, iovlen, nl; + struct uio muio; + struct iovec *miov = NULL; + char *consbuffer; + int pri; + + if (!log_console_output) + return; + + pri = LOG_INFO | LOG_CONSOLE; + muio = *uio; + iovlen = uio->uio_iovcnt * sizeof (struct iovec); + MALLOC(miov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + MALLOC(consbuffer, char *, CONSCHUNK, M_TEMP, M_WAITOK); + bcopy(muio.uio_iov, miov, iovlen); + muio.uio_iov = miov; + uio = &muio; + + nl = 0; + while (uio->uio_resid > 0) { + c = imin(uio->uio_resid, CONSCHUNK); + error = uiomove(consbuffer, c, uio); + if (error != 0) + return; + for (i = 0; i < c; i++) { + msglogchar(consbuffer[i], pri); + if (consbuffer[i] == '\n') + nl = 1; + else + nl = 0; + } + } + if (!nl) + msglogchar('\n', pri); + msgbuftrigger = 1; + FREE(miov, M_TEMP); + FREE(consbuffer, M_TEMP); + return; +} + +int +printf(const char *fmt, ...) +{ + va_list ap; + int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + pca.pri = -1; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (!panicstr) + msgbuftrigger = 1; + consintr = savintr; /* reenable interrupts */ + return (retval); +} + +int +vprintf(const char *fmt, va_list ap) +{ + int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + pca.pri = -1; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + msgbuftrigger = 1; + consintr = savintr; /* reenable interrupts */ + return (retval); +} + +/* + * Print a character on console or users terminal. 
If destination is + * the console then the last bunch of characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, ap->pri); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return (retval); +} + +/* + * Scaled down version of vsprintf(3). + */ +int +vsprintf(char *buf, const char *cfmt, va_list ap) +{ + int retval; + + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + return (retval); +} + +/* + * Scaled down version of snprintf(3). + */ +int +snprintf(char *str, size_t size, const char *format, ...) +{ + int retval; + va_list ap; + + va_start(ap, format); + retval = vsnprintf(str, size, format, ap); + va_end(ap); + return(retval); +} + +/* + * Scaled down version of vsnprintf(3). + */ +int +vsnprintf(char *str, size_t size, const char *format, va_list ap) +{ + struct snprintf_arg info; + int retval; + + info.str = str; + info.remain = size; + retval = kvprintf(format, snprintf_func, &info, 10, ap); + if (info.remain >= 1) + *info.str++ = '\0'; + return (retval); +} + +static void +snprintf_func(int ch, void *arg) +{ + struct snprintf_arg *const info = arg; + + if (info->remain >= 2) { + *info->str++ = ch; + info->remain--; + } +} + +/* + * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse + * order; return an optional length and a pointer to the last character + * written in the buffer (i.e., the first character of the string). + * The buffer pointed to by `nbuf' must have length >= MAXNBUF. + */ +static char * +ksprintn(char *nbuf, uintmax_t num, int base, int *lenp) +{ + char *p; + + p = nbuf; + *p = '\0'; + do { + *++p = hex2ascii(num % base); + } while (num /= base); + if (lenp) + *lenp = p - nbuf; + return (p); +} + +/* + * Scaled down version of printf(3). + * + * Two additional formats: + * + * The format %b is supported to decode error registers. + * Its usage is: + * + * printf("reg=%b\n", regval, "<base><arg>*"); + * + * where <base> is the output base expressed as a control character, e.g. + * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, + * the first of which gives the bit number to be inspected (origin 1), and + * the next characters (up to a control character, i.e. a character <= 32), + * give the name of the register. Thus: + * + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * + * would produce output: + * + * reg=3<BITTWO,BITONE> + * + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... 
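+ *
+ * As a further worked example of %b (the register value and bit names
+ * here are made up):
+ *
+ *	printf("status=%b\n", 5, "\20\3ERROR\2BUSY\1DONE");
+ *
+ * prints "status=5<ERROR,DONE>": the value is printed in the base given
+ * by \20 (hex), then each set bit named in the string is listed, and
+ * clear bits (BUSY, bit 2) are skipped.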
+ */ +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char nbuf[MAXNBUF]; + char *d; + const char *p, *percent, *q; + u_char *up; + int ch, n; + uintmax_t num; + int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int jflag; + int dwidth; + char padc; + int retval = 0; + + num = 0; + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return (retval); + PCHAR(ch); + } + percent = fmt - 1; + qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; + jflag = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + num = va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(nbuf, num, *p++, NULL); *q;) + PCHAR(*q--); + + if (num == 0) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (num & (1 << (n - 1))) { + PCHAR(tmp ? 
',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + base = 10; + sign = 1; + goto handle_sign; + case 'j': + jflag = 1; + goto reswitch; + case 'l': + if (lflag) { + lflag = 0; + qflag = 1; + } else + lflag = 1; + goto reswitch; + case 'n': + if (jflag) + *(va_arg(ap, intmax_t *)) = retval; + else if (qflag) + *(va_arg(ap, quad_t *)) = retval; + else if (lflag) + *(va_arg(ap, long *)) = retval; + else + *(va_arg(ap, int *)) = retval; + break; + case 'o': + base = 8; + goto handle_nosign; + case 'p': + base = 16; + sharpflag = (width == 0); + sign = 0; + num = (uintptr_t)va_arg(ap, void *); + goto number; + case 'q': + qflag = 1; + goto reswitch; + case 'r': + base = radix; + if (sign) + goto handle_sign; + goto handle_nosign; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 'u': + base = 10; + goto handle_nosign; + case 'x': + case 'X': + base = 16; + goto handle_nosign; + case 'z': + base = 16; + if (sign) + goto handle_sign; +handle_nosign: + sign = 0; + if (jflag) + num = va_arg(ap, uintmax_t); + else if (qflag) + num = va_arg(ap, u_quad_t); + else if (lflag) + num = va_arg(ap, u_long); + else + num = va_arg(ap, u_int); + goto number; +handle_sign: + if (jflag) + num = va_arg(ap, intmax_t); + else if (qflag) + num = va_arg(ap, quad_t); + else if (lflag) + num = va_arg(ap, long); + else + num = va_arg(ap, int); +number: + if (sign && (intmax_t)num < 0) { + neg = 1; + num = -(intmax_t)num; + } + p = ksprintn(nbuf, num, base, &tmp); + if (sharpflag && num != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && num != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + while (percent < fmt) + PCHAR(*percent++); + break; + } + } +#undef PCHAR +} + +/* + * Put character in log buffer with a particular priority. 
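+ * Whenever the priority changes, the new priority is recorded in the
+ * buffer as a decimal "<pri>" prefix (for example "<3>" before a
+ * LOG_ERR line), so that a later reader of the message buffer can
+ * recover the priority of each line.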
+ */ +static void +msglogchar(int c, int pri) +{ + static int lastpri = -1; + static int dangling; + char nbuf[MAXNBUF]; + char *p; + + if (!msgbufmapped) + return; + if (c == '\0' || c == '\r') + return; + if (pri != -1 && pri != lastpri) { + if (dangling) { + msgaddchar('\n', NULL); + dangling = 0; + } + msgaddchar('<', NULL); + for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL); *p;) + msgaddchar(*p--, NULL); + msgaddchar('>', NULL); + lastpri = pri; + } + msgaddchar(c, NULL); + if (c == '\n') { + dangling = 0; + lastpri = -1; + } else { + dangling = 1; + } +} + +/* + * Put char in log buffer + */ +static void +msgaddchar(int c, void *dummy) +{ + struct msgbuf *mbp; + + if (!msgbufmapped) + return; + mbp = msgbufp; + mbp->msg_ptr[mbp->msg_bufx++] = c; + if (mbp->msg_bufx >= mbp->msg_size) + mbp->msg_bufx = 0; + /* If the buffer is full, keep the most recent data. */ + if (mbp->msg_bufr == mbp->msg_bufx) { + if (++mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } +} + +static void +msgbufcopy(struct msgbuf *oldp) +{ + int pos; + + pos = oldp->msg_bufr; + while (pos != oldp->msg_bufx) { + msglogchar(oldp->msg_ptr[pos], -1); + if (++pos >= oldp->msg_size) + pos = 0; + } +} + +void +msgbufinit(void *ptr, size_t size) +{ + char *cp; + static struct msgbuf *oldp = NULL; + + size -= sizeof(*msgbufp); + cp = (char *)ptr; + msgbufp = (struct msgbuf *) (cp + size); + if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_size != size || + msgbufp->msg_bufx >= size || msgbufp->msg_bufr >= size) { + bzero(cp, size); + bzero(msgbufp, sizeof(*msgbufp)); + msgbufp->msg_magic = MSG_MAGIC; + msgbufp->msg_size = (char *)msgbufp - cp; + } + msgbufp->msg_ptr = cp; + if (msgbufmapped && oldp != msgbufp) + msgbufcopy(oldp); + msgbufmapped = 1; + oldp = msgbufp; +} + +SYSCTL_DECL(_security_bsd); + +static int unprivileged_read_msgbuf = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf, + CTLFLAG_RW, &unprivileged_read_msgbuf, 0, + "Unprivileged processes may read the kernel message buffer"); + +/* Sysctls for accessing/clearing the msgbuf */ +static int +sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) +{ + int error; + + if (!unprivileged_read_msgbuf) { + error = suser(req->td); + if (error) + return (error); + } + + /* + * Unwind the buffer, so that it's linear (possibly starting with + * some initial nulls). 
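+	 * Since msg_bufx is the next write position in the circular buffer,
+	 * the oldest data lives at msg_bufx..msg_size-1 and the newest at
+	 * 0..msg_bufx-1; the two copies below therefore return the contents
+	 * in chronological order.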
+ */ + error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr + msgbufp->msg_bufx, + msgbufp->msg_size - msgbufp->msg_bufx, req); + if (error) + return (error); + if (msgbufp->msg_bufx > 0) { + error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr, + msgbufp->msg_bufx, req); + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD, + 0, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); + +static int msgbuf_clear; + +static int +sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (!error && req->newptr) { + /* Clear the buffer and reset write pointer */ + bzero(msgbufp->msg_ptr, msgbufp->msg_size); + msgbufp->msg_bufr = msgbufp->msg_bufx = 0; + msgbuf_clear = 0; + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, &msgbuf_clear, 0, + sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) +{ + int i, j; + + if (!msgbufmapped) { + db_printf("msgbuf not mapped yet\n"); + return; + } + db_printf("msgbufp = %p\n", msgbufp); + db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n", + msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr, + msgbufp->msg_bufx, msgbufp->msg_ptr); + for (i = 0; i < msgbufp->msg_size; i++) { + j = (i + msgbufp->msg_bufr) % msgbufp->msg_size; + db_printf("%c", msgbufp->msg_ptr[j]); + } + db_printf("\n"); +} + +#endif /* DDB */ diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c new file mode 100644 index 0000000..706863d --- /dev/null +++ b/sys/kern/subr_prof.c @@ -0,0 +1,531 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> + +#ifdef GPROF +#include <sys/malloc.h> +#include <sys/gmon.h> +#undef MCOUNT + +static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); + +static void kmstartup(void *); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + +struct gmonparam _gmonparam = { GMON_PROF_OFF }; + +#ifdef GUPROF +#include <machine/asmacros.h> + +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + +void +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +/* + * Update the histograms to support extending the text region arbitrarily. + * This is done slightly naively (no sparse regions), so will waste slight + * amounts of memory, but will overall work nicely enough to allow profiling + * of KLDs. + */ +void +kmupetext(uintfptr_t nhighpc) +{ + struct gmonparam np; /* slightly large */ + struct gmonparam *p = &_gmonparam; + char *cp; + + GIANT_REQUIRED; + bcopy(p, &np, sizeof(*p)); + np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER)); + if (np.highpc <= p->highpc) + return; + np.textsize = np.highpc - p->lowpc; + np.kcountsize = np.textsize / HISTFRACTION; + np.hashfraction = HASHFRACTION; + np.fromssize = np.textsize / HASHFRACTION; + np.tolimit = np.textsize * ARCDENSITY / 100; + if (np.tolimit < MINARCS) + np.tolimit = MINARCS; + else if (np.tolimit > MAXARCS) + np.tolimit = MAXARCS; + np.tossize = np.tolimit * sizeof(struct tostruct); + cp = malloc(np.kcountsize + np.fromssize + np.tossize, + M_GPROF, M_WAITOK); + /* + * Check for something else extending highpc while we slept. + */ + if (np.highpc <= p->highpc) { + free(cp, M_GPROF); + return; + } + np.tos = (struct tostruct *)cp; + cp += np.tossize; + np.kcount = (HISTCOUNTER *)cp; + cp += np.kcountsize; + np.froms = (u_short *)cp; +#ifdef GUPROF + /* Reinitialize pointers to overhead counters. 
*/ + np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime)); + np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount)); + np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount)); +#endif + critical_enter(); + bcopy(p->tos, np.tos, p->tossize); + bzero((char *)np.tos + p->tossize, np.tossize - p->tossize); + bcopy(p->kcount, np.kcount, p->kcountsize); + bzero((char *)np.kcount + p->kcountsize, np.kcountsize - + p->kcountsize); + bcopy(p->froms, np.froms, p->fromssize); + bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize); + cp = (char *)p->tos; + bcopy(&np, p, sizeof(*p)); + critical_exit(); + free(cp, M_GPROF); +} + +static void +kmstartup(dummy) + void *dummy; +{ + char *cp; + struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + uintfptr_t tmp_addr; +#endif + + /* + * Round lowpc and highpc to multiples of the density we're using + * so the rest of the scaling (here and in gprof) stays in ints. + */ + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->textsize = p->highpc - p->lowpc; + printf("Profiling kernel, textsize=%lu [%x..%x]\n", + p->textsize, p->lowpc, p->highpc); + p->kcountsize = p->textsize / HISTFRACTION; + p->hashfraction = HASHFRACTION; + p->fromssize = p->textsize / HASHFRACTION; + p->tolimit = p->textsize * ARCDENSITY / 100; + if (p->tolimit < MINARCS) + p->tolimit = MINARCS; + else if (p->tolimit > MAXARCS) + p->tolimit = MAXARCS; + p->tossize = p->tolimit * sizeof(struct tostruct); + cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, + M_GPROF, M_WAITOK | M_ZERO); + p->tos = (struct tostruct *)cp; + cp += p->tossize; + p->kcount = (HISTCOUNTER *)cp; + cp += p->kcountsize; + p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + critical_enter(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("call " __XSTRING(HIDENAME(mexitcount)) "; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + __asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + critical_exit(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; + tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. + * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. 
+ */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ +} + +/* + * Return kernel profiling information. + */ +static int +sysctl_kern_prof(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + struct gmonparam *gp = &_gmonparam; + int error; + int state; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case GPROF_STATE: + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); + if (error) + return (error); + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; + stopprofclock(&proc0); + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; + startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); + return (0); + case GPROF_COUNT: + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); + case GPROF_FROMS: + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); + case GPROF_TOS: + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); + case GPROF_GMONPARAM: + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); +#endif /* GPROF */ + +/* + * Profiling system call. + * + * The scale factor is a fixed point number with 16 bits of fraction, so that + * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. 
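+ *
+ * For example, a scale of 0x8000 (0.5) maps each pair of text bytes to
+ * one byte of the sample buffer, so each 16-bit histogram counter then
+ * covers a four-byte range of pc values; see the PC_TO_INDEX() macro
+ * below for the exact index computation.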
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + size_t size; + size_t offset; + u_int scale; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +profil(td, uap) + struct thread *td; + register struct profil_args *uap; +{ + register struct uprof *upp; + int s; + int error = 0; + + mtx_lock(&Giant); + + if (uap->scale > (1 << 16)) { + error = EINVAL; + goto done2; + } + if (uap->scale == 0) { + stopprofclock(td->td_proc); + goto done2; + } + upp = &td->td_proc->p_stats->p_prof; + + /* Block profile interrupts while changing state. */ + s = splstatclock(); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; + startprofclock(td->td_proc); + splx(s); + +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Scale is a fixed-point number with the binary point 16 bits + * into the value, and is <= 1.0. pc is at most 32 bits, so the + * intermediate result is at most 48 bits. + */ +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +/* + * Collect user-level profiling statistics; called on a profiling tick, + * when a process is running in user-mode. This routine may be called + * from an interrupt context. We try to update the user profiling buffers + * cheaply with fuswintr() and suswintr(). If that fails, we revert to + * an AST that will vector us to trap() with a context in which copyin + * and copyout will work. Trap will then call addupc_task(). + * + * Note that we may (rarely) not get around to the AST soon enough, and + * lose profile ticks when the next tick overwrites this one, but in this + * case the system is overloaded and the profile is probably already + * inaccurate. + */ +void +addupc_intr(ke, pc, ticks) + register struct kse *ke; + register uintptr_t pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + register int v; + + if (ticks == 0) + return; + prof = &ke->ke_proc->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; /* out of range; ignore */ + + addr = prof->pr_base + i; + if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { + mtx_lock_spin(&sched_lock); + prof->pr_addr = pc; + prof->pr_ticks = ticks; + ke->ke_flags |= KEF_OWEUPC | KEF_ASTPENDING ; + mtx_unlock_spin(&sched_lock); + } +} + +/* + * Much like before, but we can afford to take faults here. If the + * update fails, we simply turn off profiling. 
+ */ +void +addupc_task(ke, pc, ticks) + register struct kse *ke; + register uintptr_t pc; + u_int ticks; +{ + struct proc *p = ke->ke_proc; + register struct uprof *prof; + register caddr_t addr; + register u_int i; + u_short v; + + if (ticks == 0) + return; + + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; + + addr = prof->pr_base + i; + if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) { + v += ticks; + if (copyout((caddr_t)&v, addr, sizeof(v)) == 0) + return; + } + stopprofclock(p); +} diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c new file mode 100644 index 0000000..85af088 --- /dev/null +++ b/sys/kern/subr_rman.c @@ -0,0 +1,609 @@ +/* + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The kernel resource manager. This code is responsible for keeping track + * of hardware resources which are apportioned out to various drivers. + * It does not actually assign those resources, and it is not expected + * that end-device drivers will call into this code directly. Rather, + * the code which implements the buses that those devices are attached to, + * and the code which manages CPU resources, will call this code, and the + * end-device drivers will make upcalls to that code to actually perform + * the allocation. + * + * There are two sorts of resources managed by this code. The first is + * the more familiar array (RMAN_ARRAY) type; resources in this class + * consist of a sequence of individually-allocatable objects which have + * been numbered in some well-defined order. Most of the resources + * are of this type, as it is the most familiar. The second type is + * called a gauge (RMAN_GAUGE), and models fungible resources (i.e., + * resources in which each instance is indistinguishable from every + * other instance). The principal anticipated application of gauges + * is in the context of power consumption, where a bus may have a specific + * power budget which all attached devices share. RMAN_GAUGE is not + * implemented yet. 
+ * + * For array resources, we make one simplifying assumption: two clients + * sharing the same resource must use the same range of indices. That + * is to say, sharing of overlapping-but-not-identical regions is not + * permitted. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/bus.h> /* XXX debugging */ +#include <machine/bus.h> +#include <sys/rman.h> + +#ifdef RMAN_DEBUG +#define DPRINTF(params) printf##params +#else +#define DPRINTF(params) +#endif + +static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager"); + +struct rman_head rman_head; +static struct mtx rman_mtx; /* mutex to protect rman_head */ +static int int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas); +static int int_rman_deactivate_resource(struct resource *r); +static int int_rman_release_resource(struct rman *rm, struct resource *r); + +int +rman_init(struct rman *rm) +{ + static int once; + + if (once == 0) { + once = 1; + TAILQ_INIT(&rman_head); + mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF); + } + + if (rm->rm_type == RMAN_UNINIT) + panic("rman_init"); + if (rm->rm_type == RMAN_GAUGE) + panic("implement RMAN_GAUGE"); + + TAILQ_INIT(&rm->rm_list); + rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO); + if (rm->rm_mtx == 0) + return ENOMEM; + mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF); + + mtx_lock(&rman_mtx); + TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); + mtx_unlock(&rman_mtx); + return 0; +} + +/* + * NB: this interface is not robust against programming errors which + * add multiple copies of the same region. + */ +int +rman_manage_region(struct rman *rm, u_long start, u_long end) +{ + struct resource *r, *s; + + r = malloc(sizeof *r, M_RMAN, M_NOWAIT | M_ZERO); + if (r == 0) + return ENOMEM; + r->r_start = start; + r->r_end = end; + r->r_rm = rm; + + mtx_lock(rm->rm_mtx); + for (s = TAILQ_FIRST(&rm->rm_list); + s && s->r_end < r->r_start; + s = TAILQ_NEXT(s, r_link)) + ; + + if (s == NULL) { + TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link); + } else { + TAILQ_INSERT_BEFORE(s, r, r_link); + } + + mtx_unlock(rm->rm_mtx); + return 0; +} + +int +rman_fini(struct rman *rm) +{ + struct resource *r; + + mtx_lock(rm->rm_mtx); + TAILQ_FOREACH(r, &rm->rm_list, r_link) { + if (r->r_flags & RF_ALLOCATED) { + mtx_unlock(rm->rm_mtx); + return EBUSY; + } + } + + /* + * There really should only be one of these if we are in this + * state and the code is working properly, but it can't hurt. + */ + while (!TAILQ_EMPTY(&rm->rm_list)) { + r = TAILQ_FIRST(&rm->rm_list); + TAILQ_REMOVE(&rm->rm_list, r, r_link); + free(r, M_RMAN); + } + mtx_unlock(rm->rm_mtx); + mtx_lock(&rman_mtx); + TAILQ_REMOVE(&rman_head, rm, rm_link); + mtx_unlock(&rman_mtx); + mtx_destroy(rm->rm_mtx); + free(rm->rm_mtx, M_RMAN); + + return 0; +} + +struct resource * +rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end, + u_long count, u_long bound, u_int flags, + struct device *dev) +{ + u_int want_activate; + struct resource *r, *s, *rv; + u_long rstart, rend, amask, bmask; + + rv = 0; + + DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length " + "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count, + flags, dev == NULL ? 
"<null>" : device_get_nameunit(dev))); + want_activate = (flags & RF_ACTIVE); + flags &= ~RF_ACTIVE; + + mtx_lock(rm->rm_mtx); + + for (r = TAILQ_FIRST(&rm->rm_list); + r && r->r_end < start; + r = TAILQ_NEXT(r, r_link)) + ; + + if (r == NULL) { + DPRINTF(("could not find a region\n")); + goto out; + } + + amask = (1ul << RF_ALIGNMENT(flags)) - 1; + /* If bound is 0, bmask will also be 0 */ + bmask = ~(bound - 1); + /* + * First try to find an acceptable totally-unshared region. + */ + for (s = r; s; s = TAILQ_NEXT(s, r_link)) { + DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end)); + if (s->r_start > end) { + DPRINTF(("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end)); + break; + } + if (s->r_flags & RF_ALLOCATED) { + DPRINTF(("region is allocated\n")); + continue; + } + rstart = ulmax(s->r_start, start); + /* + * Try to find a region by adjusting to boundary and alignment + * until both conditions are satisfied. This is not an optimal + * algorithm, but in most cases it isn't really bad, either. + */ + do { + rstart = (rstart + amask) & ~amask; + if (((rstart ^ (rstart + count)) & bmask) != 0) + rstart += bound - (rstart & ~bmask); + } while ((rstart & amask) != 0 && rstart < end && + rstart < s->r_end); + rend = ulmin(s->r_end, ulmax(rstart + count, end)); + DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n", + rstart, rend, (rend - rstart + 1), count)); + + if ((rend - rstart + 1) >= count) { + DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n", + rend, rstart, (rend - rstart + 1))); + if ((s->r_end - s->r_start + 1) == count) { + DPRINTF(("candidate region is entire chunk\n")); + rv = s; + rv->r_flags |= RF_ALLOCATED | flags; + rv->r_dev = dev; + goto out; + } + + /* + * If s->r_start < rstart and + * s->r_end > rstart + count - 1, then + * we need to split the region into three pieces + * (the middle one will get returned to the user). + * Otherwise, we are allocating at either the + * beginning or the end of s, so we only need to + * split it in two. The first case requires + * two new allocations; the second requires but one. + */ + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO); + if (rv == 0) + goto out; + rv->r_start = rstart; + rv->r_end = rstart + count - 1; + rv->r_flags = flags | RF_ALLOCATED; + rv->r_dev = dev; + rv->r_rm = rm; + + if (s->r_start < rv->r_start && s->r_end > rv->r_end) { + DPRINTF(("splitting region in three parts: " + "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n", + s->r_start, rv->r_start - 1, + rv->r_start, rv->r_end, + rv->r_end + 1, s->r_end)); + /* + * We are allocating in the middle. + */ + r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO); + if (r == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + r->r_start = rv->r_end + 1; + r->r_end = s->r_end; + r->r_flags = s->r_flags; + r->r_rm = rm; + s->r_end = rv->r_start - 1; + TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + TAILQ_INSERT_AFTER(&rm->rm_list, rv, r, + r_link); + } else if (s->r_start == rv->r_start) { + DPRINTF(("allocating from the beginning\n")); + /* + * We are allocating at the beginning. + */ + s->r_start = rv->r_end + 1; + TAILQ_INSERT_BEFORE(s, rv, r_link); + } else { + DPRINTF(("allocating at the end\n")); + /* + * We are allocating at the end. + */ + s->r_end = rv->r_start - 1; + TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + } + goto out; + } + } + + /* + * Now find an acceptable shared region, if the client's requirements + * allow sharing. 
By our implementation restriction, a candidate + * region must match exactly by both size and sharing type in order + * to be considered compatible with the client's request. (The + * former restriction could probably be lifted without too much + * additional work, but this does not seem warranted.) + */ + DPRINTF(("no unshared regions found\n")); + if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0) + goto out; + + for (s = r; s; s = TAILQ_NEXT(s, r_link)) { + if (s->r_start > end) + break; + if ((s->r_flags & flags) != flags) + continue; + rstart = ulmax(s->r_start, start); + rend = ulmin(s->r_end, ulmax(start + count, end)); + if (s->r_start >= start && s->r_end <= end + && (s->r_end - s->r_start + 1) == count && + (s->r_start & amask) == 0 && + ((s->r_start ^ s->r_end) & bmask) == 0) { + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO); + if (rv == 0) + goto out; + rv->r_start = s->r_start; + rv->r_end = s->r_end; + rv->r_flags = s->r_flags & + (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE); + rv->r_dev = dev; + rv->r_rm = rm; + if (s->r_sharehead == 0) { + s->r_sharehead = malloc(sizeof *s->r_sharehead, + M_RMAN, M_NOWAIT | M_ZERO); + if (s->r_sharehead == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + LIST_INIT(s->r_sharehead); + LIST_INSERT_HEAD(s->r_sharehead, s, + r_sharelink); + s->r_flags |= RF_FIRSTSHARE; + } + rv->r_sharehead = s->r_sharehead; + LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink); + goto out; + } + } + + /* + * We couldn't find anything. + */ +out: + /* + * If the user specified RF_ACTIVE in the initial flags, + * which is reflected in `want_activate', we attempt to atomically + * activate the resource. If this fails, we release the resource + * and indicate overall failure. (This behavior probably doesn't + * make sense for RF_TIMESHARE-type resources.) + */ + if (rv && want_activate) { + struct resource *whohas; + if (int_rman_activate_resource(rm, rv, &whohas)) { + int_rman_release_resource(rm, rv); + rv = 0; + } + } + + mtx_unlock(rm->rm_mtx); + return (rv); +} + +struct resource * +rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count, + u_int flags, struct device *dev) +{ + + return (rman_reserve_resource_bound(rm, start, end, count, 0, flags, + dev)); +} + +static int +int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas) +{ + struct resource *s; + int ok; + + /* + * If we are not timesharing, then there is nothing much to do. + * If we already have the resource, then there is nothing at all to do. + * If we are not on a sharing list with anybody else, then there is + * little to do. 
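+ * Otherwise, scan the sharing list and refuse to activate the
+ * resource if any other sharer currently has it active.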
+ */ + if ((r->r_flags & RF_TIMESHARE) == 0 + || (r->r_flags & RF_ACTIVE) != 0 + || r->r_sharehead == 0) { + r->r_flags |= RF_ACTIVE; + return 0; + } + + ok = 1; + for (s = LIST_FIRST(r->r_sharehead); s && ok; + s = LIST_NEXT(s, r_sharelink)) { + if ((s->r_flags & RF_ACTIVE) != 0) { + ok = 0; + *whohas = s; + } + } + if (ok) { + r->r_flags |= RF_ACTIVE; + return 0; + } + return EBUSY; +} + +int +rman_activate_resource(struct resource *r) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + rv = int_rman_activate_resource(rm, r, &whohas); + mtx_unlock(rm->rm_mtx); + return rv; +} + +int +rman_await_resource(struct resource *r, int pri, int timo) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + for (;;) { + rv = int_rman_activate_resource(rm, r, &whohas); + if (rv != EBUSY) + return (rv); /* returns with mutex held */ + + if (r->r_sharehead == 0) + panic("rman_await_resource"); + whohas->r_flags |= RF_WANTED; + rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo); + if (rv) { + mtx_unlock(rm->rm_mtx); + return (rv); + } + } +} + +static int +int_rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + r->r_flags &= ~RF_ACTIVE; + if (r->r_flags & RF_WANTED) { + r->r_flags &= ~RF_WANTED; + wakeup(r->r_sharehead); + } + return 0; +} + +int +rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + int_rman_deactivate_resource(r); + mtx_unlock(rm->rm_mtx); + return 0; +} + +static int +int_rman_release_resource(struct rman *rm, struct resource *r) +{ + struct resource *s, *t; + + if (r->r_flags & RF_ACTIVE) + int_rman_deactivate_resource(r); + + /* + * Check for a sharing list first. If there is one, then we don't + * have to think as hard. + */ + if (r->r_sharehead) { + /* + * If a sharing list exists, then we know there are at + * least two sharers. + * + * If we are in the main circleq, appoint someone else. + */ + LIST_REMOVE(r, r_sharelink); + s = LIST_FIRST(r->r_sharehead); + if (r->r_flags & RF_FIRSTSHARE) { + s->r_flags |= RF_FIRSTSHARE; + TAILQ_INSERT_BEFORE(r, s, r_link); + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } + + /* + * Make sure that the sharing list goes away completely + * if the resource is no longer being shared at all. + */ + if (LIST_NEXT(s, r_sharelink) == 0) { + free(s->r_sharehead, M_RMAN); + s->r_sharehead = 0; + s->r_flags &= ~RF_FIRSTSHARE; + } + goto out; + } + + /* + * Look at the adjacent resources in the list and see if our + * segment can be merged with any of them. + */ + s = TAILQ_PREV(r, resource_head, r_link); + t = TAILQ_NEXT(r, r_link); + + if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0 + && t != NULL && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge all three segments. + */ + s->r_end = t->r_end; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + TAILQ_REMOVE(&rm->rm_list, t, r_link); + free(t, M_RMAN); + } else if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge previous segment with ours. + */ + s->r_end = r->r_end; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } else if (t != NULL && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge next segment with ours. + */ + t->r_start = r->r_start; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } else { + /* + * At this point, we know there is nothing we + * can potentially merge with, because on each + * side, there is either nothing there or what is + * there is still allocated. 
In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *r)
+{
+ int rv;
+ struct rman *rm = r->r_rm;
+
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_release_resource(rm, r);
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+uint32_t
+rman_make_alignment_flags(uint32_t size)
+{
+ int i;
+
+ /*
+ * Find the highest bit set, and add one if more than one bit
+ * is set. We're effectively computing the ceil(log2(size)) here.
+ */
+ for (i = 31; i > 0; i--)
+ if ((1 << i) & size)
+ break;
+ if (~(1 << i) & size)
+ i++;
+
+ return(RF_ALIGNMENT_LOG2(i));
+}
diff --git a/sys/kern/subr_rtc.c b/sys/kern/subr_rtc.c
new file mode 100644
index 0000000..a79e331
--- /dev/null
+++ b/sys/kern/subr_rtc.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures. + */ + +/* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/sysctl.h> +#include <sys/timetc.h> + +#include "clock_if.h" + +static __inline int leapyear(int year); +static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS); + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +static device_t clock_dev = NULL; +static long clock_res; + +int adjkerntz; /* local offset from GMT in seconds */ +int disable_rtc_set; /* disable resettodr() if != 0 */ +int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ + +/* + * These have traditionally been in machdep, but should probably be moved to + * kern. + */ +SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, + &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); + +SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, + CTLFLAG_RW, &disable_rtc_set, 0, ""); + +SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, + CTLFLAG_RW, &wall_cmos_clock, 0, ""); + +static int +sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, + req); + if (!error && req->newptr) + resettodr(); + return (error); +} + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static __inline int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + time_t secs; + int i, year, days; + + year = ct->year; + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + ct->year > 2037) /* time_t overflow */ + return (EINVAL); + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + /* Another sanity check. */ + if (ct->dow != -1 && ct->dow != day_of_week(days)) + return (EINVAL); + + /* Add hours, minutes, seconds. */ + secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec; + + ts->tv_sec = secs; + ts->tv_nsec = ct->nsec; + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. 
*/ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +} + +void +clock_register(device_t dev, long res) +{ + + if (clock_dev != NULL) { + if (clock_res > res) { + if (bootverbose) { + device_printf(dev, "not installed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(clock_dev)); + } + return; + } else { + if (bootverbose) { + device_printf(clock_dev, "removed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(dev)); + } + } + } + clock_dev = dev; + clock_res = res; + if (bootverbose) { + device_printf(dev, "registered as a time-of-day clock " + "(resolution %ldus)\n", res); + } +} + +/* + * inittodr and settodr derived from the i386 versions written + * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and + * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94 + */ + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + struct timespec diff, ref, ts; + int error; + + if (base) { + ref.tv_sec = base; + ref.tv_nsec = 0; + tc_setclock(&ref); + } + + if (clock_dev == NULL) { + printf("warning: no time-of-day clock registered, system time " + "will not be set accurately\n"); + return; + } + error = CLOCK_GETTIME(clock_dev, &ts); + if (error != 0 && error != EINVAL) { + printf("warning: clock_gettime failed (%d), the system time " + "will not be set accurately\n", error); + return; + } + if (error == EINVAL || ts.tv_sec < 0) { + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); + } + + ts.tv_sec += tz.tz_minuteswest * 60 + + (wall_cmos_clock ? adjkerntz : 0); + + if (timespeccmp(&ref, &ts, >)) { + diff = ref; + timespecsub(&ref, &ts); + } else { + diff = ts; + timespecsub(&diff, &ref); + } + if (ts.tv_sec >= 2) { + /* badly off, adjust it */ + tc_setclock(&ts); + } +} + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + struct timespec ts; + int error; + + if (disable_rtc_set || clock_dev == NULL) + return; + + getnanotime(&ts); + ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) { + printf("warning: clock_settime failed (%d), time-of-day clock " + "not adjusted to system time\n", error); + return; + } +} diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c new file mode 100644 index 0000000..6c910e6 --- /dev/null +++ b/sys/kern/subr_sbuf.c @@ -0,0 +1,560 @@ +/*- + * Copyright (c) 2000 Poul-Henning Kamp and Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> + +#ifdef _KERNEL +#include <sys/ctype.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <machine/stdarg.h> +#else /* _KERNEL */ +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif /* _KERNEL */ + +#include <sys/sbuf.h> + +#ifdef _KERNEL +MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); +#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK) +#define SBFREE(buf) free(buf, M_SBUF) +#else /* _KERNEL */ +#define KASSERT(e, m) +#define SBMALLOC(size) malloc(size) +#define SBFREE(buf) free(buf) +#define min(x,y) MIN(x,y) +#endif /* _KERNEL */ + +/* + * Predicates + */ +#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) +#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) +#define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) +#define SBUF_HASOVERFLOWED(s) ((s)->s_flags & SBUF_OVERFLOWED) +#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) +#define SBUF_FREESPACE(s) ((s)->s_size - (s)->s_len - 1) +#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) + +/* + * Set / clear flags + */ +#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0) +#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0) + +#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */ +#define SBUF_MAXEXTENDSIZE PAGE_SIZE +#define SBUF_MAXEXTENDINCR PAGE_SIZE + +/* + * Debugging support + */ +#if defined(_KERNEL) && defined(INVARIANTS) +static void +_assert_sbuf_integrity(const char *fun, struct sbuf *s) +{ + KASSERT(s != NULL, + ("%s called with a NULL sbuf pointer", fun)); + KASSERT(s->s_buf != NULL, + ("%s called with uninitialized or corrupt sbuf", fun)); + KASSERT(s->s_len < s->s_size, + ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); +} + +static void +_assert_sbuf_state(const char *fun, struct sbuf *s, int state) +{ + KASSERT((s->s_flags & SBUF_FINISHED) == state, + ("%s called with %sfinished or corrupt sbuf", fun, + (state ? "un" : ""))); +} +#define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s)) +#define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i)) +#else /* _KERNEL && INVARIANTS */ +#define assert_sbuf_integrity(s) do { } while (0) +#define assert_sbuf_state(s, i) do { } while (0) +#endif /* _KERNEL && INVARIANTS */ + +static int +sbuf_extendsize(int size) +{ + int newsize; + + newsize = SBUF_MINEXTENDSIZE; + while (newsize < size) { + if (newsize < SBUF_MAXEXTENDSIZE) + newsize *= 2; + else + newsize += SBUF_MAXEXTENDINCR; + } + + return (newsize); +} + + +/* + * Extend an sbuf. 
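+ * The buffer grows in powers of two up to SBUF_MAXEXTENDSIZE and in
+ * steps of SBUF_MAXEXTENDINCR beyond that (see sbuf_extendsize() above).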
+ */ +static int +sbuf_extend(struct sbuf *s, int addlen) +{ + char *newbuf; + int newsize; + + if (!SBUF_CANEXTEND(s)) + return (-1); + + newsize = sbuf_extendsize(s->s_size + addlen); + newbuf = (char *)SBMALLOC(newsize); + if (newbuf == NULL) + return (-1); + bcopy(s->s_buf, newbuf, s->s_size); + if (SBUF_ISDYNAMIC(s)) + SBFREE(s->s_buf); + else + SBUF_SETFLAG(s, SBUF_DYNAMIC); + s->s_buf = newbuf; + s->s_size = newsize; + return (0); +} + +/* + * Initialize an sbuf. + * If buf is non-NULL, it points to a static or already-allocated string + * big enough to hold at least length characters. + */ +struct sbuf * +sbuf_new(struct sbuf *s, char *buf, int length, int flags) +{ + KASSERT(length >= 0, + ("attempt to create an sbuf of negative length (%d)", length)); + KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, + ("%s called with invalid flags", __func__)); + + flags &= SBUF_USRFLAGMSK; + if (s == NULL) { + s = (struct sbuf *)SBMALLOC(sizeof *s); + if (s == NULL) + return (NULL); + bzero(s, sizeof *s); + s->s_flags = flags; + SBUF_SETFLAG(s, SBUF_DYNSTRUCT); + } else { + bzero(s, sizeof *s); + s->s_flags = flags; + } + s->s_size = length; + if (buf) { + s->s_buf = buf; + return (s); + } + if (flags & SBUF_AUTOEXTEND) + s->s_size = sbuf_extendsize(s->s_size); + s->s_buf = (char *)SBMALLOC(s->s_size); + if (s->s_buf == NULL) { + if (SBUF_ISDYNSTRUCT(s)) + SBFREE(s); + return (NULL); + } + SBUF_SETFLAG(s, SBUF_DYNAMIC); + return (s); +} + +#ifdef _KERNEL +/* + * Create an sbuf with uio data + */ +struct sbuf * +sbuf_uionew(struct sbuf *s, struct uio *uio, int *error) +{ + KASSERT(uio != NULL, + ("%s called with NULL uio pointer", __func__)); + KASSERT(error != NULL, + ("%s called with NULL error pointer", __func__)); + + s = sbuf_new(s, NULL, uio->uio_resid + 1, 0); + if (s == NULL) { + *error = ENOMEM; + return (NULL); + } + *error = uiomove(s->s_buf, uio->uio_resid, uio); + if (*error != 0) { + sbuf_delete(s); + return (NULL); + } + s->s_len = s->s_size - 1; + *error = 0; + return (s); +} +#endif + +/* + * Clear an sbuf and reset its position. + */ +void +sbuf_clear(struct sbuf *s) +{ + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + SBUF_CLEARFLAG(s, SBUF_FINISHED); + SBUF_CLEARFLAG(s, SBUF_OVERFLOWED); + s->s_len = 0; +} + +/* + * Set the sbuf's end position to an arbitrary value. + * Effectively truncates the sbuf at the new position. + */ +int +sbuf_setpos(struct sbuf *s, int pos) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + KASSERT(pos >= 0, + ("attempt to seek to a negative position (%d)", pos)); + KASSERT(pos < s->s_size, + ("attempt to seek past end of sbuf (%d >= %d)", pos, s->s_size)); + + if (pos < 0 || pos > s->s_len) + return (-1); + s->s_len = pos; + return (0); +} + +/* + * Append a byte string to an sbuf. + */ +int +sbuf_bcat(struct sbuf *s, const char *str, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + for (; len; len--) { + if (!SBUF_HASROOM(s) && sbuf_extend(s, len) < 0) + break; + s->s_buf[s->s_len++] = *str++; + } + if (len) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + return (0); +} + +#ifdef _KERNEL +/* + * Copy a byte string from userland into an sbuf. 
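+ * If the buffer cannot be extended far enough, the copy is silently
+ * truncated to the space that is available.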
+ */ +int +sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (len == 0) + return (0); + if (len > SBUF_FREESPACE(s)) { + sbuf_extend(s, len - SBUF_FREESPACE(s)); + len = min(len, SBUF_FREESPACE(s)); + } + if (copyin(uaddr, s->s_buf + s->s_len, len) != 0) + return (-1); + s->s_len += len; + + return (0); +} +#endif + +/* + * Copy a byte string into an sbuf. + */ +int +sbuf_bcpy(struct sbuf *s, const char *str, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + sbuf_clear(s); + return (sbuf_bcat(s, str, len)); +} + +/* + * Append a string to an sbuf. + */ +int +sbuf_cat(struct sbuf *s, const char *str) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + while (*str) { + if (!SBUF_HASROOM(s) && sbuf_extend(s, strlen(str)) < 0) + break; + s->s_buf[s->s_len++] = *str++; + } + if (*str) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + return (0); +} + +#ifdef _KERNEL +/* + * Append a string from userland to an sbuf. + */ +int +sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len) +{ + size_t done; + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (len == 0) + len = SBUF_FREESPACE(s); /* XXX return 0? */ + if (len > SBUF_FREESPACE(s)) { + sbuf_extend(s, len); + len = min(len, SBUF_FREESPACE(s)); + } + switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) { + case ENAMETOOLONG: + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + /* fall through */ + case 0: + s->s_len += done - 1; + break; + default: + return (-1); /* XXX */ + } + + return (0); +} +#endif + +/* + * Copy a string into an sbuf. + */ +int +sbuf_cpy(struct sbuf *s, const char *str) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + sbuf_clear(s); + return (sbuf_cat(s, str)); +} + +/* + * Format the given argument list and append the resulting string to an sbuf. + */ +int +sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) +{ + int len; + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + KASSERT(fmt != NULL, + ("%s called with a NULL format string", __func__)); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + do { + len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, + fmt, ap); + } while (len > SBUF_FREESPACE(s) && + sbuf_extend(s, len - SBUF_FREESPACE(s)) == 0); + + /* + * s->s_len is the length of the string, without the terminating nul. + * When updating s->s_len, we must subtract 1 from the length that + * we passed into vsnprintf() because that length includes the + * terminating nul. + * + * vsnprintf() returns the amount that would have been copied, + * given sufficient space, hence the min() calculation below. + */ + s->s_len += min(len, SBUF_FREESPACE(s)); + if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s)) + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + + KASSERT(s->s_len < s->s_size, + ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + return (0); +} + +/* + * Format the given arguments and append the resulting string to an sbuf. + */ +int +sbuf_printf(struct sbuf *s, const char *fmt, ...) +{ + va_list ap; + int result; + + va_start(ap, fmt); + result = sbuf_vprintf(s, fmt, ap); + va_end(ap); + return(result); +} + +/* + * Append a character to an sbuf. 
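+ * A NUL character is accepted but not stored.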
+ */ +int +sbuf_putc(struct sbuf *s, int c) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (!SBUF_HASROOM(s) && sbuf_extend(s, 1) < 0) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + if (c != '\0') + s->s_buf[s->s_len++] = c; + return (0); +} + +/* + * Trim whitespace characters from end of an sbuf. + */ +int +sbuf_trim(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + while (s->s_len && isspace(s->s_buf[s->s_len-1])) + --s->s_len; + + return (0); +} + +/* + * Check if an sbuf overflowed + */ +int +sbuf_overflowed(struct sbuf *s) +{ + return SBUF_HASOVERFLOWED(s); +} + +/* + * Finish off an sbuf. + */ +void +sbuf_finish(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + s->s_buf[s->s_len] = '\0'; + SBUF_CLEARFLAG(s, SBUF_OVERFLOWED); + SBUF_SETFLAG(s, SBUF_FINISHED); +} + +/* + * Return a pointer to the sbuf data. + */ +char * +sbuf_data(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, SBUF_FINISHED); + + return s->s_buf; +} + +/* + * Return the length of the sbuf data. + */ +int +sbuf_len(struct sbuf *s) +{ + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + return s->s_len; +} + +/* + * Clear an sbuf, free its buffer if necessary. + */ +void +sbuf_delete(struct sbuf *s) +{ + int isdyn; + + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + if (SBUF_ISDYNAMIC(s)) + SBFREE(s->s_buf); + isdyn = SBUF_ISDYNSTRUCT(s); + bzero(s, sizeof *s); + if (isdyn) + SBFREE(s); +} diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c new file mode 100644 index 0000000..13f02b8 --- /dev/null +++ b/sys/kern/subr_scanf.c @@ -0,0 +1,628 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp + * From: static char sccsid[] = "@(#)strtol.c 8.1 (Berkeley) 6/4/93"; + * From: static char sccsid[] = "@(#)strtoul.c 8.1 (Berkeley) 6/4/93"; + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ctype.h> +#include <machine/limits.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define BUF 32 /* Maximum length of numeric string. */ + +/* + * Flags used during conversion. + */ +#define LONG 0x01 /* l: long or double */ +#define SHORT 0x04 /* h: short */ +#define SUPPRESS 0x08 /* suppress assignment */ +#define POINTER 0x10 /* weird %p pointer (`fake hex') */ +#define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 + +/* + * The following are used in numeric conversions only: + * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point; + * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral. + */ +#define SIGNOK 0x40 /* +/- is (still) legal */ +#define NDIGITS 0x80 /* no digits detected */ + +#define DPTOK 0x100 /* (float) decimal point is still legal */ +#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */ + +#define PFXOK 0x100 /* 0x prefix is (still) legal */ +#define NZDIGITS 0x200 /* no zero digits detected */ + +/* + * Conversion types. + */ +#define CT_CHAR 0 /* %c conversion */ +#define CT_CCL 1 /* %[...] conversion */ +#define CT_STRING 2 /* %s conversion */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ +typedef u_quad_t (*ccfntype)(const char *, char **, int); + +static const u_char *__sccl(char *, const u_char *); + +int +sscanf(const char *ibuf, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vsscanf(ibuf, fmt, ap); + va_end(ap); + return(ret); +} + +int +vsscanf(const char *inp, char const *fmt0, va_list ap) +{ + int inr; + const u_char *fmt = (const u_char *)fmt0; + int c; /* character from format, or conversion */ + size_t width; /* field width, or 0 */ + char *p; /* points into all kinds of strings */ + int n; /* handy integer */ + int flags; /* flags as defined above */ + char *p0; /* saves original value of p when necessary */ + int nassigned; /* number of fields assigned */ + int nconversions; /* number of conversions */ + int nread; /* number of characters consumed from fp */ + int base; /* base argument to strtoq/strtouq */ + ccfntype ccfn; /* conversion function (strtoq/strtouq) */ + char ccltab[256]; /* character class table for %[...] 
*/ + char buf[BUF]; /* buffer for numeric conversions */ + + /* `basefix' is used to avoid `if' tests in the integer scanner */ + static short basefix[17] = + { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + inr = strlen(inp); + + nassigned = 0; + nconversions = 0; + nread = 0; + base = 0; /* XXX just to keep gcc happy */ + ccfn = NULL; /* XXX just to keep gcc happy */ + for (;;) { + c = *fmt++; + if (c == 0) + return (nassigned); + if (isspace(c)) { + while (inr > 0 && isspace(*inp)) + nread++, inr--, inp++; + continue; + } + if (c != '%') + goto literal; + width = 0; + flags = 0; + /* + * switch on the format. continue if done; + * break once format type is derived. + */ +again: c = *fmt++; + switch (c) { + case '%': +literal: + if (inr <= 0) + goto input_failure; + if (*inp != c) + goto match_failure; + inr--, inp++; + nread++; + continue; + + case '*': + flags |= SUPPRESS; + goto again; + case 'l': + flags |= LONG; + goto again; + case 'q': + flags |= QUAD; + goto again; + case 'h': + flags |= SHORT; + goto again; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + width = width * 10 + c - '0'; + goto again; + + /* + * Conversions. + * + */ + case 'd': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 10; + break; + + case 'i': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 0; + break; + + case 'o': + c = CT_INT; + ccfn = strtouq; + base = 8; + break; + + case 'u': + c = CT_INT; + ccfn = strtouq; + base = 10; + break; + + case 'x': + flags |= PFXOK; /* enable 0x prefixing */ + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 's': + c = CT_STRING; + break; + + case '[': + fmt = __sccl(ccltab, fmt); + flags |= NOSKIP; + c = CT_CCL; + break; + + case 'c': + flags |= NOSKIP; + c = CT_CHAR; + break; + + case 'p': /* pointer format is like hex */ + flags |= POINTER | PFXOK; + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 'n': + nconversions++; + if (flags & SUPPRESS) /* ??? */ + continue; + if (flags & SHORT) + *va_arg(ap, short *) = nread; + else if (flags & LONG) + *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; + else + *va_arg(ap, int *) = nread; + continue; + } + + /* + * We have a conversion that requires input. + */ + if (inr <= 0) + goto input_failure; + + /* + * Consume leading white space, except for formats + * that suppress this. + */ + if ((flags & NOSKIP) == 0) { + while (isspace(*inp)) { + nread++; + if (--inr > 0) + inp++; + else + goto input_failure; + } + /* + * Note that there is at least one character in + * the buffer, so conversions that do not set NOSKIP + * can no longer result in an input failure. + */ + } + + /* + * Do the conversion. 
+ */ + switch (c) { + + case CT_CHAR: + /* scan arbitrary characters (sets NOSKIP) */ + if (width == 0) + width = 1; + if (flags & SUPPRESS) { + size_t sum = 0; + for (;;) { + if ((n = inr) < width) { + sum += n; + width -= n; + inp += n; + if (sum == 0) + goto input_failure; + break; + } else { + sum += width; + inr -= width; + inp += width; + break; + } + } + nread += sum; + } else { + bcopy(inp, va_arg(ap, char *), width); + inr -= width; + inp += width; + nread += width; + nassigned++; + } + nconversions++; + break; + + case CT_CCL: + /* scan a (nonempty) character class (sets NOSKIP) */ + if (width == 0) + width = (size_t)~0; /* `infinity' */ + /* take only those things in the class */ + if (flags & SUPPRESS) { + n = 0; + while (ccltab[(unsigned char)*inp]) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (n == 0) + goto input_failure; + break; + } + } + if (n == 0) + goto match_failure; + } else { + p0 = p = va_arg(ap, char *); + while (ccltab[(unsigned char)*inp]) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (p == p0) + goto input_failure; + break; + } + } + n = p - p0; + if (n == 0) + goto match_failure; + *p = 0; + nassigned++; + } + nread += n; + nconversions++; + break; + + case CT_STRING: + /* like CCL, but zero-length string OK, & no NOSKIP */ + if (width == 0) + width = (size_t)~0; + if (flags & SUPPRESS) { + n = 0; + while (!isspace(*inp)) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + nread += n; + } else { + p0 = p = va_arg(ap, char *); + while (!isspace(*inp)) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + *p = 0; + nread += p - p0; + nassigned++; + } + nconversions++; + continue; + + case CT_INT: + /* scan an integer as if by strtoq/strtouq */ +#ifdef hardway + if (width == 0 || width > sizeof(buf) - 1) + width = sizeof(buf) - 1; +#else + /* size_t is unsigned, hence this optimisation */ + if (--width > sizeof(buf) - 2) + width = sizeof(buf) - 2; + width++; +#endif + flags |= SIGNOK | NDIGITS | NZDIGITS; + for (p = buf; width; width--) { + c = *inp; + /* + * Switch on the character; `goto ok' + * if we accept it as a part of number. + */ + switch (c) { + + /* + * The digit 0 is always legal, but is + * special. For %i conversions, if no + * digits (zero or nonzero) have been + * scanned (only signs), we will have + * base==0. In that case, we should set + * it to 8 and enable 0x prefixing. + * Also, if we have not scanned zero digits + * before this, do not turn off prefixing + * (someone else will turn it off if we + * have scanned any nonzero digits). 
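+ * For example, with %i the input `0x1f' selects base 8
+ * when the leading `0' is seen and then switches to
+ * base 16 when the `x' is accepted.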
+ */ + case '0': + if (base == 0) { + base = 8; + flags |= PFXOK; + } + if (flags & NZDIGITS) + flags &= ~(SIGNOK|NZDIGITS|NDIGITS); + else + flags &= ~(SIGNOK|PFXOK|NDIGITS); + goto ok; + + /* 1 through 7 always legal */ + case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + base = basefix[base]; + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* digits 8 and 9 ok iff decimal or hex */ + case '8': case '9': + base = basefix[base]; + if (base <= 8) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* letters ok iff hex */ + case 'A': case 'B': case 'C': + case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': + case 'd': case 'e': case 'f': + /* no need to fix base here */ + if (base <= 10) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* sign ok only as first character */ + case '+': case '-': + if (flags & SIGNOK) { + flags &= ~SIGNOK; + goto ok; + } + break; + + /* x ok iff flag still set & 2nd char */ + case 'x': case 'X': + if (flags & PFXOK && p == buf + 1) { + base = 16; /* if %i */ + flags &= ~PFXOK; + goto ok; + } + break; + } + + /* + * If we got here, c is not a legal character + * for a number. Stop accumulating digits. + */ + break; + ok: + /* + * c is legal: store it and look at the next. + */ + *p++ = c; + if (--inr > 0) + inp++; + else + break; /* end of input */ + } + /* + * If we had only a sign, it is no good; push + * back the sign. If the number ends in `x', + * it was [sign] '0' 'x', so push back the x + * and treat it as [sign] '0'. + */ + if (flags & NDIGITS) { + if (p > buf) { + inp--; + inr++; + } + goto match_failure; + } + c = ((u_char *)p)[-1]; + if (c == 'x' || c == 'X') { + --p; + inp--; + inr++; + } + if ((flags & SUPPRESS) == 0) { + u_quad_t res; + + *p = 0; + res = (*ccfn)(buf, (char **)NULL, base); + if (flags & POINTER) + *va_arg(ap, void **) = + (void *)(uintptr_t)res; + else if (flags & SHORT) + *va_arg(ap, short *) = res; + else if (flags & LONG) + *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; + else + *va_arg(ap, int *) = res; + nassigned++; + } + nread += p - buf; + nconversions++; + break; + + } + } +input_failure: + return (nconversions != 0 ? nassigned : -1); +match_failure: + return (nassigned); +} + +/* + * Fill in the given table from the scanset at the given format + * (just after `['). Return a pointer to the character past the + * closing `]'. The table has a 1 wherever characters should be + * considered part of the scanset. + */ +static const u_char * +__sccl(char *tab, const u_char *fmt) +{ + int c, n, v; + + /* first `clear' the whole table */ + c = *fmt++; /* first char hat => negated scanset */ + if (c == '^') { + v = 1; /* default => accept */ + c = *fmt++; /* get new first char */ + } else + v = 0; /* default => reject */ + + /* XXX: Will not work if sizeof(tab*) > sizeof(char) */ + for (n = 0; n < 256; n++) + tab[n] = v; /* memset(tab, v, 256) */ + + if (c == 0) + return (fmt - 1);/* format ended before closing ] */ + + /* + * Now set the entries corresponding to the actual scanset + * to the opposite of the above. + * + * The first character may be ']' (or '-') without being special; + * the last character may be '-'. 
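+ * For example, the scanset `[^]0-9-]' matches any character except
+ * `]', the digits `0' through `9' and `-'.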
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+ * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
+
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..9dad93b
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2001
+ * John Baldwin <jhb@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This module holds the global variables and machine independent functions
+ * used for the kernel SMP support.
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <machine/smp.h> + +volatile u_int stopped_cpus; +volatile u_int started_cpus; + +void (*cpustop_restartfunc)(void); +int mp_ncpus; + +volatile int smp_started; +u_int all_cpus; +u_int mp_maxid; + +SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD, NULL, "Kernel SMP"); + +int smp_active = 0; /* are the APs allowed to run? */ +SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0, ""); + +int smp_cpus = 1; /* how many cpu's running */ +SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, ""); + +/* Enable forwarding of a signal to a process running on a different CPU */ +static int forward_signal_enabled = 1; +SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, + &forward_signal_enabled, 0, ""); + +/* Enable forwarding of roundrobin to all other cpus */ +static int forward_roundrobin_enabled = 1; +SYSCTL_INT(_kern_smp, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, + &forward_roundrobin_enabled, 0, ""); + +/* Variables needed for SMP rendezvous. */ +static void (*smp_rv_setup_func)(void *arg); +static void (*smp_rv_action_func)(void *arg); +static void (*smp_rv_teardown_func)(void *arg); +static void *smp_rv_func_arg; +static volatile int smp_rv_waiters[2]; +static struct mtx smp_rv_mtx; +static int mp_probe_status; + +/* + * Initialize MI SMP variables. + */ +static void +mp_probe(void *dummy) +{ + mp_probe_status = cpu_mp_probe(); +} +SYSINIT(cpu_mp_probe, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_probe, NULL) + +/* + * Call the MD SMP initialization code. + */ +static void +mp_start(void *dummy) +{ + + /* Probe for MP hardware. */ + if (mp_probe_status == 0) + return; + + mtx_init(&smp_rv_mtx, "smp rendezvous", NULL, MTX_SPIN); + cpu_mp_start(); + printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", + mp_ncpus); + cpu_mp_announce(); +} +SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_SECOND, mp_start, NULL) + +void +forward_signal(struct thread *td) +{ + int id; + + /* + * signotify() has already set KEF_ASTPENDING and PS_NEEDSIGCHECK on + * this process, so all we need to do is poke it if it is currently + * executing so that it executes ast(). + */ + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(td->td_proc->p_stat == SRUN, + ("forward_signal: process is not SRUN")); + + CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); + + if (!smp_started || cold || panicstr) + return; + if (!forward_signal_enabled) + return; + + /* No need to IPI ourself. */ + if (td == curthread) + return; + + id = td->td_kse->ke_oncpu; + if (id == NOCPU) + return; + ipi_selected(1 << id, IPI_AST); +} + +void +forward_roundrobin(void) +{ + struct pcpu *pc; + struct thread *td; + u_int id, map; + + mtx_assert(&sched_lock, MA_OWNED); + + CTR0(KTR_SMP, "forward_roundrobin()"); + + if (!smp_started || cold || panicstr) + return; + if (!forward_roundrobin_enabled) + return; + map = 0; + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + td = pc->pc_curthread; + id = pc->pc_cpumask; + if (id != PCPU_GET(cpumask) && (id & stopped_cpus) == 0 && + td != pc->pc_idlethread) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + map |= id; + } + } + ipi_selected(map, IPI_AST); +} + +/* + * When called the executing CPU will send an IPI to all other CPUs + * requesting that they halt execution. 
+ * + * Usually (but not necessarily) called with 'other_cpus' as its arg. + * + * - Signals all CPUs in map to stop. + * - Waits for each to stop. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + * + * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs + * from executing at same time. + */ +int +stop_cpus(u_int map) +{ + int i; + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "stop_cpus(%x)", map); + + /* send the stop IPI to all CPUs in map */ + ipi_selected(map, IPI_STOP); + + i = 0; + while ((atomic_load_acq_int(&stopped_cpus) & map) != map) { + /* spin */ + i++; +#ifdef DIAGNOSTIC + if (i == 100000) { + printf("timeout stopping cpus\n"); + break; + } +#endif + } + + return 1; +} + + +/* + * Called by a CPU to restart stopped CPUs. + * + * Usually (but not necessarily) called with 'stopped_cpus' as its arg. + * + * - Signals all CPUs in map to restart. + * - Waits for each to restart. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + */ +int +restart_cpus(u_int map) +{ + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "restart_cpus(%x)", map); + + /* signal other cpus to restart */ + atomic_store_rel_int(&started_cpus, map); + + /* wait for each to clear its bit */ + while ((atomic_load_acq_int(&stopped_cpus) & map) != 0) + ; /* nothing */ + + return 1; +} + +/* + * All-CPU rendezvous. CPUs are signalled, all execute the setup function + * (if specified), rendezvous, execute the action function (if specified), + * rendezvous again, execute the teardown function (if specified), and then + * resume. + * + * Note that the supplied external functions _must_ be reentrant and aware + * that they are running in parallel and in an unknown lock context. + */ +void +smp_rendezvous_action(void) +{ + + /* setup function */ + if (smp_rv_setup_func != NULL) + smp_rv_setup_func(smp_rv_func_arg); + /* spin on entry rendezvous */ + atomic_add_int(&smp_rv_waiters[0], 1); + while (atomic_load_acq_int(&smp_rv_waiters[0]) < mp_ncpus) + ; /* nothing */ + /* action function */ + if (smp_rv_action_func != NULL) + smp_rv_action_func(smp_rv_func_arg); + /* spin on exit rendezvous */ + atomic_add_int(&smp_rv_waiters[1], 1); + while (atomic_load_acq_int(&smp_rv_waiters[1]) < mp_ncpus) + ; /* nothing */ + /* teardown function */ + if (smp_rv_teardown_func != NULL) + smp_rv_teardown_func(smp_rv_func_arg); +} + +void +smp_rendezvous(void (* setup_func)(void *), + void (* action_func)(void *), + void (* teardown_func)(void *), + void *arg) +{ + + if (!smp_started) { + if (setup_func != NULL) + setup_func(arg); + if (action_func != NULL) + action_func(arg); + if (teardown_func != NULL) + teardown_func(arg); + return; + } + + /* obtain rendezvous lock */ + mtx_lock_spin(&smp_rv_mtx); + + /* set static function pointers */ + smp_rv_setup_func = setup_func; + smp_rv_action_func = action_func; + smp_rv_teardown_func = teardown_func; + smp_rv_func_arg = arg; + smp_rv_waiters[0] = 0; + smp_rv_waiters[1] = 0; + + /* signal other processors, which will enter the IPI with interrupts off */ + ipi_all_but_self(IPI_RENDEZVOUS); + + /* call executor function */ + smp_rendezvous_action(); + + /* release lock */ + mtx_unlock_spin(&smp_rv_mtx); +} diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c new file mode 100644 index 0000000..19a93ad --- /dev/null +++ b/sys/kern/subr_taskqueue.c @@ -0,0 +1,223 @@ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/taskqueue.h> + +static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues"); + +static STAILQ_HEAD(taskqueue_list, taskqueue) taskqueue_queues; + +static void *taskqueue_ih; +static struct mtx taskqueue_queues_mutex; + +struct taskqueue { + STAILQ_ENTRY(taskqueue) tq_link; + STAILQ_HEAD(, task) tq_queue; + const char *tq_name; + taskqueue_enqueue_fn tq_enqueue; + void *tq_context; + int tq_draining; + struct mtx tq_mutex; +}; + +static void init_taskqueue_list(void *data); + +static void +init_taskqueue_list(void *data __unused) +{ + + mtx_init(&taskqueue_queues_mutex, "taskqueue list", NULL, MTX_DEF); + STAILQ_INIT(&taskqueue_queues); +} +SYSINIT(taskqueue_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_taskqueue_list, + NULL); + +struct taskqueue * +taskqueue_create(const char *name, int mflags, + taskqueue_enqueue_fn enqueue, void *context) +{ + struct taskqueue *queue; + + queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO); + if (!queue) + return 0; + + STAILQ_INIT(&queue->tq_queue); + queue->tq_name = name; + queue->tq_enqueue = enqueue; + queue->tq_context = context; + queue->tq_draining = 0; + mtx_init(&queue->tq_mutex, "taskqueue", NULL, MTX_DEF); + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_INSERT_TAIL(&taskqueue_queues, queue, tq_link); + mtx_unlock(&taskqueue_queues_mutex); + + return queue; +} + +void +taskqueue_free(struct taskqueue *queue) +{ + + mtx_lock(&queue->tq_mutex); + queue->tq_draining = 1; + mtx_unlock(&queue->tq_mutex); + + taskqueue_run(queue); + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_REMOVE(&taskqueue_queues, queue, taskqueue, tq_link); + mtx_unlock(&taskqueue_queues_mutex); + + mtx_destroy(&queue->tq_mutex); + free(queue, M_TASKQUEUE); +} + +/* + * Returns with the taskqueue locked. 
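A minimal sketch of a consumer of this interface, using the pre-defined software-interrupt queue set up at the end of this file (function and variable names are illustrative; TASK_INIT() is the task initializer macro from <sys/taskqueue.h>):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

static struct task example_task;

static void
example_task_fn(void *context, int pending)
{

	/* 'pending' is how many enqueues were coalesced before we ran. */
	printf("deferred work ran, pending=%d\n", pending);
}

static void
example_defer_work(void)
{

	TASK_INIT(&example_task, 0, example_task_fn, NULL);
	taskqueue_enqueue(taskqueue_swi, &example_task);
}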
+ */ +struct taskqueue * +taskqueue_find(const char *name) +{ + struct taskqueue *queue; + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_FOREACH(queue, &taskqueue_queues, tq_link) { + mtx_lock(&queue->tq_mutex); + if (!strcmp(queue->tq_name, name)) { + mtx_unlock(&taskqueue_queues_mutex); + return queue; + } + mtx_unlock(&queue->tq_mutex); + } + mtx_unlock(&taskqueue_queues_mutex); + return 0; +} + +int +taskqueue_enqueue(struct taskqueue *queue, struct task *task) +{ + struct task *ins; + struct task *prev; + + mtx_lock(&queue->tq_mutex); + + /* + * Don't allow new tasks on a queue which is being freed. + */ + if (queue->tq_draining) { + mtx_unlock(&queue->tq_mutex); + return EPIPE; + } + + /* + * Count multiple enqueues. + */ + if (task->ta_pending) { + task->ta_pending++; + mtx_unlock(&queue->tq_mutex); + return 0; + } + + /* + * Optimise the case when all tasks have the same priority. + */ + prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); + if (!prev || prev->ta_priority >= task->ta_priority) { + STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); + } else { + prev = 0; + for (ins = STAILQ_FIRST(&queue->tq_queue); ins; + prev = ins, ins = STAILQ_NEXT(ins, ta_link)) + if (ins->ta_priority < task->ta_priority) + break; + + if (prev) + STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); + else + STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); + } + + task->ta_pending = 1; + if (queue->tq_enqueue) + queue->tq_enqueue(queue->tq_context); + + mtx_unlock(&queue->tq_mutex); + + return 0; +} + +void +taskqueue_run(struct taskqueue *queue) +{ + struct task *task; + int pending; + + mtx_lock(&queue->tq_mutex); + while (STAILQ_FIRST(&queue->tq_queue)) { + /* + * Carefully remove the first task from the queue and + * zero its pending count. + */ + task = STAILQ_FIRST(&queue->tq_queue); + STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); + pending = task->ta_pending; + task->ta_pending = 0; + mtx_unlock(&queue->tq_mutex); + + task->ta_func(task->ta_context, pending); + + mtx_lock(&queue->tq_mutex); + } + mtx_unlock(&queue->tq_mutex); +} + +static void +taskqueue_swi_enqueue(void *context) +{ + swi_sched(taskqueue_ih, 0); +} + +static void +taskqueue_swi_run(void *dummy) +{ + taskqueue_run(taskqueue_swi); +} + +TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, 0, + swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, 0, + &taskqueue_ih)); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c new file mode 100644 index 0000000..3b415de --- /dev/null +++ b/sys/kern/subr_trap.c @@ -0,0 +1,209 @@ +/*- + * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the University of Utah, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $FreeBSD$ + */ + +#ifdef __i386__ +#include "opt_npx.h" +#endif + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/systm.h> +#include <sys/vmmeter.h> +#include <machine/cpu.h> +#include <machine/pcb.h> + +/* + * Define the code needed before returning to user mode, for + * trap and syscall. + * + * MPSAFE + */ +void +userret(td, frame, oticks) + struct thread *td; + struct trapframe *frame; + u_int oticks; +{ + struct proc *p = td->td_proc; + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + +#ifdef INVARIANTS + /* Check that we called signotify() enough. */ + mtx_lock(&Giant); + PROC_LOCK(p); + mtx_lock_spin(&sched_lock); + if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 || + (p->p_kse.ke_flags & KEF_ASTPENDING) == 0)) + printf("failed to set signal flags proprly for ast()\n"); + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + mtx_unlock(&Giant); +#endif + + /* + * XXX we cheat slightly on the locking here to avoid locking in + * the usual case. Setting td_priority here is essentially an + * incomplete workaround for not setting it properly elsewhere. + * Now that some interrupt handlers are threads, not setting it + * properly elsewhere can clobber it in the window between setting + * it here and returning to user mode, so don't waste time setting + * it perfectly here. + */ + if (td->td_priority != kg->kg_user_pri) { + mtx_lock_spin(&sched_lock); + td->td_priority = kg->kg_user_pri; + mtx_unlock_spin(&sched_lock); + } + + /* + * Charge system time if profiling. + * + * XXX should move PS_PROFIL to a place that can obviously be + * accessed safely without sched_lock. + */ + if (p->p_sflag & PS_PROFIL) { + quad_t ticks; + + mtx_lock_spin(&sched_lock); + ticks = ke->ke_sticks - oticks; + mtx_unlock_spin(&sched_lock); + addupc_task(ke, TRAPF_PC(frame), (u_int)ticks * psratio); + } +} + +/* + * Process an asynchronous software trap. + * This is relatively easy. + * This function will return with preemption disabled. 
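For orientation, the calling convention amounts to roughly the following, although the real check lives in machine-dependent assembly in the trap and interrupt return paths (a pseudo-C sketch only):

	/* On the way back to user mode, with interrupts disabled: */
	while (curthread->td_kse->ke_flags &
	    (KEF_ASTPENDING | KEF_NEEDRESCHED))
		ast(framep);	/* may switch; flags are re-checked after */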
+ */ +void +ast(framep) + struct trapframe *framep; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + u_int prticks, sticks; + int sflag; + int flags; + int sig; +#if defined(DEV_NPX) && !defined(SMP) + int ucode; +#endif + + KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); +#ifdef WITNESS + if (witness_list(td)) + panic("Returning to user mode with mutex(s) held"); +#endif + mtx_assert(&Giant, MA_NOTOWNED); + mtx_assert(&sched_lock, MA_NOTOWNED); + prticks = 0; /* XXX: Quiet warning. */ + td->td_frame = framep; + /* + * This updates the p_sflag's for the checks below in one + * "atomic" operation with turning off the astpending flag. + * If another AST is triggered while we are handling the + * AST's saved in sflag, the astpending flag will be set and + * ast() will be called again. + */ + mtx_lock_spin(&sched_lock); + sticks = ke->ke_sticks; + sflag = p->p_sflag; + flags = ke->ke_flags; + p->p_sflag &= ~(PS_ALRMPEND | PS_NEEDSIGCHK | PS_PROFPEND); + ke->ke_flags &= ~(KEF_ASTPENDING | KEF_NEEDRESCHED | KEF_OWEUPC); + cnt.v_soft++; + if (flags & KEF_OWEUPC && sflag & PS_PROFIL) { + prticks = p->p_stats->p_prof.pr_ticks; + p->p_stats->p_prof.pr_ticks = 0; + } + mtx_unlock_spin(&sched_lock); + + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + if (flags & KEF_OWEUPC && sflag & PS_PROFIL) + addupc_task(ke, p->p_stats->p_prof.pr_addr, prticks); + if (sflag & PS_ALRMPEND) { + PROC_LOCK(p); + psignal(p, SIGVTALRM); + PROC_UNLOCK(p); + } +#if defined(DEV_NPX) && !defined(SMP) + if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { + atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, + PCB_NPXTRAP); + ucode = npxtrap(); + if (ucode != -1) { + trapsignal(p, SIGFPE, ucode); + } + } +#endif + if (sflag & PS_PROFPEND) { + PROC_LOCK(p); + psignal(p, SIGPROF); + PROC_UNLOCK(p); + } + if (flags & KEF_NEEDRESCHED) { + mtx_lock_spin(&sched_lock); + td->td_priority = kg->kg_user_pri; + setrunqueue(td); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } + if (sflag & PS_NEEDSIGCHK) { + PROC_LOCK(p); + while ((sig = cursig(p)) != 0) + postsig(sig); + PROC_UNLOCK(p); + } + + userret(td, framep, sticks); + mtx_assert(&Giant, MA_NOTOWNED); +} diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c new file mode 100644 index 0000000..08bca8d --- /dev/null +++ b/sys/kern/subr_turnstile.c @@ -0,0 +1,986 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Machine independent bits of mutex implementation. + */ + +#include "opt_adaptive_mutexes.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sbuf.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> + +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/clock.h> +#include <machine/cpu.h> + +#include <ddb/ddb.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +/* + * Internal utility macros. + */ +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) + +#define mtx_owner(m) (mtx_unowned((m)) ? NULL \ + : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK)) + +/* XXXKSE This test will change. */ +#define thread_running(td) \ + ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU) + +/* + * Lock classes for sleep and spin mutexes. + */ +struct lock_class lock_class_mtx_sleep = { + "sleep mutex", + LC_SLEEPLOCK | LC_RECURSABLE +}; +struct lock_class lock_class_mtx_spin = { + "spin mutex", + LC_SPINLOCK | LC_RECURSABLE +}; + +/* + * System-wide mutexes + */ +struct mtx sched_lock; +struct mtx Giant; + +/* + * Prototypes for non-exported routines. + */ +static void propagate_priority(struct thread *); + +static void +propagate_priority(struct thread *td) +{ + int pri = td->td_priority; + struct mtx *m = td->td_blocked; + + mtx_assert(&sched_lock, MA_OWNED); + for (;;) { + struct thread *td1; + + td = mtx_owner(m); + + if (td == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of thread that + * next acquires the mutex. + */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + + MPASS(td->td_proc->p_magic == P_MAGIC); + KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + if (td->td_priority <= pri) /* lower is higher priority */ + return; + + /* + * Bump this thread's priority. + */ + td->td_priority = pri; + + /* + * If lock holder is actually running, just bump priority. + */ + if (thread_running(td)) { + MPASS(td->td_proc->p_stat == SRUN + || td->td_proc->p_stat == SZOMB + || td->td_proc->p_stat == SSTOP); + return; + } + +#ifndef SMP + /* + * For UP, we check to see if td is curthread (this shouldn't + * ever happen however as it would mean we are in a deadlock.) + */ + KASSERT(td != curthread, ("Deadlock detected")); +#endif + + /* + * If on run queue move to new run queue, and quit. + * XXXKSE this gets a lot more complicated under threads + * but try anyhow. + */ + if (td->td_proc->p_stat == SRUN) { + MPASS(td->td_blocked == NULL); + remrunqueue(td); + setrunqueue(td); + return; + } + + /* + * If we aren't blocked on a mutex, we should be. 
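To make the loop above concrete (priority values are illustrative; as noted in the code, a lower number means a higher priority): suppose thread C (priority 120) owns mutex M1 and is itself blocked on M2, which is owned by thread D (priority 140). When thread B (priority 80) blocks on M1, propagate_priority() bumps C to 80; because C is blocked rather than running or runnable, the loop then follows C's td_blocked pointer to M2 and bumps D to 80 as well, re-sorting each mutex's blocked queue as it goes.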
+ */ + KASSERT(td->td_proc->p_stat == SMTX, ( + "process %d(%s):%d holds %s but isn't blocked on a mutex\n", + td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + m->mtx_object.lo_name)); + + /* + * Pick up the mutex that td is blocked on. + */ + m = td->td_blocked; + MPASS(m != NULL); + + /* + * Check if the thread needs to be moved up on + * the blocked chain + */ + if (td == TAILQ_FIRST(&m->mtx_blocked)) { + continue; + } + + td1 = TAILQ_PREV(td, threadqueue, td_blkq); + if (td1->td_priority <= pri) { + continue; + } + + /* + * Remove thread from blocked chain and determine where + * it should be moved up to. Since we know that td1 has + * a lower priority than td, we know that at least one + * thread in the chain has a lower priority and that + * td1 will thus not be NULL after the loop. + */ + TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq); + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) { + MPASS(td1->td_proc->p_magic == P_MAGIC); + if (td1->td_priority > pri) + break; + } + + MPASS(td1 != NULL); + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + CTR4(KTR_LOCK, + "propagate_priority: p %p moved before %p on [%p] %s", + td, td1, m, m->mtx_object.lo_name); + } +} + +#ifdef MUTEX_PROFILING +SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging"); +SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling"); +static int mutex_prof_enable = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW, + &mutex_prof_enable, 0, "Enable tracing of mutex holdtime"); + +struct mutex_prof { + const char *name; + const char *file; + int line; +#define MPROF_MAX 0 +#define MPROF_TOT 1 +#define MPROF_CNT 2 +#define MPROF_AVG 3 + uintmax_t counter[4]; + struct mutex_prof *next; +}; + +/* + * mprof_buf is a static pool of profiling records to avoid possible + * reentrance of the memory allocation functions. + * + * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE. + */ +#define NUM_MPROF_BUFFERS 1000 +static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS]; +static int first_free_mprof_buf; +#define MPROF_HASH_SIZE 1009 +static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE]; + +static int mutex_prof_acquisitions; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD, + &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded"); +static int mutex_prof_records; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD, + &mutex_prof_records, 0, "Number of profiling records"); +static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD, + &mutex_prof_maxrecords, 0, "Maximum number of profiling records"); +static int mutex_prof_rejected; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD, + &mutex_prof_rejected, 0, "Number of rejected profiling records"); +static int mutex_prof_hashsize = MPROF_HASH_SIZE; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD, + &mutex_prof_hashsize, 0, "Hash size"); +static int mutex_prof_collisions = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD, + &mutex_prof_collisions, 0, "Number of hash collisions"); + +/* + * mprof_mtx protects the profiling buffers and the hash. 
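In practice, on a kernel compiled with MUTEX_PROFILING the knobs declared above show up under debug.mutex.prof: setting debug.mutex.prof.enable to 1 starts recording, and reading debug.mutex.prof.stats (the sysctl handler defined just below) dumps the per-acquisition-point max/total/count/average hold times, scaled from nanoseconds to microseconds.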
+ */ +static struct mtx mprof_mtx; +MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET); + +static u_int64_t +nanoseconds(void) +{ + struct timespec tv; + + nanotime(&tv); + return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec); +} + +static int +dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int error, i; + + if (first_free_mprof_buf == 0) + return SYSCTL_OUT(req, "No locking recorded", + sizeof("No locking recorded")); + + sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND); + sbuf_printf(sb, "%12s %12s %12s %12s %s\n", + "max", "total", "count", "average", "name"); + mtx_lock_spin(&mprof_mtx); + for (i = 0; i < first_free_mprof_buf; ++i) + sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n", + mprof_buf[i].counter[MPROF_MAX] / 1000, + mprof_buf[i].counter[MPROF_TOT] / 1000, + mprof_buf[i].counter[MPROF_CNT], + mprof_buf[i].counter[MPROF_AVG] / 1000, + mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name); + mtx_unlock_spin(&mprof_mtx); + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return (error); +} +SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics"); +#endif + +/* + * Function versions of the inlined __mtx_* macros. These are used by + * modules and can also be called from assembly language if needed. + */ +void +_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + _get_sleep_lock(m, curthread, opts, file, line); + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +#ifdef MUTEX_PROFILING + /* don't reset the timer when/if recursing */ + if (m->acqtime == 0) { + m->file = file; + m->line = line; + m->acqtime = mutex_prof_enable ? 
nanoseconds() : 0; + ++mutex_prof_acquisitions; + } +#endif +} + +void +_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); +#ifdef MUTEX_PROFILING + if (m->acqtime != 0) { + static const char *unknown = "(unknown)"; + struct mutex_prof *mpp; + u_int64_t acqtime, now; + const char *p, *q; + volatile u_int hash; + + now = nanoseconds(); + acqtime = m->acqtime; + m->acqtime = 0; + if (now <= acqtime) + goto out; + for (p = file; strncmp(p, "../", 3) == 0; p += 3) + /* nothing */ ; + if (p == NULL || *p == '\0') + p = unknown; + for (hash = line, q = p; *q != '\0'; ++q) + hash = (hash * 2 + *q) % MPROF_HASH_SIZE; + mtx_lock_spin(&mprof_mtx); + for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next) + if (mpp->line == line && strcmp(mpp->file, p) == 0) + break; + if (mpp == NULL) { + /* Just exit if we cannot get a trace buffer */ + if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) { + ++mutex_prof_rejected; + goto unlock; + } + mpp = &mprof_buf[first_free_mprof_buf++]; + mpp->name = mtx_name(m); + mpp->file = p; + mpp->line = line; + mpp->next = mprof_hash[hash]; + if (mprof_hash[hash] != NULL) + ++mutex_prof_collisions; + mprof_hash[hash] = mpp; + ++mutex_prof_records; + } + /* + * Record if the mutex has been held longer now than ever + * before + */ + if ((now - acqtime) > mpp->counter[MPROF_MAX]) + mpp->counter[MPROF_MAX] = now - acqtime; + mpp->counter[MPROF_TOT] += now - acqtime; + mpp->counter[MPROF_CNT] += 1; + mpp->counter[MPROF_AVG] = + mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT]; +unlock: + mtx_unlock_spin(&mprof_mtx); + } +out: +#endif + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + _rel_sleep_lock(m, curthread, opts, file, line); +} + +void +_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); +#if defined(SMP) || LOCK_DEBUG > 0 + _get_spin_lock(m, curthread, opts, file, line); +#else + critical_enter(); +#endif + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +} + +void +_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); +#if defined(SMP) || LOCK_DEBUG > 0 + _rel_spin_lock(m); +#else + critical_exit(); +#endif +} + +/* + * The important part of mtx_trylock{,_flags}() + * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that + * if we're called, it's because we know we don't already own this lock. + */ +int +_mtx_trylock(struct mtx *m, int opts, const char *file, int line) +{ + int rval; + + MPASS(curthread != NULL); + + rval = _obtain_lock(m, curthread); + + LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line); + if (rval) { + /* + * We do not handle recursion in _mtx_trylock; see the + * note at the top of the routine. + */ + KASSERT(!mtx_recursed(m), + ("mtx_trylock() called on a recursed mutex")); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + } + + return (rval); +} + +/* + * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. + * + * We call this if the lock is either contested (i.e. 
we need to go to + * sleep waiting for it), or if we need to recurse on it. + */ +void +_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td = curthread; +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + struct thread *owner; +#endif + + if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) { + m->mtx_recurse++; + atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); + return; + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR4(KTR_LOCK, + "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", + m->mtx_object.lo_name, (void *)m->mtx_lock, file, line); + + while (!_obtain_lock(m, td)) { + uintptr_t v; + struct thread *td1; + + mtx_lock_spin(&sched_lock); + /* + * Check if the lock has been released while spinning for + * the sched_lock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + + /* + * The mutex was marked contested on release. This means that + * there are threads blocked on it. + */ + if (v == MTX_CONTESTED) { + td1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(td1 != NULL); + m->mtx_lock = (uintptr_t)td | MTX_CONTESTED; + + if (td1->td_priority < td->td_priority) + td->td_priority = td1->td_priority; + mtx_unlock_spin(&sched_lock); + return; + } + + /* + * If the mutex isn't already contested and a failure occurs + * setting the contested bit, the mutex was either released + * or the state of the MTX_RECURSED bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, + (void *)(v | MTX_CONTESTED))) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + /* + * If the current owner of the lock is executing on another + * CPU, spin instead of blocking. + */ + owner = (struct thread *)(v & MTX_FLAGMASK); + if (m != &Giant && thread_running(owner)) { + mtx_unlock_spin(&sched_lock); + while (mtx_owner(m) == owner && thread_running(owner)) { +#ifdef __i386__ + ia32_pause(); +#endif + } + continue; + } +#endif /* SMP && ADAPTIVE_MUTEXES */ + + /* + * We definitely must sleep for this lock. + */ + mtx_assert(m, MA_NOTOWNED); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM context, we + * must clean up before going to sleep. + */ + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_lock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* + * Put us on the list of threads blocked on this mutex. + */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + td1 = mtx_owner(m); + LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } else { + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) + if (td1->td_priority > td->td_priority) + break; + if (td1) + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } + + /* + * Save who we're blocked on. 
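To sketch the handshake with _mtx_unlock_sleep() further down: if thread B reaches this point while thread A owns the mutex, B is inserted into the mutex's priority-ordered mtx_blocked queue, marked SMTX, lends its priority to A via propagate_priority(), and switches out; when A eventually releases the lock, _mtx_unlock_sleep() dequeues the highest-priority waiter, recomputes A's own priority from its remaining contested locks, marks the waiter SRUN and places it back on the run queue.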
+ */ + td->td_blocked = m; + td->td_mtxname = m->mtx_object.lo_name; + td->td_proc->p_stat = SMTX; + propagate_priority(td); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m, + m->mtx_object.lo_name); + + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p free from blocked on [%p] %s", + td, m, m->mtx_object.lo_name); + + mtx_unlock_spin(&sched_lock); + } + + return; +} + +/* + * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. + * + * This is only called if we need to actually spin for the lock. Recursion + * is handled inline. + */ +void +_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line) +{ + int i = 0; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); + + for (;;) { + if (_obtain_lock(m, curthread)) + break; + + /* Give interrupts a chance while we spin. */ + critical_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) { +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + if (i < 60000000) + DELAY(1); +#ifdef DDB + else if (!db_active) +#else + else +#endif + panic("spin lock %s held by %p for > 5 seconds", + m->mtx_object.lo_name, (void *)m->mtx_lock); +#ifdef __i386__ + ia32_pause(); +#endif + } + critical_enter(); + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); + + return; +} + +/* + * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. + * + * We are only called here if the lock is recursed or contested (i.e. we + * need to wake up a blocked thread). + */ +void +_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td, *td1; + struct mtx *m1; + int pri; + + td = curthread; + + if (mtx_recursed(m)) { + if (--(m->mtx_recurse) == 0) + atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); + return; + } + + mtx_lock_spin(&sched_lock); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); + + td1 = TAILQ_FIRST(&m->mtx_blocked); +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + if (td1 == NULL) { + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); + mtx_unlock_spin(&sched_lock); + return; + } +#endif + MPASS(td->td_proc->p_magic == P_MAGIC); + MPASS(td1->td_proc->p_magic == P_MAGIC); + + TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq); + + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); + } else + atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); + + pri = PRI_MAX; + LIST_FOREACH(m1, &td->td_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority; + if (cp < pri) + pri = cp; + } + + if (pri > td->td_base_pri) + pri = td->td_base_pri; + td->td_priority = pri; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", + m, td1); + + td1->td_blocked = NULL; + td1->td_proc->p_stat = SRUN; + setrunqueue(td1); + + if (td->td_critnest == 1 && td1->td_priority < pri) { +#ifdef notyet + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, 
opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + setrunqueue(td); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p switching out lock=%p", m, + (void *)m->mtx_lock); + + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", + m, (void *)m->mtx_lock); + } + + mtx_unlock_spin(&sched_lock); + + return; +} + +/* + * All the unlocking of MTX_SPIN locks is done inline. + * See the _rel_spin_lock() macro for the details. + */ + +/* + * The backing function for the INVARIANTS-enabled mtx_assert() + */ +#ifdef INVARIANT_SUPPORT +void +_mtx_assert(struct mtx *m, int what, const char *file, int line) +{ + + if (panicstr != NULL) + return; + switch (what) { + case MA_OWNED: + case MA_OWNED | MA_RECURSED: + case MA_OWNED | MA_NOTRECURSED: + if (!mtx_owned(m)) + panic("mutex %s not owned at %s:%d", + m->mtx_object.lo_name, file, line); + if (mtx_recursed(m)) { + if ((what & MA_NOTRECURSED) != 0) + panic("mutex %s recursed at %s:%d", + m->mtx_object.lo_name, file, line); + } else if ((what & MA_RECURSED) != 0) { + panic("mutex %s unrecursed at %s:%d", + m->mtx_object.lo_name, file, line); + } + break; + case MA_NOTOWNED: + if (mtx_owned(m)) + panic("mutex %s owned at %s:%d", + m->mtx_object.lo_name, file, line); + break; + default: + panic("unknown mtx_assert at %s:%d", file, line); + } +} +#endif + +/* + * The MUTEX_DEBUG-enabled mtx_validate() + * + * Most of these checks have been moved off into the LO_INITIALIZED flag + * maintained by the witness code. + */ +#ifdef MUTEX_DEBUG + +void mtx_validate(struct mtx *); + +void +mtx_validate(struct mtx *m) +{ + +/* + * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly + * we can re-enable the kernacc() checks. + */ +#ifndef __alpha__ + /* + * Can't call kernacc() from early init386(), especially when + * initializing Giant mutex, because some stuff in kernacc() + * requires Giant itself. + */ + if (!cold) + if (!kernacc((caddr_t)m, sizeof(m), + VM_PROT_READ | VM_PROT_WRITE)) + panic("Can't read and write to mutex %p", m); +#endif +} +#endif + +/* + * General init routine used by the MTX_SYSINIT() macro. + */ +void +mtx_sysinit(void *arg) +{ + struct mtx_args *margs = arg; + + mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); +} + +/* + * Mutex initialization routine; initialize lock `m' of type contained in + * `opts' with options contained in `opts' and name `name.' The optional + * lock type `type' is used as a general lock category name for use with + * witness. + */ +void +mtx_init(struct mtx *m, const char *name, const char *type, int opts) +{ + struct lock_object *lock; + + MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | + MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0); + +#ifdef MUTEX_DEBUG + /* Diagnostic and error correction */ + mtx_validate(m); +#endif + + lock = &m->mtx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("mutex %s %p already initialized", name, m)); + bzero(m, sizeof(*m)); + if (opts & MTX_SPIN) + lock->lo_class = &lock_class_mtx_spin; + else + lock->lo_class = &lock_class_mtx_sleep; + lock->lo_name = name; + lock->lo_type = type != NULL ? 
type : name; + if (opts & MTX_QUIET) + lock->lo_flags = LO_QUIET; + if (opts & MTX_RECURSE) + lock->lo_flags |= LO_RECURSABLE; + if (opts & MTX_SLEEPABLE) + lock->lo_flags |= LO_SLEEPABLE; + if ((opts & MTX_NOWITNESS) == 0) + lock->lo_flags |= LO_WITNESS; + if (opts & MTX_DUPOK) + lock->lo_flags |= LO_DUPOK; + + m->mtx_lock = MTX_UNOWNED; + TAILQ_INIT(&m->mtx_blocked); + + LOCK_LOG_INIT(lock, opts); + + WITNESS_INIT(lock); +} + +/* + * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be + * passed in as a flag here because if the corresponding mtx_init() was + * called with MTX_QUIET set, then it will already be set in the mutex's + * flags. + */ +void +mtx_destroy(struct mtx *m) +{ + + LOCK_LOG_DESTROY(&m->mtx_object, 0); + + if (!mtx_owned(m)) + MPASS(mtx_unowned(m)); + else { + MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); + + /* Tell witness this isn't locked to make it happy. */ + WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__, + __LINE__); + } + + WITNESS_DESTROY(&m->mtx_object); +} + +/* + * Intialize the mutex code and system mutexes. This is called from the MD + * startup code prior to mi_startup(). The per-CPU data space needs to be + * setup before this is called. + */ +void +mutex_init(void) +{ + + /* Setup thread0 so that mutexes work. */ + LIST_INIT(&thread0.td_contested); + + /* + * Initialize mutexes. + */ + mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_lock(&Giant); +} + +/* + * Encapsulated Giant mutex routines. These routines provide encapsulation + * control for the Giant mutex, allowing sysctls to be used to turn on and + * off Giant around certain subsystems. The default value for the sysctls + * are set to what developers believe is stable and working in regards to + * the Giant pushdown. Developers should not turn off Giant via these + * sysctls unless they know what they are doing. + * + * Callers of mtx_lock_giant() are expected to pass the return value to an + * accompanying mtx_unlock_giant() later on. If multiple subsystems are + * effected by a Giant wrap, all related sysctl variables must be zero for + * the subsystem call to operate without Giant (as determined by the caller). + */ + +SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation"); + +static int kern_giant_all = 0; +SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, ""); + +int kern_giant_proc = 1; /* Giant around PROC locks */ +int kern_giant_file = 1; /* Giant around struct file & filedesc */ +int kern_giant_ucred = 1; /* Giant around ucred */ +SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, ""); + +int +mtx_lock_giant(int sysctlvar) +{ + if (sysctlvar || kern_giant_all) { + mtx_lock(&Giant); + return(1); + } + return(0); +} + +void +mtx_unlock_giant(int s) +{ + if (s) + mtx_unlock(&Giant); +} + diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c new file mode 100644 index 0000000..182221d --- /dev/null +++ b/sys/kern/subr_witness.c @@ -0,0 +1,1488 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Implementation of the `witness' lock verifier. Originally implemented for + * mutexes in BSD/OS. Extended to handle generic lock objects and lock + * classes in FreeBSD. + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction <the heroic witness to divine + * life -- Pilot> + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include "opt_ddb.h" +#include "opt_witness.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <ddb/ddb.h> + +#define WITNESS_COUNT 200 +#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4) +/* + * XXX: This is somewhat bogus, as we assume here that at most 1024 threads + * will hold LOCK_NCHILDREN * 2 locks. We handle failure ok, and we should + * probably be safe for the most part, but it's still a SWAG. + */ +#define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2 + +#define WITNESS_NCHILDREN 6 + +struct witness_child_list_entry; + +struct witness { + const char *w_name; + struct lock_class *w_class; + STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ + STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. 
*/ + struct witness_child_list_entry *w_children; /* Great evilness... */ + const char *w_file; + int w_line; + u_int w_level; + u_int w_refcount; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; +}; + +struct witness_child_list_entry { + struct witness_child_list_entry *wcl_next; + struct witness *wcl_children[WITNESS_NCHILDREN]; + u_int wcl_count; +}; + +STAILQ_HEAD(witness_list, witness); + +struct witness_blessed { + const char *b_lock1; + const char *b_lock2; +}; + +struct witness_order_list_entry { + const char *w_name; + struct lock_class *w_class; +}; + +static struct witness *enroll(const char *description, + struct lock_class *lock_class); +static int itismychild(struct witness *parent, struct witness *child); +static void removechild(struct witness *parent, struct witness *child); +static int isitmychild(struct witness *parent, struct witness *child); +static int isitmydescendant(struct witness *parent, struct witness *child); +static int blessed(struct witness *, struct witness *); +static void witness_display_list(void(*prnt)(const char *fmt, ...), + struct witness_list *list); +static void witness_displaydescendants(void(*)(const char *fmt, ...), + struct witness *); +static void witness_leveldescendents(struct witness *parent, int level); +static void witness_levelall(void); +static struct witness *witness_get(void); +static void witness_free(struct witness *m); +static struct witness_child_list_entry *witness_child_get(void); +static void witness_child_free(struct witness_child_list_entry *wcl); +static struct lock_list_entry *witness_lock_list_get(void); +static void witness_lock_list_free(struct lock_list_entry *lle); +static void witness_display(void(*)(const char *fmt, ...)); +static struct lock_instance *find_instance(struct lock_list_entry *lock_list, + struct lock_object *lock); + +MALLOC_DEFINE(M_WITNESS, "witness", "witness structure"); + +static int witness_watch = 1; +TUNABLE_INT("debug.witness_watch", &witness_watch); +SYSCTL_INT(_debug, OID_AUTO, witness_watch, CTLFLAG_RD, &witness_watch, 0, ""); + +#ifdef DDB +/* + * When DDB is enabled and witness_ddb is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock heirarchy violation occurs + * - locks are held when going to sleep. 
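Both this knob and debug.witness_skipspin below are declared as loader tunables (TUNABLE_INT) in addition to sysctls, so on a WITNESS kernel they can typically also be set from the boot loader, e.g. debug.witness_ddb="1" in loader.conf, or defaulted at compile time via the WITNESS_DDB and WITNESS_SKIPSPIN options.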
+ */ +#ifdef WITNESS_DDB +int witness_ddb = 1; +#else +int witness_ddb = 0; +#endif +TUNABLE_INT("debug.witness_ddb", &witness_ddb); +SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); +#endif /* DDB */ + +#ifdef WITNESS_SKIPSPIN +int witness_skipspin = 1; +#else +int witness_skipspin = 0; +#endif +TUNABLE_INT("debug.witness_skipspin", &witness_skipspin); +SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, + ""); + +static struct mtx w_mtx; +static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); +static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); +static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); +static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); +static struct witness_child_list_entry *w_child_free = NULL; +static struct lock_list_entry *w_lock_list_free = NULL; +static int witness_dead; /* fatal error, probably no memory */ + +static struct witness w_data[WITNESS_COUNT]; +static struct witness_child_list_entry w_childdata[WITNESS_CHILDCOUNT]; +static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; + +static struct witness_order_list_entry order_lists[] = { + { "Giant", &lock_class_mtx_sleep }, + { "proctree", &lock_class_sx }, + { "allproc", &lock_class_sx }, + { "sigio lock", &lock_class_mtx_sleep }, + { "process group", &lock_class_mtx_sleep }, + { "process lock", &lock_class_mtx_sleep }, + { "session", &lock_class_mtx_sleep }, + { "uidinfo hash", &lock_class_mtx_sleep }, + { "uidinfo struct", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * spin locks + */ +#ifdef SMP + { "ap boot", &lock_class_mtx_spin }, +#ifdef __i386__ + { "com", &lock_class_mtx_spin }, +#endif +#endif + { "sio", &lock_class_mtx_spin }, +#ifdef __i386__ + { "cy", &lock_class_mtx_spin }, +#endif + { "ng_node", &lock_class_mtx_spin }, + { "ng_worklist", &lock_class_mtx_spin }, + { "ithread table lock", &lock_class_mtx_spin }, + { "sched lock", &lock_class_mtx_spin }, + { "callout", &lock_class_mtx_spin }, + /* + * leaf locks + */ + { "allpmaps", &lock_class_mtx_spin }, + { "vm page buckets mutex", &lock_class_mtx_spin }, + { "icu", &lock_class_mtx_spin }, +#ifdef SMP + { "smp rendezvous", &lock_class_mtx_spin }, +#endif + { "clk", &lock_class_mtx_spin }, + { "mutex profiling lock", &lock_class_mtx_spin }, + { NULL, NULL }, + { NULL, NULL } +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static struct witness_blessed blessed_list[] = { +}; +static int blessed_count = + sizeof(blessed_list) / sizeof(struct witness_blessed); + +/* + * List of all locks in the system. + */ +TAILQ_HEAD(, lock_object) all_locks = TAILQ_HEAD_INITIALIZER(all_locks); + +static struct mtx all_mtx = { + { &lock_class_mtx_sleep, /* mtx_object.lo_class */ + "All locks list", /* mtx_object.lo_name */ + "All locks list", /* mtx_object.lo_type */ + LO_INITIALIZED, /* mtx_object.lo_flags */ + { NULL, NULL }, /* mtx_object.lo_list */ + NULL }, /* mtx_object.lo_witness */ + MTX_UNOWNED, 0, /* mtx_lock, mtx_recurse */ + TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), + { NULL, NULL } /* mtx_contested */ +}; + +/* + * This global is set to 0 once it becomes safe to use the witness code. + */ +static int witness_cold = 1; + +/* + * Global variables for book keeping. + */ +static int lock_cur_cnt; +static int lock_max_cnt; + +/* + * The WITNESS-enabled diagnostic code. 
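A minimal sketch of the kind of mistake this code exists to catch (lock names are illustrative):

	struct mtx a, b;

	mtx_init(&a, "lock a", NULL, MTX_DEF);
	mtx_init(&b, "lock b", NULL, MTX_DEF);

	/* One code path establishes the order "a before b"... */
	mtx_lock(&a);
	mtx_lock(&b);
	mtx_unlock(&b);
	mtx_unlock(&a);

	/*
	 * ...so when another path acquires them the other way around,
	 * witness_lock() flags a lock order violation (and, with
	 * debug.witness_ddb set, drops into the debugger).
	 */
	mtx_lock(&b);
	mtx_lock(&a);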
+ */ +static void +witness_initialize(void *dummy __unused) +{ + struct lock_object *lock; + struct witness_order_list_entry *order; + struct witness *w, *w1; + int i; + + /* + * We have to release Giant before initializing its witness + * structure so that WITNESS doesn't get confused. + */ + mtx_unlock(&Giant); + mtx_assert(&Giant, MA_NOTOWNED); + + CTR1(KTR_WITNESS, "%s: initializing witness", __func__); + TAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list); + mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | + MTX_NOWITNESS); + for (i = 0; i < WITNESS_COUNT; i++) + witness_free(&w_data[i]); + for (i = 0; i < WITNESS_CHILDCOUNT; i++) + witness_child_free(&w_childdata[i]); + for (i = 0; i < LOCK_CHILDCOUNT; i++) + witness_lock_list_free(&w_locklistdata[i]); + + /* First add in all the specified order lists. */ + for (order = order_lists; order->w_name != NULL; order++) { + w = enroll(order->w_name, order->w_class); + if (w == NULL) + continue; + w->w_file = "order list"; + for (order++; order->w_name != NULL; order++) { + w1 = enroll(order->w_name, order->w_class); + if (w1 == NULL) + continue; + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + + /* Iterate through all locks and add them to witness. */ + mtx_lock(&all_mtx); + TAILQ_FOREACH(lock, &all_locks, lo_list) { + if (lock->lo_flags & LO_WITNESS) + lock->lo_witness = enroll(lock->lo_type, + lock->lo_class); + else + lock->lo_witness = NULL; + } + mtx_unlock(&all_mtx); + + /* Mark the witness code as being ready for use. */ + atomic_store_rel_int(&witness_cold, 0); + + mtx_lock(&Giant); +} +SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL) + +void +witness_init(struct lock_object *lock) +{ + struct lock_class *class; + + class = lock->lo_class; + if (lock->lo_flags & LO_INITIALIZED) + panic("%s: lock (%s) %s is already initialized", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_RECURSABLE) != 0 && + (class->lc_flags & LC_RECURSABLE) == 0) + panic("%s: lock (%s) %s can not be recursable", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_SLEEPABLE) != 0 && + (class->lc_flags & LC_SLEEPABLE) == 0) + panic("%s: lock (%s) %s can not be sleepable", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_UPGRADABLE) != 0 && + (class->lc_flags & LC_UPGRADABLE) == 0) + panic("%s: lock (%s) %s can not be upgradable", __func__, + class->lc_name, lock->lo_name); + + mtx_lock(&all_mtx); + TAILQ_INSERT_TAIL(&all_locks, lock, lo_list); + lock->lo_flags |= LO_INITIALIZED; + lock_cur_cnt++; + if (lock_cur_cnt > lock_max_cnt) + lock_max_cnt = lock_cur_cnt; + mtx_unlock(&all_mtx); + if (!witness_cold && !witness_dead && panicstr == NULL && + (lock->lo_flags & LO_WITNESS) != 0) + lock->lo_witness = enroll(lock->lo_type, class); + else + lock->lo_witness = NULL; +} + +void +witness_destroy(struct lock_object *lock) +{ + struct witness *w; + + if (witness_cold) + panic("lock (%s) %s destroyed while witness_cold", + lock->lo_class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_INITIALIZED) == 0) + panic("%s: lock (%s) %s is not initialized", __func__, + lock->lo_class->lc_name, lock->lo_name); + + /* XXX: need to verify that no one holds the lock */ + w = lock->lo_witness; + if (w != NULL) { + mtx_lock_spin(&w_mtx); + MPASS(w->w_refcount > 0); + w->w_refcount--; + mtx_unlock_spin(&w_mtx); + } + + mtx_lock(&all_mtx); + lock_cur_cnt--; + TAILQ_REMOVE(&all_locks, lock, lo_list); + lock->lo_flags &= ~LO_INITIALIZED; + 
mtx_unlock(&all_mtx); +} + +static void +witness_display_list(void(*prnt)(const char *fmt, ...), + struct witness_list *list) +{ + struct witness *w, *w1; + int found; + + STAILQ_FOREACH(w, list, w_typelist) { + if (w->w_file == NULL) + continue; + found = 0; + STAILQ_FOREACH(w1, list, w_typelist) { + if (isitmychild(w1, w)) { + found++; + break; + } + } + if (found) + continue; + /* + * This lock has no anscestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } +} + +static void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + struct witness *w; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + witness_levelall(); + + /* + * First, handle sleep locks which have been acquired at least + * once. + */ + prnt("Sleep locks:\n"); + witness_display_list(prnt, &w_sleep); + + /* + * Now do spin locks which have been acquired at least once. + */ + prnt("\nSpin locks:\n"); + witness_display_list(prnt, &w_spin); + + /* + * Finally, any locks which have not been acquired yet. + */ + prnt("\nLocks which were never acquired:\n"); + STAILQ_FOREACH(w, &w_all, w_list) { + if (w->w_file != NULL || w->w_refcount == 0) + continue; + prnt("%s\n", w->w_name); + } +} + +void +witness_lock(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *lock1, *lock2; + struct lock_class *class; + struct witness *w, *w1; + struct thread *td; + int i, j; +#ifdef DDB + int go_into_ddb = 0; +#endif /* DDB */ + + if (witness_cold || witness_dead || lock->lo_witness == NULL || + panicstr != NULL) + return; + w = lock->lo_witness; + class = lock->lo_class; + td = curthread; + + if (class->lc_flags & LC_SLEEPLOCK) { + /* + * Since spin locks include a critical section, this check + * impliclty enforces a lock order of all sleep locks before + * all spin locks. + */ + if (td->td_critnest != 0 && (flags & LOP_TRYLOCK) == 0) + panic("blockable sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + lock_list = &td->td_sleeplocks; + } else + lock_list = PCPU_PTR(spinlocks); + + /* + * Try locks do not block if they fail to acquire the lock, thus + * there is no danger of deadlocks or of switching while holding a + * spin lock if we acquire a lock via a try operation. + */ + if (flags & LOP_TRYLOCK) + goto out; + + /* + * Is this the first lock acquired? If so, then no order checking + * is needed. + */ + if (*lock_list == NULL) + goto out; + + /* + * Check to see if we are recursing on a lock we already own. 
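+	 * The per-instance recursion count lives in the low bits of
+	 * li_flags (LI_RECURSEMASK): recursing just increments li_flags
+	 * and witness_unlock() decrements it, so only the outermost
+	 * acquire and release touch the lock-order bookkeeping.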
+ */ + lock1 = find_instance(*lock_list, lock); + if (lock1 != NULL) { + if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && + (flags & LOP_EXCLUSIVE) == 0) { + printf("shared lock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("while exclusively locked from %s:%d\n", + lock1->li_file, lock1->li_line); + panic("share->excl"); + } + if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && + (flags & LOP_EXCLUSIVE) != 0) { + printf("exclusive lock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("while share locked from %s:%d\n", + lock1->li_file, lock1->li_line); + panic("excl->share"); + } + lock1->li_flags++; + if ((lock->lo_flags & LO_RECURSABLE) == 0) { + printf( + "recursed on non-recursive lock (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("first acquired @ %s:%d\n", lock1->li_file, + lock1->li_line); + panic("recurse"); + } + CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, + td->td_proc->p_pid, lock->lo_name, + lock1->li_flags & LI_RECURSEMASK); + lock1->li_file = file; + lock1->li_line = line; + return; + } + + /* + * Check for duplicate locks of the same type. Note that we only + * have to check for this on the last lock we just acquired. Any + * other cases will be caught as lock order violations. + */ + lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; + w1 = lock1->li_lock->lo_witness; + if (w1 == w) { + if (w->w_same_squawked || (lock->lo_flags & LO_DUPOK)) + goto out; + w->w_same_squawked = 1; + printf("acquiring duplicate lock of same type: \"%s\"\n", + lock->lo_type); + printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name, + lock1->li_file, lock1->li_line); + printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line); +#ifdef DDB + go_into_ddb = 1; +#endif /* DDB */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_lock_spin(&w_mtx); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_unlock_spin(&w_mtx); + goto out; + } + if (isitmydescendant(w1, w)) { + mtx_unlock_spin(&w_mtx); + goto out; + } + for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) { + for (i = lle->ll_count - 1; i >= 0; i--, j++) { + + MPASS(j < WITNESS_COUNT); + lock1 = &lle->ll_children[i]; + w1 = lock1->li_lock->lo_witness; + + /* + * If this lock doesn't undergo witness checking, + * then skip it. + */ + if (w1 == NULL) { + KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, + ("lock missing witness structure")); + continue; + } + /* + * If we are locking Giant and we slept with this + * lock, then skip it. + */ + if ((lock1->li_flags & LI_SLEPT) != 0 && + lock == &Giant.mtx_object) + continue; + /* + * If we are locking a sleepable lock and this lock + * isn't sleepable and isn't Giant, we want to treat + * it as a lock order violation to enfore a general + * lock order of sleepable locks before non-sleepable + * locks. Thus, we only bother checking the lock + * order hierarchy if we pass the initial test. + */ + if (!((lock->lo_flags & LO_SLEEPABLE) != 0 && + ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && + lock1->li_lock != &Giant.mtx_object)) && + !isitmydescendant(w, w1)) + continue; + /* + * We have a lock order violation, check to see if it + * is allowed or has already been yelled about. 
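+			 * Known-harmless pairs can be silenced by listing
+			 * them in blessed_list[] above; e.g. (hypothetical
+			 * names) an entry of roughly
+			 *
+			 *	{ "foo", "bar" },
+			 *
+			 * makes blessed() return 1 for either ordering of
+			 * the two locks and skips the report below.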
+ */ + mtx_unlock_spin(&w_mtx); + if (blessed(w, w1)) + goto out; + if (lock1->li_lock == &Giant.mtx_object) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + /* + * Ok, yell about it. + */ + printf("lock order reversal\n"); + /* + * Try to locate an earlier lock with + * witness w in our list. + */ + do { + lock2 = &lle->ll_children[i]; + MPASS(lock2->li_lock != NULL); + if (lock2->li_lock->lo_witness == w) + break; + i--; + if (i == 0 && lle->ll_next != NULL) { + lle = lle->ll_next; + i = lle->ll_count - 1; + MPASS(i != 0); + } + } while (i >= 0); + if (i < 0) { + printf(" 1st %p %s (%s) @ %s:%d\n", + lock1->li_lock, lock1->li_lock->lo_name, + lock1->li_lock->lo_type, lock1->li_file, + lock1->li_line); + printf(" 2nd %p %s (%s) @ %s:%d\n", lock, + lock->lo_name, lock->lo_type, file, line); + } else { + printf(" 1st %p %s (%s) @ %s:%d\n", + lock2->li_lock, lock2->li_lock->lo_name, + lock2->li_lock->lo_type, lock2->li_file, + lock2->li_line); + printf(" 2nd %p %s (%s) @ %s:%d\n", + lock1->li_lock, lock1->li_lock->lo_name, + lock1->li_lock->lo_type, lock1->li_file, + lock1->li_line); + printf(" 3rd %p %s (%s) @ %s:%d\n", lock, + lock->lo_name, lock->lo_type, file, line); + } +#ifdef DDB + go_into_ddb = 1; +#endif /* DDB */ + goto out; + } + } + lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; + /* + * Don't build a new relationship if we are locking Giant just + * after waking up and the previous lock in the list was acquired + * prior to blocking. + */ + if (lock == &Giant.mtx_object && (lock1->li_flags & LI_SLEPT) != 0) + mtx_unlock_spin(&w_mtx); + else { + CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, + lock->lo_type, lock1->li_lock->lo_type); + if (!itismychild(lock1->li_lock->lo_witness, w)) + mtx_unlock_spin(&w_mtx); + } + +out: +#ifdef DDB + if (witness_ddb && go_into_ddb) + Debugger(__func__); +#endif /* DDB */ + w->w_file = file; + w->w_line = line; + + lle = *lock_list; + if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { + lle = witness_lock_list_get(); + if (lle == NULL) + return; + lle->ll_next = *lock_list; + CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, + td->td_proc->p_pid, lle); + *lock_list = lle; + } + lock1 = &lle->ll_children[lle->ll_count++]; + lock1->li_lock = lock; + lock1->li_line = line; + lock1->li_file = file; + if ((flags & LOP_EXCLUSIVE) != 0) + lock1->li_flags = LI_EXCLUSIVE; + else + lock1->li_flags = 0; + CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, + td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); +} + +void +witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_instance *instance; + struct lock_class *class; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + class = lock->lo_class; + if ((lock->lo_flags & LO_UPGRADABLE) == 0) + panic("upgrade of non-upgradable lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((flags & LOP_TRYLOCK) == 0) + panic("non-try upgrade of lock (%s) %s @ %s:%d", class->lc_name, + lock->lo_name, file, line); + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("upgrade of non-sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("upgrade of unlocked lock (%s) %s @ %s:%d", + 
class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_EXCLUSIVE) != 0) + panic("upgrade of exclusive lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_RECURSEMASK) != 0) + panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d", + class->lc_name, lock->lo_name, + instance->li_flags & LI_RECURSEMASK, file, line); + instance->li_flags |= LI_EXCLUSIVE; +} + +void +witness_downgrade(struct lock_object *lock, int flags, const char *file, + int line) +{ + struct lock_instance *instance; + struct lock_class *class; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + class = lock->lo_class; + if ((lock->lo_flags & LO_UPGRADABLE) == 0) + panic("downgrade of non-upgradable lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("downgrade of non-sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("downgrade of unlocked lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_EXCLUSIVE) == 0) + panic("downgrade of shared lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_RECURSEMASK) != 0) + panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d", + class->lc_name, lock->lo_name, + instance->li_flags & LI_RECURSEMASK, file, line); + instance->li_flags &= ~LI_EXCLUSIVE; +} + +void +witness_unlock(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *instance; + struct lock_class *class; + struct thread *td; + register_t s; + int i, j; + + if (witness_cold || witness_dead || lock->lo_witness == NULL || + panicstr != NULL) + return; + td = curthread; + class = lock->lo_class; + if (class->lc_flags & LC_SLEEPLOCK) + lock_list = &td->td_sleeplocks; + else + lock_list = PCPU_PTR(spinlocks); + for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) + for (i = 0; i < (*lock_list)->ll_count; i++) { + instance = &(*lock_list)->ll_children[i]; + if (instance->li_lock == lock) { + if ((instance->li_flags & LI_EXCLUSIVE) != 0 && + (flags & LOP_EXCLUSIVE) == 0) { + printf( + "shared unlock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, + file, line); + printf( + "while exclusively locked from %s:%d\n", + instance->li_file, + instance->li_line); + panic("excl->ushare"); + } + if ((instance->li_flags & LI_EXCLUSIVE) == 0 && + (flags & LOP_EXCLUSIVE) != 0) { + printf( + "exclusive unlock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, + file, line); + printf( + "while share locked from %s:%d\n", + instance->li_file, + instance->li_line); + panic("share->uexcl"); + } + /* If we are recursed, unrecurse. 
*/ + if ((instance->li_flags & LI_RECURSEMASK) > 0) { + CTR4(KTR_WITNESS, + "%s: pid %d unrecursed on %s r=%d", __func__, + td->td_proc->p_pid, + instance->li_lock->lo_name, + instance->li_flags); + instance->li_flags--; + return; + } + s = intr_disable(); + CTR4(KTR_WITNESS, + "%s: pid %d removed %s from lle[%d]", __func__, + td->td_proc->p_pid, + instance->li_lock->lo_name, + (*lock_list)->ll_count - 1); + for (j = i; j < (*lock_list)->ll_count - 1; j++) + (*lock_list)->ll_children[j] = + (*lock_list)->ll_children[j + 1]; + (*lock_list)->ll_count--; + intr_restore(s); + if ((*lock_list)->ll_count == 0) { + lle = *lock_list; + *lock_list = lle->ll_next; + CTR3(KTR_WITNESS, + "%s: pid %d removed lle %p", __func__, + td->td_proc->p_pid, lle); + witness_lock_list_free(lle); + } + return; + } + } + panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, + file, line); +} + +/* + * Warn if any held locks are not sleepable. Note that Giant and the lock + * passed in are both special cases since they are both released during the + * sleep process and aren't actually held while the thread is asleep. + */ +int +witness_sleep(int check_only, struct lock_object *lock, const char *file, + int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *lock1; + struct thread *td; + int i, n; + + if (witness_cold || witness_dead || panicstr != NULL) + return (0); + n = 0; + td = curthread; + lock_list = &td->td_sleeplocks; +again: + for (lle = *lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + lock1 = &lle->ll_children[i]; + if (lock1->li_lock == lock || + lock1->li_lock == &Giant.mtx_object) + continue; + if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) { + if (check_only == 0) { + CTR3(KTR_WITNESS, + "pid %d: sleeping with lock (%s) %s held", + td->td_proc->p_pid, + lock1->li_lock->lo_class->lc_name, + lock1->li_lock->lo_name); + lock1->li_flags |= LI_SLEPT; + } + continue; + } + n++; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? "could sleep" : "sleeping", + lock1->li_lock->lo_name, lock1->li_file, + lock1->li_line); + } + if (lock_list == &td->td_sleeplocks && PCPU_GET(spinlocks) != NULL) { + /* + * Since we already hold a spinlock preemption is + * already blocked. + */ + lock_list = PCPU_PTR(spinlocks); + goto again; + } +#ifdef DDB + if (witness_ddb && n) + Debugger(__func__); +#endif /* DDB */ + return (n); +} + +static struct witness * +enroll(const char *description, struct lock_class *lock_class) +{ + struct witness *w; + + if (!witness_watch || witness_dead || panicstr != NULL) + return (NULL); + if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin) + return (NULL); + mtx_lock_spin(&w_mtx); + STAILQ_FOREACH(w, &w_all, w_list) { + if (w->w_name == description || (w->w_refcount > 0 && + strcmp(description, w->w_name) == 0)) { + w->w_refcount++; + mtx_unlock_spin(&w_mtx); + if (lock_class != w->w_class) + panic( + "lock (%s) %s does not match earlier (%s) lock", + description, lock_class->lc_name, + w->w_class->lc_name); + return (w); + } + } + /* + * This isn't quite right, as witness_cold is still 0 while we + * enroll all the locks initialized before witness_initialize(). 
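+	 * Note, as an illustration: because of the panic below, a new spin
+	 * lock normally has to be listed in order_lists[] above, e.g. by
+	 * adding an entry such as
+	 *
+	 *	{ "my new spinlock", &lock_class_mtx_spin },
+	 *
+	 * ahead of the terminating { NULL, NULL } of the spin-lock group.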
+ */ + if ((lock_class->lc_flags & LC_SPINLOCK) && !witness_cold) { + mtx_unlock_spin(&w_mtx); + panic("spin lock %s not in order list", description); + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_name = description; + w->w_class = lock_class; + w->w_refcount = 1; + STAILQ_INSERT_HEAD(&w_all, w, w_list); + if (lock_class->lc_flags & LC_SPINLOCK) + STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); + else if (lock_class->lc_flags & LC_SLEEPLOCK) + STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); + else { + mtx_unlock_spin(&w_mtx); + panic("lock class %s is not sleep or spin", + lock_class->lc_name); + } + mtx_unlock_spin(&w_mtx); + return (w); +} + +static int +itismychild(struct witness *parent, struct witness *child) +{ + static int recursed; + struct witness_child_list_entry **wcl; + struct witness_list *list; + + MPASS(child != NULL && parent != NULL); + if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) != + (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))) + panic( + "%s: parent (%s) and child (%s) are not the same lock type", + __func__, parent->w_class->lc_name, + child->w_class->lc_name); + + /* + * Insert "child" after "parent" + */ + wcl = &parent->w_children; + while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN) + wcl = &(*wcl)->wcl_next; + if (*wcl == NULL) { + *wcl = witness_child_get(); + if (*wcl == NULL) + return (1); + } + (*wcl)->wcl_children[(*wcl)->wcl_count++] = child; + + /* + * Now prune whole tree. We look for cases where a lock is now + * both a descendant and a direct child of a given lock. In that + * case, we want to remove the direct child link from the tree. + */ + if (recursed) + return (0); + recursed = 1; + if (parent->w_class->lc_flags & LC_SLEEPLOCK) + list = &w_sleep; + else + list = &w_spin; + STAILQ_FOREACH(child, list, w_typelist) { + STAILQ_FOREACH(parent, list, w_typelist) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry **wcl, *wcl1; + int i; + + for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next) + for (i = 0; i < (*wcl)->wcl_count; i++) + if ((*wcl)->wcl_children[i] == child) + goto found; + return; +found: + (*wcl)->wcl_count--; + if ((*wcl)->wcl_count > i) + (*wcl)->wcl_children[i] = + (*wcl)->wcl_children[(*wcl)->wcl_count]; + MPASS((*wcl)->wcl_children[i] != NULL); + if ((*wcl)->wcl_count != 0) + return; + wcl1 = *wcl; + *wcl = wcl1->wcl_next; + witness_child_free(wcl1); +} + +static int +isitmychild(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry *wcl; + int i; + + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { + for (i = 0; i < wcl->wcl_count; i++) { + if (wcl->wcl_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry *wcl; + int i, j; + + if (isitmychild(parent, child)) + return (1); + j = 0; + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { + MPASS(j < 1000); + for (i = 0; i < wcl->wcl_count; i++) { + if (isitmydescendant(wcl->wcl_children[i], child)) + return (1); + } + j++; + } + return (0); +} + +void +witness_levelall (void) +{ + struct witness_list *list; + struct witness *w, *w1; + + /* + * First clear 
all levels. + */ + STAILQ_FOREACH(w, &w_all, w_list) { + w->w_level = 0; + } + + /* + * Look for locks with no parent and level all their descendants. + */ + STAILQ_FOREACH(w, &w_all, w_list) { + /* + * This is just an optimization, technically we could get + * away just walking the all list each time. + */ + if (w->w_class->lc_flags & LC_SLEEPLOCK) + list = &w_sleep; + else + list = &w_spin; + STAILQ_FOREACH(w1, list, w_typelist) { + if (isitmychild(w1, w)) + goto skip; + } + witness_leveldescendents(w, 0); + skip: + ; /* silence GCC 3.x */ + } +} + +static void +witness_leveldescendents(struct witness *parent, int level) +{ + struct witness_child_list_entry *wcl; + int i; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) + for (i = 0; i < wcl->wcl_count; i++) + witness_leveldescendents(wcl->wcl_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), + struct witness *parent) +{ + struct witness_child_list_entry *wcl; + int i, level; + + level = parent->w_level; + prnt("%-2d", level); + for (i = 0; i < level; i++) + prnt(" "); + if (parent->w_refcount > 0) { + prnt("%s", parent->w_name); + if (parent->w_file != NULL) + prnt(" -- last acquired @ %s:%d\n", parent->w_file, + parent->w_line); + } else + prnt("(dead)\n"); + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) + for (i = 0; i < wcl->wcl_count; i++) + witness_displaydescendants(prnt, + wcl->wcl_children[i]); +} + +static int +blessed(struct witness *w1, struct witness *w2) +{ + int i; + struct witness_blessed *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_name, b->b_lock1) == 0) { + if (strcmp(w2->w_name, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_name, b->b_lock2) == 0) + if (strcmp(w2->w_name, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static struct witness * +witness_get(void) +{ + struct witness *w; + + if (witness_dead) { + mtx_unlock_spin(&w_mtx); + return (NULL); + } + if (STAILQ_EMPTY(&w_free)) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w = STAILQ_FIRST(&w_free); + STAILQ_REMOVE_HEAD(&w_free, w_list); + bzero(w, sizeof(*w)); + return (w); +} + +static void +witness_free(struct witness *w) +{ + + STAILQ_INSERT_HEAD(&w_free, w, w_list); +} + +static struct witness_child_list_entry * +witness_child_get(void) +{ + struct witness_child_list_entry *wcl; + + if (witness_dead) { + mtx_unlock_spin(&w_mtx); + return (NULL); + } + wcl = w_child_free; + if (wcl == NULL) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w_child_free = wcl->wcl_next; + bzero(wcl, sizeof(*wcl)); + return (wcl); +} + +static void +witness_child_free(struct witness_child_list_entry *wcl) +{ + + wcl->wcl_next = w_child_free; + w_child_free = wcl; +} + +static struct lock_list_entry * +witness_lock_list_get(void) +{ + struct lock_list_entry *lle; + + if (witness_dead) + return (NULL); + mtx_lock_spin(&w_mtx); + lle = w_lock_list_free; + if (lle == NULL) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w_lock_list_free = lle->ll_next; + mtx_unlock_spin(&w_mtx); + bzero(lle, sizeof(*lle)); + return (lle); +} + +static void +witness_lock_list_free(struct lock_list_entry *lle) +{ + + mtx_lock_spin(&w_mtx); + lle->ll_next = 
w_lock_list_free; + w_lock_list_free = lle; + mtx_unlock_spin(&w_mtx); +} + +static struct lock_instance * +find_instance(struct lock_list_entry *lock_list, struct lock_object *lock) +{ + struct lock_list_entry *lle; + struct lock_instance *instance; + int i; + + for (lle = lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + instance = &lle->ll_children[i]; + if (instance->li_lock == lock) + return (instance); + } + return (NULL); +} + +int +witness_list_locks(struct lock_list_entry **lock_list) +{ + struct lock_list_entry *lle; + struct lock_instance *instance; + struct lock_object *lock; + int i, nheld; + + nheld = 0; + for (lle = *lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + instance = &lle->ll_children[i]; + lock = instance->li_lock; + printf("%s %s %s", + (instance->li_flags & LI_EXCLUSIVE) != 0 ? + "exclusive" : "shared", + lock->lo_class->lc_name, lock->lo_name); + if (lock->lo_type != lock->lo_name) + printf(" (%s)", lock->lo_type); + printf(" r = %d (%p) locked @ %s:%d\n", + instance->li_flags & LI_RECURSEMASK, lock, + instance->li_file, instance->li_line); + nheld++; + } + return (nheld); +} + +/* + * Calling this on td != curthread is bad unless we are in ddb. + */ +int +witness_list(struct thread *td) +{ + int nheld; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); +#ifdef DDB + KASSERT(td == curthread || db_active, + ("%s: td != curthread and we aren't in the debugger", __func__)); + if (!db_active && witness_dead) + return (0); +#else + KASSERT(td == curthread, ("%s: p != curthread", __func__)); + if (witness_dead) + return (0); +#endif + nheld = witness_list_locks(&td->td_sleeplocks); + + /* + * We only handle spinlocks if td == curthread. This is somewhat broken + * if td is currently executing on some other CPU and holds spin locks + * as we won't display those locks. If we had a MI way of getting + * the per-cpu data for a given cpu then we could use + * td->td_kse->ke_oncpu to get the list of spinlocks for this thread + * and "fix" this. + * + * That still wouldn't really fix this unless we locked sched_lock + * or stopped the other CPU to make sure it wasn't changing the list + * out from under us. It is probably best to just not try to handle + * threads on other CPU's for now. 
+ */ + if (td == curthread && PCPU_GET(spinlocks) != NULL) + nheld += witness_list_locks(PCPU_PTR(spinlocks)); + + return (nheld); +} + +void +witness_save(struct lock_object *lock, const char **filep, int *linep) +{ + struct lock_instance *instance; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("%s: lock (%s) %s is not a sleep lock", __func__, + lock->lo_class->lc_name, lock->lo_name); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("%s: lock (%s) %s not locked", __func__, + lock->lo_class->lc_name, lock->lo_name); + *filep = instance->li_file; + *linep = instance->li_line; +} + +void +witness_restore(struct lock_object *lock, const char *file, int line) +{ + struct lock_instance *instance; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("%s: lock (%s) %s is not a sleep lock", __func__, + lock->lo_class->lc_name, lock->lo_name); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("%s: lock (%s) %s not locked", __func__, + lock->lo_class->lc_name, lock->lo_name); + lock->lo_witness->w_file = file; + lock->lo_witness->w_line = line; + instance->li_file = file; + instance->li_line = line; +} + +void +witness_assert(struct lock_object *lock, int flags, const char *file, int line) +{ +#ifdef INVARIANT_SUPPORT + struct lock_instance *instance; + + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) != 0) + instance = find_instance(curthread->td_sleeplocks, lock); + else if ((lock->lo_class->lc_flags & LC_SPINLOCK) != 0) + instance = find_instance(PCPU_GET(spinlocks), lock); + else { + panic("Lock (%s) %s is not sleep or spin!", + lock->lo_class->lc_name, lock->lo_name); + return; + } + switch (flags) { + case LA_UNLOCKED: + if (instance != NULL) + panic("Lock (%s) %s locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + case LA_LOCKED: + case LA_LOCKED | LA_RECURSED: + case LA_LOCKED | LA_NOTRECURSED: + case LA_SLOCKED: + case LA_SLOCKED | LA_RECURSED: + case LA_SLOCKED | LA_NOTRECURSED: + case LA_XLOCKED: + case LA_XLOCKED | LA_RECURSED: + case LA_XLOCKED | LA_NOTRECURSED: + if (instance == NULL) { + panic("Lock (%s) %s not locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + } + if ((flags & LA_XLOCKED) != 0 && + (instance->li_flags & LI_EXCLUSIVE) == 0) + panic("Lock (%s) %s not exclusively locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_SLOCKED) != 0 && + (instance->li_flags & LI_EXCLUSIVE) != 0) + panic("Lock (%s) %s exclusively locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_RECURSED) != 0 && + (instance->li_flags & LI_RECURSEMASK) == 0) + panic("Lock (%s) %s not recursed @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_NOTRECURSED) != 0 && + (instance->li_flags & LI_RECURSEMASK) != 0) + panic("Lock (%s) %s recursed @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + default: + panic("Invalid lock assertion at %s:%d.", file, line); + + } +#endif /* INVARIANT_SUPPORT */ +} + +#ifdef DDB + +DB_SHOW_COMMAND(locks, db_witness_list) +{ + 
struct thread *td; + pid_t pid; + struct proc *p; + + if (have_addr) { + pid = (addr % 16) + ((addr >> 4) % 16) * 10 + + ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 + + ((addr >> 16) % 16) * 10000; + /* sx_slock(&allproc_lock); */ + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_pid == pid) + break; + } + /* sx_sunlock(&allproc_lock); */ + if (p == NULL) { + db_printf("pid %d not found\n", pid); + return; + } + FOREACH_THREAD_IN_PROC(p, td) { + witness_list(td); + } + } else { + td = curthread; + witness_list(td); + } +} + +DB_SHOW_COMMAND(witness, db_witness_display) +{ + + witness_display(db_printf); +} +#endif diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c new file mode 100644 index 0000000..c9d2676 --- /dev/null +++ b/sys/kern/subr_xxx.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Miscellaneous trivial functions. + */ +#include <sys/param.h> +#include <sys/systm.h> + +/* + * Return error for operation not supported + * on a specific object or file type. + */ +int +eopnotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Generic null operation, always returns success. + */ +int +nullop() +{ + + return (0); +} + +#include <sys/conf.h> + +/* + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. 
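+ *
+ * Hypothetical example of their use: a read-only device driver can
+ * point the write entry of its cdevsw at nowrite() and the ioctl entry
+ * at noioctl(), so callers simply get ENODEV back without the driver
+ * supplying stubs of its own.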
+ */ + +int +noopen(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, td) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct thread *td; +{ + + return (ENODEV); +} + +int +nokqfilter(dev, kn) + dev_t dev; + struct knote *kn; +{ + + return (ENODEV); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + vm_offset_t offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + +int +nodump(dev_t dev, void *virtual __unused, vm_offset_t physical __unused, off_t offset __unused, size_t length __unused) +{ + + return (ENODEV); +} + +/* + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. + */ +int +nullopen(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (0); +} + +int +nullclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (0); +} diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c new file mode 100644 index 0000000..1bdd913 --- /dev/null +++ b/sys/kern/sys_generic.c @@ -0,0 +1,1210 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/socketvar.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/resourcevar.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/condvar.h> +#ifdef __alpha__ +#include <sys/disklabel.h> +#endif +#ifdef KTRACE +#include <sys/ktrace.h> +#endif +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/limits.h> + +static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); +static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); +MALLOC_DEFINE(M_IOV, "iov", "large iov's"); + +static int pollscan(struct thread *, struct pollfd *, u_int); +static int selscan(struct thread *, fd_mask **, fd_mask **, int); +static int dofileread(struct thread *, struct file *, int, void *, + size_t, off_t, int); +static int dofilewrite(struct thread *, struct file *, int, + const void *, size_t, off_t, int); + +/* + * Read system call. 
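+ *
+ * read() and pread() below differ mainly in offset handling: pread()
+ * supplies an explicit offset (FOF_OFFSET) and rejects non-vnode
+ * descriptors with ESPIPE.  Both end up in dofileread(), which returns
+ * the byte count through td->td_retval[0].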
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + void *buf; + size_t nbyte; +}; +#endif +/* + * MPSAFE + */ +int +read(td, uap) + struct thread *td; + struct read_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_read(td, uap->fd, &fp)) == 0) { + error = dofileread(td, fp, uap->fd, uap->buf, + uap->nbyte, (off_t)-1, 0); + fdrop(fp, td); + } + return(error); +} + +/* + * Pread system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct pread_args { + int fd; + void *buf; + size_t nbyte; + int pad; + off_t offset; +}; +#endif +/* + * MPSAFE + */ +int +pread(td, uap) + struct thread *td; + struct pread_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_read(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + error = ESPIPE; + } else { + error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET); + } + fdrop(fp, td); + return(error); +} + +/* + * Code common for read and pread + */ +int +dofileread(td, fp, fd, buf, nbyte, offset, flags) + struct thread *td; + struct file *fp; + int fd, flags; + void *buf; + size_t nbyte; + off_t offset; +{ + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; + struct uio ktruio; + int didktr = 0; +#endif + + aiov.iov_base = (caddr_t)buf; + aiov.iov_len = nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + if (nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = nbyte; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(td, KTR_GENIO)) { + ktriov = aiov; + ktruio = auio; + didktr = 1; + } +#endif + cnt = nbyte; + + if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (didktr && error == 0) { + ktruio.uio_iov = &ktriov; + ktruio.uio_resid = cnt; + ktrgenio(fd, UIO_READ, &ktruio, error); + } +#endif + td->td_retval[0] = cnt; + return (error); +} + +/* + * Scatter read system call. 
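+ *
+ * The iovec array is validated before use: more than UIO_MAXIOV entries
+ * yields EINVAL, as does a total length that would overflow INT_MAX,
+ * mirroring the single-buffer check in dofileread().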
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +/* + * MPSAFE + */ +int +readv(td, uap) + struct thread *td; + struct readv_args *uap; +{ + struct file *fp; + struct uio auio; + struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt; + int error; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; +#endif + + if ((error = fget_read(td, uap->fd, &fp)) != 0) + return (error); + needfree = NULL; + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) { + error = EINVAL; + goto done; + } + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else + iov = aiov; + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(td, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + ktruio = auio; + } +#endif + cnt = auio.uio_resid; + if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = cnt; + ktrgenio(uap->fd, UIO_READ, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + td->td_retval[0] = cnt; +done: + fdrop(fp, td); + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + const void *buf; + size_t nbyte; +}; +#endif +/* + * MPSAFE + */ +int +write(td, uap) + struct thread *td; + struct write_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_write(td, uap->fd, &fp)) == 0) { + error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, + (off_t)-1, 0); + fdrop(fp, td); + } else { + error = EBADF; /* XXX this can't be right */ + } + return(error); +} + +/* + * Pwrite system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct pwrite_args { + int fd; + const void *buf; + size_t nbyte; + int pad; + off_t offset; +}; +#endif +/* + * MPSAFE + */ +int +pwrite(td, uap) + struct thread *td; + struct pwrite_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_write(td, uap->fd, &fp)) == 0) { + if (fp->f_type == DTYPE_VNODE) { + error = dofilewrite(td, fp, uap->fd, uap->buf, + uap->nbyte, uap->offset, FOF_OFFSET); + } else { + error = ESPIPE; + } + fdrop(fp, td); + } else { + error = EBADF; /* this can't be right */ + } + return(error); +} + +static int +dofilewrite(td, fp, fd, buf, nbyte, offset, flags) + struct thread *td; + struct file *fp; + int fd, flags; + const void *buf; + size_t nbyte; + off_t offset; +{ + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; + struct uio ktruio; + int didktr = 0; +#endif + + aiov.iov_base = (void *)(uintptr_t)buf; + aiov.iov_len = nbyte; + auio.uio_iov = &aiov; + 
auio.uio_iovcnt = 1; + auio.uio_offset = offset; + if (nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = nbyte; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec and uio + */ + if (KTRPOINT(td, KTR_GENIO)) { + ktriov = aiov; + ktruio = auio; + didktr = 1; + } +#endif + cnt = nbyte; + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* Socket layer is responsible for issuing SIGPIPE. */ + if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (didktr && error == 0) { + ktruio.uio_iov = &ktriov; + ktruio.uio_resid = cnt; + ktrgenio(fd, UIO_WRITE, &ktruio, error); + } +#endif + td->td_retval[0] = cnt; + return (error); +} + +/* + * Gather write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +/* + * MPSAFE + */ +int +writev(td, uap) + struct thread *td; + register struct writev_args *uap; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; +#endif + + mtx_lock(&Giant); + if ((error = fget_write(td, uap->fd, &fp)) != 0) { + error = EBADF; + goto done2; + } + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) { + needfree = NULL; + error = EINVAL; + goto done; + } + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec and uio + */ + if (KTRPOINT(td, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + ktruio = auio; + } +#endif + cnt = auio.uio_resid; + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = cnt; + ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + td->td_retval[0] = cnt; +done: + fdrop(fp, td); + if (needfree) + FREE(needfree, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Ioctl system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + u_long com; + caddr_t data; +}; +#endif +/* + * MPSAFE + */ +/* 
ARGSUSED */ +int +ioctl(td, uap) + struct thread *td; + register struct ioctl_args *uap; +{ + struct file *fp; + register struct filedesc *fdp; + register u_long com; + int error = 0; + register u_int size; + caddr_t data, memp; + int tmp; +#define STK_PARAMS 128 + union { + char stkbuf[STK_PARAMS]; + long align; + } ubuf; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + mtx_lock(&Giant); + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + fdrop(fp, td); + mtx_unlock(&Giant); + return (EBADF); + } + fdp = td->td_proc->p_fd; + switch (com = uap->com) { + case FIONCLEX: + FILEDESC_LOCK(fdp); + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + mtx_unlock(&Giant); + return (0); + case FIOCLEX: + FILEDESC_LOCK(fdp); + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + mtx_unlock(&Giant); + return (0); + } + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's address space. + */ + size = IOCPARM_LEN(com); + if (size > IOCPARM_MAX) { + fdrop(fp, td); + mtx_unlock(&Giant); + return (ENOTTY); + } + + memp = NULL; + if (size > sizeof (ubuf.stkbuf)) { + memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + data = memp; + } else { + data = ubuf.stkbuf; + } + if (com&IOC_IN) { + if (size) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (memp) + free(memp, M_IOCTLOPS); + fdrop(fp, td); + goto done; + } + } else { + *(caddr_t *)data = uap->data; + } + } else if ((com&IOC_OUT) && size) { + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + } else if (com&IOC_VOID) { + *(caddr_t *)data = uap->data; + } + + switch (com) { + + case FIONBIO: + FILE_LOCK(fp); + if ((tmp = *(int *)data)) + fp->f_flag |= FNONBLOCK; + else + fp->f_flag &= ~FNONBLOCK; + FILE_UNLOCK(fp); + error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + break; + + case FIOASYNC: + FILE_LOCK(fp); + if ((tmp = *(int *)data)) + fp->f_flag |= FASYNC; + else + fp->f_flag &= ~FASYNC; + FILE_UNLOCK(fp); + error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); + break; + + default: + error = fo_ioctl(fp, com, data, td); + /* + * Copy any data to user, size was + * already set and checked above. + */ + if (error == 0 && (com&IOC_OUT) && size) + error = copyout(data, uap->data, (u_int)size); + break; + } + if (memp) + free(memp, M_IOCTLOPS); + fdrop(fp, td); +done: + mtx_unlock(&Giant); + return (error); +} + +/* + * sellock and selwait are initialized in selectinit() via SYSINIT. + */ +struct mtx sellock; +struct cv selwait; +u_int nselcoll; /* Select collisions since boot */ +SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); + +/* + * Select system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif +/* + * MPSAFE + */ +int +select(td, uap) + register struct thread *td; + register struct select_args *uap; +{ + struct filedesc *fdp; + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. 
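+	 * Spelled out with the sizes used here (s_selbits holds 2048 bits,
+	 * i.e. 256 bytes): for nd = 1024 with a single non-null set,
+	 * ncpbytes = 128 and nbufbytes = 2 * 128 = 256, which exactly
+	 * fits; for nd = 256 with all three sets, nbufbytes = 3 * 2 * 32 =
+	 * 192.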
+ */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; + struct timeval atv, rtv, ttv; + int error, timo; + u_int ncoll, nbufbytes, ncpbytes, nfdbits; + + if (uap->nd < 0) + return (EINVAL); + fdp = td->td_proc->p_fd; + mtx_lock(&Giant); + FILEDESC_LOCK(fdp); + + if (uap->nd > td->td_proc->p_fd->fd_nfiles) + uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + FILEDESC_UNLOCK(fdp); + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. + */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. + */ + sbp = selbits; +#define getbits(name, x) \ + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done_nosellock; \ + } \ + } while (0) + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); + + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done_nosellock; + if (itimerfix(&atv)) { + error = EINVAL; + goto done_nosellock; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + timo = 0; + mtx_lock(&sellock); +retry: + ncoll = nselcoll; + mtx_lock_spin(&sched_lock); + td->td_flags |= TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + + /* XXX Is there a better place for this? */ + TAILQ_INIT(&td->td_selq); + error = selscan(td, ibits, obits, uap->nd); + mtx_lock(&sellock); + if (error || td->td_retval[0]) + goto done; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + + /* + * An event of interest may occur while we do not hold + * sellock, so check TDF_SELECT and the number of + * collisions and rescan the file descriptors if + * necessary. + */ + mtx_lock_spin(&sched_lock); + if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { + mtx_unlock_spin(&sched_lock); + goto retry; + } + mtx_unlock_spin(&sched_lock); + + if (timo > 0) + error = cv_timedwait_sig(&selwait, &sellock, timo); + else + error = cv_wait_sig(&selwait, &sellock); + + if (error == 0) + goto retry; + +done: + clear_selinfo_list(td); + mtx_lock_spin(&sched_lock); + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + +done_nosellock: + /* select is not restarted after signals... 
*/ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); + + mtx_unlock(&Giant); + return (error); +} + +static int +selscan(td, ibits, obits, nfd) + struct thread *td; + fd_mask **ibits, **obits; + int nfd; +{ + int msk, i, fd; + fd_mask bits; + struct file *fp; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + struct filedesc *fdp = td->td_proc->p_fd; + + FILEDESC_LOCK(fdp); + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk][i/NFDBITS]; + /* ffs(int mask) not portable, fd_mask is long */ + for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { + if (!(bits & 1)) + continue; + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if (fo_poll(fp, flag[msk], fp->f_cred, td)) { + obits[msk][(fd)/NFDBITS] |= + ((fd_mask)1 << ((fd) % NFDBITS)); + n++; + } + } + } + } + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = n; + return (0); +} + +/* + * Poll system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +/* + * MPSAFE + */ +int +poll(td, uap) + struct thread *td; + struct poll_args *uap; +{ + caddr_t bits; + char smallbits[32 * sizeof(struct pollfd)]; + struct timeval atv, rtv, ttv; + int error = 0, timo; + u_int ncoll, nfds; + size_t ni; + + nfds = SCARG(uap, nfds); + + mtx_lock(&Giant); + /* + * This is kinda bogus. We have fd limits, but that is not + * really related to the size of the pollfd array. Make sure + * we let the process use at least FD_SETSIZE entries and at + * least enough for the current limits. We want to be reasonably + * safe, but not overly restrictive. + */ + if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && + (nfds > FD_SETSIZE)) { + error = EINVAL; + goto done2; + } + ni = nfds * sizeof(struct pollfd); + if (ni > sizeof(smallbits)) + bits = malloc(ni, M_TEMP, M_WAITOK); + else + bits = smallbits; + error = copyin(SCARG(uap, fds), bits, ni); + if (error) + goto done_nosellock; + if (SCARG(uap, timeout) != INFTIM) { + atv.tv_sec = SCARG(uap, timeout) / 1000; + atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; + if (itimerfix(&atv)) { + error = EINVAL; + goto done_nosellock; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + timo = 0; + mtx_lock(&sellock); +retry: + ncoll = nselcoll; + mtx_lock_spin(&sched_lock); + td->td_flags |= TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + + /* XXX Is there a better place for this? */ + TAILQ_INIT(&td->td_selq); + error = pollscan(td, (struct pollfd *)bits, nfds); + mtx_lock(&sellock); + if (error || td->td_retval[0]) + goto done; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + /* + * An event of interest may occur while we do not hold + * sellock, so check TDF_SELECT and the number of collisions + * and rescan the file descriptors if necessary. 
+ */ + mtx_lock_spin(&sched_lock); + if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { + mtx_unlock_spin(&sched_lock); + goto retry; + } + mtx_unlock_spin(&sched_lock); + + if (timo > 0) + error = cv_timedwait_sig(&selwait, &sellock, timo); + else + error = cv_wait_sig(&selwait, &sellock); + + if (error == 0) + goto retry; + +done: + clear_selinfo_list(td); + mtx_lock_spin(&sched_lock); + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + +done_nosellock: + /* poll is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + if (error == 0) { + error = copyout(bits, SCARG(uap, fds), ni); + if (error) + goto out; + } +out: + if (ni > sizeof(smallbits)) + free(bits, M_TEMP); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +pollscan(td, fds, nfd) + struct thread *td; + struct pollfd *fds; + u_int nfd; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int i; + struct file *fp; + int n = 0; + + FILEDESC_LOCK(fdp); + for (i = 0; i < nfd; i++, fds++) { + if (fds->fd >= fdp->fd_nfiles) { + fds->revents = POLLNVAL; + n++; + } else if (fds->fd < 0) { + fds->revents = 0; + } else { + fp = fdp->fd_ofiles[fds->fd]; + if (fp == NULL) { + fds->revents = POLLNVAL; + n++; + } else { + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fds->revents = fo_poll(fp, fds->events, + fp->f_cred, td); + if (fds->revents != 0) + n++; + } + } + } + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = n; + return (0); +} + +/* + * OpenBSD poll system call. + * XXX this isn't quite a true representation.. OpenBSD uses select ops. + */ +#ifndef _SYS_SYSPROTO_H_ +struct openbsd_poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +/* + * MPSAFE + */ +int +openbsd_poll(td, uap) + register struct thread *td; + register struct openbsd_poll_args *uap; +{ + return (poll(td, (struct poll_args *)uap)); +} + +/* + * Remove the references to the thread from all of the objects + * we were polling. + * + * This code assumes that the underlying owner of the selinfo + * structure will hold sellock before it changes it, and that + * it will unlink itself from our list if it goes away. + */ +void +clear_selinfo_list(td) + struct thread *td; +{ + struct selinfo *si; + + mtx_assert(&sellock, MA_OWNED); + TAILQ_FOREACH(si, &td->td_selq, si_thrlist) + si->si_thread = NULL; + TAILQ_INIT(&td->td_selq); +} + +/*ARGSUSED*/ +int +seltrue(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + + return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Record a select request. + */ +void +selrecord(selector, sip) + struct thread *selector; + struct selinfo *sip; +{ + + mtx_lock(&sellock); + /* + * If the thread is NULL then take ownership of selinfo + * however if the thread is not NULL and the thread points to + * someone else, then we have a collision, otherwise leave it alone + * as we've owned it in a previous selrecord on this selinfo. + */ + if (sip->si_thread == NULL) { + sip->si_thread = selector; + TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); + } else if (sip->si_thread != selector) { + sip->si_flags |= SI_COLL; + } + + mtx_unlock(&sellock); +} + +/* + * Do a wakeup when a selectable event occurs. 
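+ *
+ * Typical (hypothetical) usage: a driver's poll routine calls
+ * selrecord(td, &sc->sc_rsel) when no data is ready, and its interrupt
+ * handler later calls selwakeup(&sc->sc_rsel) when data arrives, waking
+ * any thread blocked in select() or poll() on that descriptor.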
+ */ +void +selwakeup(sip) + struct selinfo *sip; +{ + struct thread *td; + + mtx_lock(&sellock); + td = sip->si_thread; + if ((sip->si_flags & SI_COLL) != 0) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + cv_broadcast(&selwait); + } + if (td == NULL) { + mtx_unlock(&sellock); + return; + } + TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); + sip->si_thread = NULL; + mtx_lock_spin(&sched_lock); + if (td->td_wchan == (caddr_t)&selwait) { + if (td->td_proc->p_stat == SSLEEP) + setrunnable(td); + else + cv_waitq_remove(td); + } else + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); +} + +static void selectinit(void *); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) + +/* ARGSUSED*/ +static void +selectinit(dummy) + void *dummy; +{ + cv_init(&selwait, "select"); + mtx_init(&sellock, "sellck", NULL, MTX_DEF); +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..11ab6d1 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1427 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $FreeBSD$ + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. 
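+ *
+ * In other words, a single write(2) of at least PIPE_MINDIRECT bytes on
+ * a blocking descriptor is a candidate for the direct path (subject to
+ * the pipe kva limits defined below), while smaller writes and
+ * non-blocking writes always go through the kernel buffer.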
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/signalvar.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/event.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int pipe_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int pipe_close(struct file *fp, struct thread *td); +static int pipe_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int pipe_kqfilter(struct file *fp, struct knote *kn); +static int pipe_stat(struct file *fp, struct stat *sb, struct thread *td); +static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td); + +static struct fileops pipeops = { + pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, + pipe_stat, pipe_close +}; + +static void filt_pipedetach(struct knote *kn); +static int filt_piperead(struct knote *kn, long hint); +static int filt_pipewrite(struct knote *kn, long hint); + +static struct filterops pipe_rfiltops = + { 1, NULL, filt_pipedetach, filt_piperead }; +static struct filterops pipe_wfiltops = + { 1, NULL, filt_pipedetach, filt_pipewrite }; + +#define PIPE_GET_GIANT(pipe) \ + do { \ + KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0, \ + ("%s:%d PIPE_GET_GIANT: line pipe not locked", \ + __FILE__, __LINE__)); \ + PIPE_UNLOCK(pipe); \ + mtx_lock(&Giant); \ + } while (0) + +#define PIPE_DROP_GIANT(pipe) \ + do { \ + mtx_unlock(&Giant); \ + PIPE_LOCK(pipe); \ + } while (0) + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. 
+ */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +static int nbigpipe; + +static int amountpipekva; + +static void pipeinit(void *dummy __unused); +static void pipeclose(struct pipe *cpipe); +static void pipe_free_kmem(struct pipe *cpipe); +static int pipe_create(struct pipe **cpipep); +static __inline int pipelock(struct pipe *cpipe, int catch); +static __inline void pipeunlock(struct pipe *cpipe); +static __inline void pipeselwakeup(struct pipe *cpipe); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); +static void pipe_destroy_write_buffer(struct pipe *wpipe); +static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); +static void pipe_clone_write_buffer(struct pipe *wpipe); +#endif +static int pipespace(struct pipe *cpipe, int size); + +static uma_zone_t pipe_zone; + +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); + +static void +pipeinit(void *dummy __unused) +{ + pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(td, uap) + struct thread *td; + struct pipe_args /* { + int dummy; + } */ *uap; +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + struct mtx *pmtx; + int fd, error; + + KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); + + pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO); + + rpipe = wpipe = NULL; + if (pipe_create(&rpipe) || pipe_create(&wpipe)) { + pipeclose(rpipe); + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (ENFILE); + } + + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(td, &rf, &fd); + if (error) { + pipeclose(rpipe); + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (error); + } + fhold(rf); + td->td_retval[0] = fd; + + /* + * Warning: once we've gotten past allocation of the fd for the + * read-side, we can only drop the read side via fdrop() in order + * to avoid races against processes which manage to dup() the read + * side while we are blocked trying to allocate the write side. + */ + FILE_LOCK(rf); + rf->f_flag = FREAD | FWRITE; + rf->f_type = DTYPE_PIPE; + rf->f_data = (caddr_t)rpipe; + rf->f_ops = &pipeops; + FILE_UNLOCK(rf); + error = falloc(td, &wf, &fd); + if (error) { + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[td->td_retval[0]] == rf) { + fdp->fd_ofiles[td->td_retval[0]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(rf, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(rf, td); + /* rpipe has been closed by fdrop(). */ + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (error); + } + FILE_LOCK(wf); + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_data = (caddr_t)wpipe; + wf->f_ops = &pipeops; + FILE_UNLOCK(wf); + td->td_retval[1] = fd; + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); + rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; + fdrop(rf, td); + + return (0); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + * This routine will 'realloc' the size of a pipe safely, if it fails + * it will retain the old buffer. + * If it fails it will return ENOMEM. 
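+ * On success the old buffer (if any) is released via pipe_free_kmem(),
+ * the new object is mapped pageable into kernel_map, and amountpipekva
+ * is adjusted to account for the new size.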
+ */ +static int +pipespace(cpipe, size) + struct pipe *cpipe; + int size; +{ + struct vm_object *object; + caddr_t buffer; + int npages, error; + + GIANT_REQUIRED; + KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), + ("pipespace: pipe mutex locked")); + + npages = round_page(size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + object = vm_object_allocate(OBJT_DEFAULT, npages); + buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + error = vm_map_find(kernel_map, object, 0, + (vm_offset_t *) &buffer, size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) { + vm_object_deallocate(object); + return (ENOMEM); + } + + /* free old resources if we're resizing */ + pipe_free_kmem(cpipe); + cpipe->pipe_buffer.object = object; + cpipe->pipe_buffer.buffer = buffer; + cpipe->pipe_buffer.size = size; + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + amountpipekva += cpipe->pipe_buffer.size; + return (0); +} + +/* + * initialize and allocate VM and memory for pipe + */ +static int +pipe_create(cpipep) + struct pipe **cpipep; +{ + struct pipe *cpipe; + int error; + + *cpipep = uma_zalloc(pipe_zone, M_WAITOK); + if (*cpipep == NULL) + return (ENOMEM); + + cpipe = *cpipep; + + /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ + cpipe->pipe_buffer.object = NULL; +#ifndef PIPE_NODIRECT + cpipe->pipe_map.kva = NULL; +#endif + /* + * protect so pipeclose() doesn't follow a junk pointer + * if pipespace() fails. + */ + bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif + + cpipe->pipe_mtxp = NULL; /* avoid pipespace assertion */ + error = pipespace(cpipe, PIPE_SIZE); + if (error) + return (error); + + vfs_timestamp(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + + return (0); +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + while (cpipe->pipe_state & PIPE_LOCKFL) { + cpipe->pipe_state |= PIPE_LWANT; + error = msleep(cpipe, PIPE_MTX(cpipe), + catch ? 
(PRIBIO | PCATCH) : PRIBIO, + "pipelk", 0); + if (error != 0) + return (error); + } + cpipe->pipe_state |= PIPE_LOCKFL; + return (0); +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + cpipe->pipe_state &= ~PIPE_LOCKFL; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + + if (cpipe->pipe_state & PIPE_SEL) { + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) + pgsigio(&cpipe->pipe_sigio, SIGIO, 0); + KNOTE(&cpipe->pipe_sel.si_note, 0); +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error; + int nread = 0; + u_int size; + + PIPE_LOCK(rpipe); + ++rpipe->pipe_busy; + error = pipelock(rpipe, 1); + if (error) + goto unlocked_error; + + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + PIPE_UNLOCK(rpipe); + error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + va = (caddr_t) rpipe->pipe_map.kva + + rpipe->pipe_map.pos; + PIPE_UNLOCK(rpipe); + error = uiomove(va, size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + * read returns 0 on EOF, no need to set error + */ + if (rpipe->pipe_state & PIPE_EOF) + break; + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + /* + * Break if some data was read. + */ + if (nread > 0) + break; + + /* + * Unlock the pipe buffer for our remaining processing. We + * will either break out with an error or we will sleep and + * relock to loop. + */ + pipeunlock(rpipe); + + /* + * Handle non-blocking mode operation or + * wait for more data. + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + } else { + rpipe->pipe_state |= PIPE_WANTR; + if ((error = msleep(rpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, + "piperd", 0)) == 0) + error = pipelock(rpipe, 1); + } + if (error) + goto unlocked_error; + } + } + pipeunlock(rpipe); + + /* XXX: should probably do this before getting any locks. 
*/ + if (error == 0) + vfs_timestamp(&rpipe->pipe_atime); +unlocked_error: + --rpipe->pipe_busy; + + /* + * PIPE_WANT processing only makes sense if pipe_busy is 0. + */ + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * Handle write blocking hysteresis. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + PIPE_UNLOCK(rpipe); + return (error); +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. + */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + GIANT_REQUIRED; + PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); + addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); + for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { + vm_page_t m; + + if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || + (paddr = pmap_extract(vmspace_pmap(curproc->p_vmspace), + addr)) == 0) { + int j; + + for (j = 0; j < i; j++) + vm_page_unwire(wpipe->pipe_map.ms[j], 1); + return (EFAULT); + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = + ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. + */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return (0); +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) + struct pipe *wpipe; +{ + int i; + + GIANT_REQUIRED; + PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); + + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i = 0; i < wpipe->pipe_map.npages; i++) + vm_page_unwire(wpipe->pipe_map.ms[i], 1); + wpipe->pipe_map.npages = 0; +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. 
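+ * The cloned data is left in the normal circular buffer (in = cnt,
+ * out = 0), PIPE_DIRECTW is cleared, and the sender's wired mapping is
+ * torn down via pipe_destroy_write_buffer(), so the reader then
+ * proceeds exactly as for an ordinary buffered write.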
+ */ +static void +pipe_clone_write_buffer(wpipe) + struct pipe *wpipe; +{ + int size; + int pos; + + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + PIPE_GET_GIANT(wpipe); + bcopy((caddr_t) wpipe->pipe_map.kva + pos, + (caddr_t) wpipe->pipe_buffer.buffer, size); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. + */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; + +retry: + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + pipelock(wpipe, 0); + PIPE_GET_GIANT(wpipe); + error = pipe_build_write_buffer(wpipe, uio); + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + PIPE_GET_GIANT(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, + "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + PIPE_GET_GIANT(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); + } + pipeunlock(wpipe); + return (error); + +error1: + wakeup(wpipe); + return (error); +} +#endif + +static int +pipe_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + int error = 0; + int orig_resid; + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + PIPE_UNLOCK(rpipe); + return (EPIPE); + } + ++wpipe->pipe_busy; + + /* + * If it is advantageous to resize the pipe buffer, do + * so. 
+ */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if ((error = pipelock(wpipe,1)) == 0) { + PIPE_GET_GIANT(wpipe); + if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) + nbigpipe++; + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + } + } + + /* + * If an early error occured unbusy and return, waking up any pending + * readers. + */ + if (error) { + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } + PIPE_UNLOCK(rpipe); + return(error); + } + + KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); + + orig_resid = uio->uio_resid; + + while (uio->uio_resid) { + int space; + +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + * + * The direct write mechanism will detect the reader going + * away on us. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) + break; + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. We break out if a signal occurs or the + * reader goes away. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, + "pipbww", 0); + if (wpipe->pipe_state & PIPE_EOF) + break; + if (error) + break; + } + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + if ((error = pipelock(wpipe,1)) == 0) { + int size; /* Transfer size */ + int segsize; /* first segment to transfer */ + + /* + * It is possible for a direct write to + * slip in on us... handle it here... + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + /* + * If a process blocked in uiomove, our + * value for space might be bad. + * + * XXX will we be ok if the reader has gone + * away here? + */ + if (space > wpipe->pipe_buffer.size - + wpipe->pipe_buffer.cnt) { + pipeunlock(wpipe); + goto retrywrite; + } + + /* + * Transfer size is minimum of uio transfer + * and free space in pipe buffer. + */ + if (space > uio->uio_resid) + size = uio->uio_resid; + else + size = space; + /* + * First segment to transfer is minimum of + * transfer size and contiguous space in + * pipe buffer. If first segment to transfer + * is less than the transfer size, we've got + * a wraparound in the buffer. 
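+					 * For example, with a 16K buffer, in == 15K and a 4K
+					 * transfer, segsize is 1K: the first uiomove() below
+					 * fills the tail of the buffer and the second copies
+					 * the remaining 3K to the start, leaving "in" at 3K.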
+ */ + segsize = wpipe->pipe_buffer.size - + wpipe->pipe_buffer.in; + if (segsize > size) + segsize = size; + + /* Transfer first segment */ + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + segsize, uio); + PIPE_LOCK(rpipe); + + if (error == 0 && segsize < size) { + /* + * Transfer remaining part now, to + * support atomic writes. Wraparound + * happened. + */ + if (wpipe->pipe_buffer.in + segsize != + wpipe->pipe_buffer.size) + panic("Expected pipe buffer wraparound disappeared"); + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[0], + size - segsize, uio); + PIPE_LOCK(rpipe); + } + if (error == 0) { + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= + wpipe->pipe_buffer.size) { + if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) + panic("Expected wraparound bad"); + wpipe->pipe_buffer.in = size - segsize; + } + + wpipe->pipe_buffer.cnt += size; + if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) + panic("Pipe buffer overflow"); + + } + pipeunlock(wpipe); + } + if (error) + break; + + } else { + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, "pipewr", 0); + if (error != 0) + break; + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + + if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) { + error = 0; + } + + if (error == 0) + vfs_timestamp(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select/poll. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + PIPE_UNLOCK(rpipe); + return (error); +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. + */ +int +pipe_ioctl(fp, cmd, data, td) + struct file *fp; + u_long cmd; + caddr_t data; + struct thread *td; +{ + struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + PIPE_LOCK(mpipe); + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + PIPE_UNLOCK(mpipe); + return (0); + + case FIONREAD: + PIPE_LOCK(mpipe); + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + PIPE_UNLOCK(mpipe); + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &mpipe->pipe_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(mpipe->pipe_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. 
*/ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(mpipe->pipe_sigio); + return (0); + + } + return (ENOTTY); +} + +int +pipe_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + int revents = 0; + + wpipe = rpipe->pipe_peer; + PIPE_LOCK(rpipe); + if (events & (POLLIN | POLLRDNORM)) + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) + revents |= events & (POLLOUT | POLLWRNORM); + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) { + selrecord(td, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + } + } + PIPE_UNLOCK(rpipe); + + return (revents); +} + +/* + * We shouldn't need locks here as we're doing a read and this should + * be a natural race. + */ +static int +pipe_stat(fp, ub, td) + struct file *fp; + struct stat *ub; + struct thread *td; +{ + struct pipe *pipe = (struct pipe *)fp->f_data; + + bzero((caddr_t)ub, sizeof(*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_atimespec = pipe->pipe_atime; + ub->st_mtimespec = pipe->pipe_mtime; + ub->st_ctimespec = pipe->pipe_ctime; + ub->st_uid = fp->f_cred->cr_uid; + ub->st_gid = fp->f_cred->cr_gid; + /* + * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return (0); +} + +/* ARGSUSED */ +static int +pipe_close(fp, td) + struct file *fp; + struct thread *td; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + fp->f_ops = &badfileops; + fp->f_data = NULL; + funsetown(&cpipe->pipe_sigio); + pipeclose(cpipe); + return (0); +} + +static void +pipe_free_kmem(cpipe) + struct pipe *cpipe; +{ + + GIANT_REQUIRED; + KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), + ("pipespace: pipe mutex locked")); + + if (cpipe->pipe_buffer.buffer != NULL) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + cpipe->pipe_buffer.buffer = NULL; + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva != NULL) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + } +#endif +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + int hadpeer; + + if (cpipe == NULL) + return; + + hadpeer = 0; + + /* partially created pipes won't have a valid mutex. 
*/ + if (PIPE_MTX(cpipe) != NULL) + PIPE_LOCK(cpipe); + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; + msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if ((ppipe = cpipe->pipe_peer) != NULL) { + hadpeer++; + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + KNOTE(&ppipe->pipe_sel.si_note, 0); + ppipe->pipe_peer = NULL; + } + /* + * free resources + */ + if (PIPE_MTX(cpipe) != NULL) { + PIPE_UNLOCK(cpipe); + if (!hadpeer) { + mtx_destroy(PIPE_MTX(cpipe)); + free(PIPE_MTX(cpipe), M_TEMP); + } + } + mtx_lock(&Giant); + pipe_free_kmem(cpipe); + uma_zfree(pipe_zone, cpipe); + mtx_unlock(&Giant); +} + +/*ARGSUSED*/ +static int +pipe_kqfilter(struct file *fp, struct knote *kn) +{ + struct pipe *cpipe; + + cpipe = (struct pipe *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &pipe_rfiltops; + break; + case EVFILT_WRITE: + kn->kn_fop = &pipe_wfiltops; + cpipe = cpipe->pipe_peer; + break; + default: + return (1); + } + kn->kn_hook = (caddr_t)cpipe; + + PIPE_LOCK(cpipe); + SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); + PIPE_UNLOCK(cpipe); + return (0); +} + +static void +filt_pipedetach(struct knote *kn) +{ + struct pipe *cpipe = (struct pipe *)kn->kn_hook; + + PIPE_LOCK(cpipe); + SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); + PIPE_UNLOCK(cpipe); +} + +/*ARGSUSED*/ +static int +filt_piperead(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + kn->kn_data = rpipe->pipe_buffer.cnt; + if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + kn->kn_data = rpipe->pipe_map.cnt; + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + PIPE_UNLOCK(rpipe); + return (kn->kn_data > 0); +} + +/*ARGSUSED*/ +static int +filt_pipewrite(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + if (wpipe->pipe_state & PIPE_DIRECTW) + kn->kn_data = 0; + + PIPE_UNLOCK(rpipe); + return (kn->kn_data >= PIPE_BUF); +} diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c new file mode 100644 index 0000000..dacb9d9 --- /dev/null +++ b/sys/kern/sys_process.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> +#include <sys/sx.h> +#include <sys/user.h> + +#include <machine/reg.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +/* + * Functions implemented using PROC_ACTION(): + * + * proc_read_regs(proc, regs) + * Get the current user-visible register set from the process + * and copy it into the regs structure (<machine/reg.h>). + * The process is stopped at the time read_regs is called. + * + * proc_write_regs(proc, regs) + * Update the current register set from the passed in regs + * structure. Take care to avoid clobbering special CPU + * registers or privileged bits in the PSL. + * Depending on the architecture this may have fix-up work to do, + * especially if the IAR or PCW are modified. + * The process is stopped at the time write_regs is called. + * + * proc_read_fpregs, proc_write_fpregs + * deal with the floating point register set, otherwise as above. + * + * proc_read_dbregs, proc_write_dbregs + * deal with the processor debug register set, otherwise as above. + * + * proc_sstep(proc) + * Arrange for the process to trap after executing a single instruction. + */ + +#define PROC_ACTION(action) do { \ + int error; \ + \ + mtx_lock_spin(&sched_lock); \ + if ((td->td_proc->p_sflag & PS_INMEM) == 0) \ + error = EIO; \ + else \ + error = (action); \ + mtx_unlock_spin(&sched_lock); \ + return (error); \ +} while(0) + +int +proc_read_regs(struct thread *td, struct reg *regs) +{ + + PROC_ACTION(fill_regs(td, regs)); +} + +int +proc_write_regs(struct thread *td, struct reg *regs) +{ + + PROC_ACTION(set_regs(td, regs)); +} + +int +proc_read_dbregs(struct thread *td, struct dbreg *dbregs) +{ + + PROC_ACTION(fill_dbregs(td, dbregs)); +} + +int +proc_write_dbregs(struct thread *td, struct dbreg *dbregs) +{ + + PROC_ACTION(set_dbregs(td, dbregs)); +} + +/* + * Ptrace doesn't support fpregs at all, and there are no security holes + * or translations for fpregs, so we can just copy them. 
+ */ +int +proc_read_fpregs(struct thread *td, struct fpreg *fpregs) +{ + + PROC_ACTION(fill_fpregs(td, fpregs)); +} + +int +proc_write_fpregs(struct thread *td, struct fpreg *fpregs) +{ + + PROC_ACTION(set_fpregs(td, fpregs)); +} + +int +proc_sstep(struct thread *td) +{ + + PROC_ACTION(ptrace_single_step(td)); +} + +int +proc_rwmem(struct proc *p, struct uio *uio) +{ + struct vmspace *vm; + vm_map_t map; + vm_object_t object = NULL; + vm_offset_t pageno = 0; /* page number */ + vm_prot_t reqprot; + vm_offset_t kva; + int error, writing; + + GIANT_REQUIRED; + + /* + * if the vmspace is in the midst of being deallocated or the + * process is exiting, don't try to grab anything. The page table + * usage in that process can be messed up. + */ + vm = p->p_vmspace; + if ((p->p_flag & P_WEXIT)) + return (EFAULT); + if (vm->vm_refcnt < 1) + return (EFAULT); + ++vm->vm_refcnt; + /* + * The map we want... + */ + map = &vm->vm_map; + + writing = uio->uio_rw == UIO_WRITE; + reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : + VM_PROT_READ; + + kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + + /* + * Only map in one page at a time. We don't have to, but it + * makes things easier. This way is trivial - right? + */ + do { + vm_map_t tmap; + vm_offset_t uva; + int page_offset; /* offset into page */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + u_int len; + vm_page_t m; + + object = NULL; + + uva = (vm_offset_t)uio->uio_offset; + + /* + * Get the page number of this segment. + */ + pageno = trunc_page(uva); + page_offset = uva - pageno; + + /* + * How many bytes to copy + */ + len = min(PAGE_SIZE - page_offset, uio->uio_resid); + + /* + * Fault the page on behalf of the process + */ + error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL); + if (error) { + error = EFAULT; + break; + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, + * and single_use aren't used. One would think the vm code + * would be a *bit* nicer... We use tmap because + * vm_map_lookup() can change the map argument. + */ + tmap = map; + error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, + &object, &pindex, &out_prot, &wired); + + if (error) { + error = EFAULT; + + /* + * Make sure that there is no residue in 'object' from + * an error return on vm_map_lookup. + */ + object = NULL; + + break; + } + + m = vm_page_lookup(object, pindex); + + /* Allow fallback to backing objects if we are reading */ + + while (m == NULL && !writing && object->backing_object) { + + pindex += OFF_TO_IDX(object->backing_object_offset); + object = object->backing_object; + + m = vm_page_lookup(object, pindex); + } + + if (m == NULL) { + error = EFAULT; + + /* + * Make sure that there is no residue in 'object' from + * an error return on vm_map_lookup. + */ + object = NULL; + + vm_map_lookup_done(tmap, out_entry); + + break; + } + + /* + * Wire the page into memory + */ + vm_page_wire(m); + + /* + * We're done with tmap now. + * But reference the object first, so that we won't loose + * it. + */ + vm_object_reference(object); + vm_map_lookup_done(tmap, out_entry); + + pmap_qenter(kva, &m, 1); + + /* + * Now do the i/o move. 
+ */ + error = uiomove((caddr_t)(kva + page_offset), len, uio); + + pmap_qremove(kva, 1); + + /* + * release the page and the object + */ + vm_page_unwire(m, 1); + vm_object_deallocate(object); + + object = NULL; + + } while (error == 0 && uio->uio_resid > 0); + + if (object) + vm_object_deallocate(object); + + kmem_free(kernel_map, kva, PAGE_SIZE); + vmspace_free(vm); + return (error); +} + +/* + * Process debugging system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ptrace_args { + int req; + pid_t pid; + caddr_t addr; + int data; +}; +#endif + +int +ptrace(struct thread *td, struct ptrace_args *uap) +{ + struct iovec iov; + struct uio uio; + /* + * XXX this obfuscation is to reduce stack usage, but the register + * structs may be too large to put on the stack anyway. + */ + union { + struct ptrace_io_desc piod; + struct dbreg dbreg; + struct fpreg fpreg; + struct reg reg; + } r; + struct proc *curp, *p, *pp; + struct thread *td2; + int error, write; + int proctree_locked = 0; + + curp = td->td_proc; + + /* + * Do copyin() early before getting locks and lock proctree before + * locking the process. + */ + switch (uap->req) { + case PT_TRACE_ME: + case PT_ATTACH: + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + sx_xlock(&proctree_lock); + proctree_locked = 1; + break; +#ifdef PT_SETREGS + case PT_SETREGS: + error = copyin(uap->addr, &r.reg, sizeof r.reg); + if (error) + return (error); + break; +#endif /* PT_SETREGS */ +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + error = copyin(uap->addr, &r.fpreg, sizeof r.fpreg); + if (error) + return (error); + break; +#endif /* PT_SETFPREGS */ +#ifdef PT_SETDBREGS + case PT_SETDBREGS: + error = copyin(uap->addr, &r.dbreg, sizeof r.dbreg); + if (error) + return (error); + break; +#endif /* PT_SETDBREGS */ + default: + break; + } + + write = 0; + if (uap->req == PT_TRACE_ME) { + p = td->td_proc; + PROC_LOCK(p); + } else { + if ((p = pfind(uap->pid)) == NULL) { + if (proctree_locked) + sx_xunlock(&proctree_lock); + return (ESRCH); + } + } + if (p_cansee(td, p)) { + error = ESRCH; + goto fail; + } + + if ((error = p_candebug(td, p)) != 0) + goto fail; + + /* + * System processes can't be debugged. + */ + if ((p->p_flag & P_SYSTEM) != 0) { + error = EINVAL; + goto fail; + } + + /* + * Permissions check + */ + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. */ + break; + + case PT_ATTACH: + /* Self */ + if (p->p_pid == td->td_proc->p_pid) { + error = EINVAL; + goto fail; + } + + /* Already traced */ + if (p->p_flag & P_TRACED) { + error = EBUSY; + goto fail; + } + + /* Can't trace an ancestor if you're being traced. */ + if (curp->p_flag & P_TRACED) { + for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { + if (pp == p) { + error = EINVAL; + goto fail; + } + } + } + + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_IO: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: + case PT_GETREGS: + case PT_SETREGS: + case PT_GETFPREGS: + case PT_SETFPREGS: + case PT_GETDBREGS: + case PT_SETDBREGS: + /* not being traced... 
*/ + if ((p->p_flag & P_TRACED) == 0) { + error = EPERM; + goto fail; + } + + /* not being traced by YOU */ + if (p->p_pptr != td->td_proc) { + error = EBUSY; + goto fail; + } + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) { + error = EBUSY; + goto fail; + } + + /* OK */ + break; + + default: + error = EINVAL; + goto fail; + } + + td2 = FIRST_THREAD_IN_PROC(p); +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(td2); /* XXXKSE */ +#endif + + /* + * Actually do the requests + */ + + td->td_retval[0] = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + return (0); + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != td->td_proc) + proc_reparent(p, td->td_proc); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + /* XXX uap->data is used even in the PT_STEP case. */ + if (uap->req != PT_STEP && (unsigned)uap->data > _SIG_MAXSIG) { + error = EINVAL; + goto fail; + } + + _PHOLD(p); + + if (uap->req == PT_STEP) { + error = ptrace_single_step(td2); + if (error) { + _PRELE(p); + goto fail; + } + } + + if (uap->addr != (caddr_t)1) { + fill_kinfo_proc(p, &p->p_uarea->u_kproc); + error = ptrace_set_pc(td2, + (u_long)(uintfptr_t)uap->addr); + if (error) { + _PRELE(p); + goto fail; + } + } + _PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + PROC_UNLOCK(p); + pp = pfind(p->p_oppid); + if (pp == NULL) + pp = initproc; + else + PROC_UNLOCK(pp); + PROC_LOCK(p); + proc_reparent(p, pp); + } + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + } + + sendsig: + if (proctree_locked) + sx_xunlock(&proctree_lock); + /* deliver or queue signal */ + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + mtx_lock_spin(&sched_lock); + setrunnable(td2); /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } else if (uap->data) + psignal(p, uap->data); + PROC_UNLOCK(p); + + return (0); + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + PROC_UNLOCK(p); + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : + (caddr_t)td->td_retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */ + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_td = td; + error = proc_rwmem(p, &uio); + if (uio.uio_resid != 0) { + /* + * XXX proc_rwmem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX proc_rwmem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? 
+ */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_IO: + error = copyin(uap->addr, &r.piod, sizeof r.piod); + if (error) + return (error); + iov.iov_base = r.piod.piod_addr; + iov.iov_len = r.piod.piod_len; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)r.piod.piod_offs; + uio.uio_resid = r.piod.piod_len; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_td = td; + switch (r.piod.piod_op) { + case PIOD_READ_D: + case PIOD_READ_I: + uio.uio_rw = UIO_READ; + break; + case PIOD_WRITE_D: + case PIOD_WRITE_I: + uio.uio_rw = UIO_WRITE; + break; + default: + return (EINVAL); + } + error = proc_rwmem(p, &uio); + r.piod.piod_len -= uio.uio_resid; + (void)copyout(&r.piod, uap->addr, sizeof r.piod); + return (error); + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + + case PT_SETREGS: + _PHOLD(p); + error = proc_write_regs(td2, &r.reg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETREGS: + _PHOLD(p); + error = proc_read_regs(td2, &r.reg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.reg, uap->addr, sizeof r.reg); + return (error); + + case PT_SETFPREGS: + _PHOLD(p); + error = proc_write_fpregs(td2, &r.fpreg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETFPREGS: + _PHOLD(p); + error = proc_read_fpregs(td2, &r.fpreg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.fpreg, uap->addr, sizeof r.fpreg); + return (error); + + case PT_SETDBREGS: + _PHOLD(p); + error = proc_write_dbregs(td2, &r.dbreg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETDBREGS: + _PHOLD(p); + error = proc_read_dbregs(td2, &r.dbreg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.dbreg, uap->addr, sizeof r.dbreg); + return (error); + + default: + KASSERT(0, ("unreachable code\n")); + break; + } + + KASSERT(0, ("unreachable code\n")); + return (0); + +fail: + PROC_UNLOCK(p); + if (proctree_locked) + sx_xunlock(&proctree_lock); + return (error); +} + +/* + * Stop a process because of a debugging event; + * stay stopped until p->p_step is cleared + * (cleared by PIOCCONT in procfs). + */ +void +stopevent(struct proc *p, unsigned int event, unsigned int val) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED); + p->p_step = 1; + + do { + p->p_xstat = val; + p->p_stype = event; /* Which event caused the stop? */ + wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ + msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); + } while (p->p_step); +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 0000000..c8a6198 --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/filedesc.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/route.h> + +struct fileops socketops = { + soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, + soo_stat, soo_close +}; + +/* ARGSUSED */ +int +soo_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct socket *so = (struct socket *)fp->f_data; + int error; + + mtx_lock(&Giant); + error = so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); + mtx_unlock(&Giant); + return (error); +} + +/* ARGSUSED */ +int +soo_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct socket *so = (struct socket *)fp->f_data; + int error; + + mtx_lock(&Giant); + error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, + uio->uio_td); + mtx_unlock(&Giant); + return (error); +} + +int +soo_ioctl(fp, cmd, data, td) + struct file *fp; + u_long cmd; + register caddr_t data; + struct thread *td; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; + } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &so->so_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(so->so_sigio); + return (0); + + case SIOCSPGRP: + return (fsetown(-(*(int *)data), &so->so_sigio)); + + case SIOCGPGRP: + *(int *)data = -fgetown(so->so_sigio); + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific 
ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, td)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, td)); +} + +int +soo_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, td); +} + +int +soo_stat(fp, ub, td) + struct file *fp; + struct stat *ub; + struct thread *td; +{ + struct socket *so = (struct socket *)fp->f_data; + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + /* + * If SS_CANTRCVMORE is set, but there's still data left in the + * receive buffer, the socket is still readable. + */ + if ((so->so_state & SS_CANTRCVMORE) == 0 || + so->so_rcv.sb_cc != 0) + ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; + if ((so->so_state & SS_CANTSENDMORE) == 0) + ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; + ub->st_size = so->so_rcv.sb_cc; + ub->st_uid = so->so_cred->cr_uid; + ub->st_gid = so->so_cred->cr_gid; + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); +} + +/* + * API socket close on file pointer. We call soclose() to close the + * socket (including initiating closing protocols). soclose() will + * sorele() the file reference but the actual socket will not go away + * until the socket's ref count hits 0. + */ +/* ARGSUSED */ +int +soo_close(fp, td) + struct file *fp; + struct thread *td; +{ + int error = 0; + struct socket *so; + + so = (struct socket *)fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = 0; + + if (so) + error = soclose(so); + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 0000000..8b092fc --- /dev/null +++ b/sys/kern/syscalls.c @@ -0,0 +1,403 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. 
+ * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "old.creat", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "obs_execv", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "old.lseek", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "old.stat", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "old.lstat", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ + "ktrace", /* 45 = ktrace */ + "old.sigaction", /* 46 = old sigaction */ + "getgid", /* 47 = getgid */ + "old.sigprocmask", /* 48 = old sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "old.sigpending", /* 52 = old sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "old.mmap", /* 71 = old mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ + "setitimer", /* 83 = setitimer */ + "old.wait", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "old.accept", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ + "osigreturn", /* 103 = 
osigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ + "old.sigsuspend", /* 111 = old sigsuspend */ + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ + "#119", /* 119 = resuba */ + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ + "rename", /* 128 = rename */ + "old.truncate", /* 129 = old truncate */ + "old.ftruncate", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ + "nfssvc", /* 155 = nfssvc */ + "old.getdirentries", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ + "getfh", /* 161 = getfh */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "pread", /* 173 = pread */ + "pwrite", /* 174 = pwrite */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + "#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + "seteuid", /* 183 = seteuid */ + "#184", /* 184 = lfs_bmapv */ + "#185", /* 185 = lfs_markv */ + "#186", /* 186 = lfs_segclean */ + "#187", /* 187 = lfs_segwait */ + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193", /* 193 = nosys */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 
204 = munlock */ + "undelete", /* 205 = undelete */ + "futimes", /* 206 = futimes */ + "getpgid", /* 207 = getpgid */ + "#208", /* 208 = newreboot */ + "poll", /* 209 = poll */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "#223", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ + "shmat", /* 228 = shmat */ + "shmctl", /* 229 = shmctl */ + "shmdt", /* 230 = shmdt */ + "shmget", /* 231 = shmget */ + "clock_gettime", /* 232 = clock_gettime */ + "clock_settime", /* 233 = clock_settime */ + "clock_getres", /* 234 = clock_getres */ + "#235", /* 235 = timer_create */ + "#236", /* 236 = timer_delete */ + "#237", /* 237 = timer_settime */ + "#238", /* 238 = timer_gettime */ + "#239", /* 239 = timer_getoverrun */ + "nanosleep", /* 240 = nanosleep */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys */ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ + "openbsd_poll", /* 252 = openbsd_poll */ + "issetugid", /* 253 = issetugid */ + "lchown", /* 254 = lchown */ + "#255", /* 255 = nosys */ + "#256", /* 256 = nosys */ + "#257", /* 257 = nosys */ + "#258", /* 258 = nosys */ + "#259", /* 259 = nosys */ + "#260", /* 260 = nosys */ + "#261", /* 261 = nosys */ + "#262", /* 262 = nosys */ + "#263", /* 263 = nosys */ + "#264", /* 264 = nosys */ + "#265", /* 265 = nosys */ + "#266", /* 266 = nosys */ + "#267", /* 267 = nosys */ + "#268", /* 268 = nosys */ + "#269", /* 269 = nosys */ + "#270", /* 270 = nosys */ + "#271", /* 271 = nosys */ + "getdents", /* 272 = getdents */ + "#273", /* 273 = nosys */ + "lchmod", /* 274 = lchmod */ + "netbsd_lchown", /* 275 = netbsd_lchown */ + "lutimes", /* 276 = lutimes */ + "netbsd_msync", /* 277 = netbsd_msync */ + "nstat", /* 278 = nstat */ + "nfstat", /* 279 = nfstat */ + "nlstat", /* 280 = nlstat */ + "#281", /* 281 = nosys */ + "#282", /* 282 = nosys */ + "#283", /* 283 = nosys */ + "#284", /* 284 = nosys */ + "#285", /* 285 = nosys */ + "#286", /* 286 = nosys */ + "#287", /* 287 = nosys */ + "#288", /* 288 = nosys */ + "#289", /* 289 = nosys */ + "#290", /* 290 = nosys */ + "#291", /* 291 = nosys */ + "#292", /* 292 = nosys */ + "#293", /* 293 = nosys */ + "#294", /* 294 = nosys */ + "#295", /* 295 = nosys */ + "#296", /* 296 = nosys */ + "fhstatfs", /* 297 = fhstatfs */ + "fhopen", /* 298 = fhopen */ + "fhstat", /* 299 = fhstat */ + "modnext", /* 300 = modnext */ + "modstat", /* 301 = modstat */ + "modfnext", /* 302 = modfnext */ + "modfind", /* 303 = modfind */ + "kldload", /* 304 = kldload */ + "kldunload", /* 305 = kldunload */ + "kldfind", /* 306 = kldfind */ + "kldnext", /* 307 = kldnext */ + "kldstat", /* 308 = kldstat */ + "kldfirstmod", /* 309 = kldfirstmod */ + "getsid", /* 310 = getsid */ + "setresuid", /* 311 = setresuid */ + "setresgid", /* 312 = setresgid */ + "obs_signanosleep", /* 313 = obsolete signanosleep */ + "aio_return", /* 314 = aio_return */ + 
"aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ + "yield", /* 321 = yield */ + "obs_thr_sleep", /* 322 = obsolete thr_sleep */ + "obs_thr_wakeup", /* 323 = obsolete thr_wakeup */ + "mlockall", /* 324 = mlockall */ + "munlockall", /* 325 = munlockall */ + "__getcwd", /* 326 = __getcwd */ + "sched_setparam", /* 327 = sched_setparam */ + "sched_getparam", /* 328 = sched_getparam */ + "sched_setscheduler", /* 329 = sched_setscheduler */ + "sched_getscheduler", /* 330 = sched_getscheduler */ + "sched_yield", /* 331 = sched_yield */ + "sched_get_priority_max", /* 332 = sched_get_priority_max */ + "sched_get_priority_min", /* 333 = sched_get_priority_min */ + "sched_rr_get_interval", /* 334 = sched_rr_get_interval */ + "utrace", /* 335 = utrace */ + "sendfile", /* 336 = sendfile */ + "kldsym", /* 337 = kldsym */ + "jail", /* 338 = jail */ + "#339", /* 339 = pioctl */ + "sigprocmask", /* 340 = sigprocmask */ + "sigsuspend", /* 341 = sigsuspend */ + "sigaction", /* 342 = sigaction */ + "sigpending", /* 343 = sigpending */ + "sigreturn", /* 344 = sigreturn */ + "#345", /* 345 = sigtimedwait */ + "#346", /* 346 = sigwaitinfo */ + "__acl_get_file", /* 347 = __acl_get_file */ + "__acl_set_file", /* 348 = __acl_set_file */ + "__acl_get_fd", /* 349 = __acl_get_fd */ + "__acl_set_fd", /* 350 = __acl_set_fd */ + "__acl_delete_file", /* 351 = __acl_delete_file */ + "__acl_delete_fd", /* 352 = __acl_delete_fd */ + "__acl_aclcheck_file", /* 353 = __acl_aclcheck_file */ + "__acl_aclcheck_fd", /* 354 = __acl_aclcheck_fd */ + "extattrctl", /* 355 = extattrctl */ + "extattr_set_file", /* 356 = extattr_set_file */ + "extattr_get_file", /* 357 = extattr_get_file */ + "extattr_delete_file", /* 358 = extattr_delete_file */ + "aio_waitcomplete", /* 359 = aio_waitcomplete */ + "getresuid", /* 360 = getresuid */ + "getresgid", /* 361 = getresgid */ + "kqueue", /* 362 = kqueue */ + "kevent", /* 363 = kevent */ + "#364", /* 364 = __cap_get_proc */ + "#365", /* 365 = __cap_set_proc */ + "#366", /* 366 = __cap_get_fd */ + "#367", /* 367 = __cap_get_file */ + "#368", /* 368 = __cap_set_fd */ + "#369", /* 369 = __cap_set_file */ + "lkmressys", /* 370 = lkmressys */ + "extattr_set_fd", /* 371 = extattr_set_fd */ + "extattr_get_fd", /* 372 = extattr_get_fd */ + "extattr_delete_fd", /* 373 = extattr_delete_fd */ + "__setugid", /* 374 = __setugid */ + "nfsclnt", /* 375 = nfsclnt */ + "eaccess", /* 376 = eaccess */ + "#377", /* 377 = afs_syscall */ + "nmount", /* 378 = nmount */ + "kse_exit", /* 379 = kse_exit */ + "kse_wakeup", /* 380 = kse_wakeup */ + "kse_new", /* 381 = kse_new */ + "thread_wakeup", /* 382 = thread_wakeup */ + "kse_yield", /* 383 = kse_yield */ + "#384", /* 384 = __mac_get_proc */ + "#385", /* 385 = __mac_set_proc */ + "#386", /* 386 = __mac_get_fd */ + "#387", /* 387 = __mac_get_file */ + "#388", /* 388 = __mac_set_fd */ + "#389", /* 389 = __mac_set_file */ + "kenv", /* 390 = kenv */ + "lchflags", /* 391 = lchflags */ + "uuidgen", /* 392 = uuidgen */ +}; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 0000000..d8115fb --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,565 @@ + $FreeBSD$ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 +; +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. 
+ +; Columns: number [M]type nargs namespc name alt{name,tag,rtyp}/comments +; number system call number, must be in order +; type one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA, +; [M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL, +; [M]NOSTD +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + +; types: +; [M] e.g. like MSTD -- means the system call is MP-safe. If no +; M prefix is used, the syscall wrapper will obtain the Giant +; lock for the syscall. +; STD always included +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only +; NOSTD implemented but as a lkm that can be statically +; compiled in sysent entry will be filled with lkmsys +; so the SYSCALL_MODULE macro works + +; #ifdef's, etc. may be included, and are copied to the output files. + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. + +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 MSTD NOHIDE { void sys_exit(int rval); } exit sys_exit_args void +2 MSTD POSIX { int fork(void); } +3 MSTD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); } +4 MSTD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. +6 MSTD POSIX { int close(int fd); } +7 MSTD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 MSTD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 MSTD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ + caddr_t data); } +; XXX `path' should have type `const char *' but we're not ready for that. 
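+; (Illustrative sketch, not from the original master file: makesyscalls.sh
+; expands each row into the generated sources, so an entry such as
+;	20	MSTD	POSIX	{ pid_t getpid(void); }
+; is expected to yield "getpid" in syscallnames[] (syscalls.c), a
+; "#define	SYS_getpid	20" line in syscall.h, a getpid_args struct and
+; prototype in sysproto.h, and a sysent slot whose wrapper skips the Giant
+; lock because of the M prefix.)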
+22 STD BSD { int unmount(char *path, int flags); } +23 MSTD POSIX { int setuid(uid_t uid); } +24 MSTD POSIX { uid_t getuid(void); } +25 MSTD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ + int data); } +27 MSTD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 MSTD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 MSTD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +30 MSTD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 MSTD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 MSTD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 MSTD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 MSTD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int pipe(void); } +43 MSTD POSIX { gid_t getegid(void); } +44 MSTD BSD { int profil(caddr_t samples, size_t size, \ + size_t offset, u_int scale); } +45 STD BSD { int ktrace(const char *fname, int ops, int facs, \ + int pid); } +46 MCOMPAT POSIX { int sigaction(int signum, struct osigaction *nsa, \ + struct osigaction *osa); } +47 MSTD POSIX { gid_t getgid(void); } +48 MCOMPAT POSIX { int sigprocmask(int how, osigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. +49 MSTD BSD { int getlogin(char *namebuf, u_int namelen); } +50 MSTD BSD { int setlogin(char *namebuf); } +51 MSTD BSD { int acct(char *path); } +52 MCOMPAT POSIX { int sigpending(void); } +53 MSTD BSD { int sigaltstack(stack_t *ss, stack_t *oss); } +54 MSTD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 MSTD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 MSTD POSIX { int execve(char *fname, char **argv, char **envv); } +60 MSTD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 MCOMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 MCOMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \ + int arg); } getkerninfo getkerninfo_args int +64 MCOMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(void *addr, size_t len, int flags); } +66 MSTD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 MSTD BSD { int sbrk(int incr); } +70 MSTD BSD { int sstk(int incr); } +71 MCOMPAT BSD { int mmap(void *addr, int len, int prot, \ + int flags, int fd, long pos); } +72 MSTD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 MSTD BSD { int munmap(void *addr, size_t len); } +74 MSTD BSD { int mprotect(const void *addr, size_t len, int prot); } +75 MSTD BSD { int madvise(void *addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 MSTD BSD { int mincore(const void *addr, size_t len, \ + char *vec); } +79 MSTD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 MSTD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 MSTD POSIX { int getpgrp(void); } +82 
MSTD POSIX { int setpgid(int pid, int pgid); } +83 MSTD BSD { int setitimer(u_int which, struct itimerval *itv, \ + struct itimerval *oitv); } +84 MCOMPAT BSD { int wait(void); } +85 MSTD BSD { int swapon(char *name); } +86 MSTD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 MCOMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 MCOMPAT BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 MSTD BSD { int getdtablesize(void); } +90 MSTD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 MSTD POSIX { int fcntl(int fd, int cmd, long arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. +93 MSTD BSD { int select(int nd, fd_set *in, fd_set *ou, \ + fd_set *ex, struct timeval *tv); } +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 MSTD BSD { int setpriority(int which, int who, int prio); } +97 MSTD BSD { int socket(int domain, int type, int protocol); } +98 MSTD BSD { int connect(int s, caddr_t name, int namelen); } +99 MCPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 MSTD BSD { int getpriority(int which, int who); } +101 MCOMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 MCOMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 MSTD BSD { int osigreturn(struct osigcontext *sigcntxp); } +104 MSTD BSD { int bind(int s, caddr_t name, int namelen); } +105 MSTD BSD { int setsockopt(int s, int level, int name, \ + caddr_t val, int valsize); } +106 MSTD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 MCOMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ + struct sigvec *osv); } +109 MCOMPAT BSD { int sigblock(int mask); } +110 MCOMPAT BSD { int sigsetmask(int mask); } +111 MCOMPAT POSIX { int sigsuspend(osigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. 
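+; (Hedged illustration, not in the original file: the old stub is in effect
+; "int osigsuspend(osigset_t mask)" -- the 32-bit mask itself is the
+; argument -- while the sigset_t flavour at entry 341 below is
+; "int sigsuspend(const sigset_t *sigmask)", so the compat handler has to
+; widen the int mask into a sigset_t before blocking.)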
+112 MCOMPAT BSD { int sigstack(struct sigstack *nss, \ + struct sigstack *oss); } +113 MCOMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 MCOMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 MSTD BSD { int gettimeofday(struct timeval *tp, \ + struct timezone *tzp); } +117 MSTD BSD { int getrusage(int who, struct rusage *rusage); } +118 MSTD BSD { int getsockopt(int s, int level, int name, \ + caddr_t val, int *avalsize); } +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 MSTD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 MSTD BSD { int writev(int fd, struct iovec *iovp, \ + u_int iovcnt); } +122 MSTD BSD { int settimeofday(struct timeval *tv, \ + struct timezone *tzp); } +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 MCPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 MSTD BSD { int setreuid(int ruid, int euid); } +127 MSTD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 MSTD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 MSTD BSD { int sendto(int s, caddr_t buf, size_t len, \ + int flags, caddr_t to, int tolen); } +134 MSTD BSD { int shutdown(int s, int how); } +135 MSTD BSD { int socketpair(int domain, int type, int protocol, \ + int *rsv); } +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 MSTD BSD { int adjtime(struct timeval *delta, \ + struct timeval *olddelta); } +141 MCOMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 MCOMPAT BSD { long gethostid(void); } +143 MCOMPAT BSD { int sethostid(long hostid); } +144 MCOMPAT BSD { int getrlimit(u_int which, struct orlimit *rlp); } +145 MCOMPAT BSD { int setrlimit(u_int which, struct orlimit *rlp); } +146 MCOMPAT BSD { int killpg(int pgid, int signum); } +147 MSTD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char *path, int cmd, int uid, \ + caddr_t arg); } +149 MCOMPAT BSD { int quota(void); } +150 MCPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system calls. (This includes various calls added for compatibity +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys +; 155 is initialized by the NFS code, if present. 
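+; (Illustrative note, assuming the stock module plumbing: NOSTD and NOIMPL
+; slots start out as placeholders and are patched when the owning module
+; initializes -- e.g. the MNOSTD msgsys entry at 170 is claimed via
+; SYSCALL_MODULE_HELPER(msgsys) in sys/kern/sysv_msg.c later in this change,
+; and nfssvc below is filled in the same way by the NFS code.)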
+155 MNOIMPL BSD { int nfssvc(int flag, caddr_t argp); } +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +161 STD BSD { int getfh(char *fname, struct fhandle *fhp); } +162 MSTD BSD { int getdomainname(char *domainname, int len); } +163 MSTD BSD { int setdomainname(char *domainname, int len); } +164 MSTD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 MSTD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +; 169 is initialized by the SYSVSEM code if present or loaded +169 MNOSTD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; 169 is initialized by the SYSVMSG code if present or loaded +; XXX should be { int semsys(int which, ...); } +170 MNOSTD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; 169 is initialized by the SYSVSHM code if present or loaded +; XXX should be { int msgsys(int which, ...); } +171 MNOSTD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 MSTD POSIX { ssize_t pread(int fd, void *buf, size_t nbyte, \ + int pad, off_t offset); } +174 MSTD POSIX { ssize_t pwrite(int fd, const void *buf, \ + size_t nbyte, int pad, off_t offset); } +175 UNIMPL NOHIDE nosys +176 MSTD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys + +; Syscalls 181-199 are used by/reserved for BSD +181 MSTD POSIX { int setgid(gid_t gid); } +182 MSTD BSD { int setegid(gid_t egid); } +183 MSTD BSD { int seteuid(uid_t euid); } +184 UNIMPL BSD lfs_bmapv +185 UNIMPL BSD lfs_markv +186 UNIMPL BSD lfs_segclean +187 UNIMPL BSD lfs_segwait +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 MSTD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 MSTD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 MSTD BSD { int getrlimit(u_int which, \ + struct rlimit *rlp); } \ + getrlimit __getrlimit_args int +195 MSTD BSD { int setrlimit(u_int which, \ + struct rlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +197 MSTD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ + int flags, int fd, int pad, off_t pos); } +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ + int whence); } +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 MSTD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; here allows to avoid one in libc/sys/Makefile.inc. 
+203 MSTD BSD { int mlock(const void *addr, size_t len); } +204 MSTD BSD { int munlock(const void *addr, size_t len); } +205 STD BSD { int undelete(char *path); } +206 STD BSD { int futimes(int fd, struct timeval *tptr); } +207 MSTD BSD { int getpgid(pid_t pid); } +208 UNIMPL NOHIDE newreboot (NetBSD) +209 MSTD BSD { int poll(struct pollfd *fds, u_int nfds, \ + int timeout); } + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; They are initialized by thier respective modules/sysinits +220 MNOSTD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 MNOSTD BSD { int semget(key_t key, int nsems, int semflg); } +222 MNOSTD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 UNIMPL NOHIDE semconfig +224 MNOSTD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 MNOSTD BSD { int msgget(key_t key, int msgflg); } +226 MNOSTD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 MNOSTD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 MNOSTD BSD { int shmat(int shmid, void *shmaddr, int shmflg); } +229 MNOSTD BSD { int shmctl(int shmid, int cmd, \ + struct shmid_ds *buf); } +230 MNOSTD BSD { int shmdt(void *shmaddr); } +231 MNOSTD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 MSTD POSIX { int clock_gettime(clockid_t clock_id, \ + struct timespec *tp); } +233 MSTD POSIX { int clock_settime(clockid_t clock_id, \ + const struct timespec *tp); } +234 MSTD POSIX { int clock_getres(clockid_t clock_id, \ + struct timespec *tp); } +235 UNIMPL NOHIDE timer_create +236 UNIMPL NOHIDE timer_delete +237 UNIMPL NOHIDE timer_settime +238 UNIMPL NOHIDE timer_gettime +239 UNIMPL NOHIDE timer_getoverrun +240 MSTD POSIX { int nanosleep(const struct timespec *rqtp, \ + struct timespec *rmtp); } +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 MSTD BSD { int minherit(void *addr, size_t len, int inherit); } +251 MSTD BSD { int rfork(int flags); } +252 MSTD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \ + int timeout); } +253 STD BSD { int issetugid(void); } +254 STD BSD { int lchown(char *path, int uid, int gid); } +255 UNIMPL NOHIDE nosys +256 UNIMPL NOHIDE nosys +257 UNIMPL NOHIDE nosys +258 UNIMPL NOHIDE nosys +259 UNIMPL NOHIDE nosys +260 UNIMPL NOHIDE nosys +261 UNIMPL NOHIDE nosys +262 UNIMPL NOHIDE nosys +263 UNIMPL NOHIDE nosys +264 UNIMPL NOHIDE nosys +265 UNIMPL NOHIDE nosys +266 UNIMPL NOHIDE nosys +267 UNIMPL NOHIDE nosys +268 UNIMPL NOHIDE nosys +269 UNIMPL NOHIDE nosys +270 UNIMPL NOHIDE nosys +271 UNIMPL NOHIDE nosys +272 STD BSD { int getdents(int fd, char *buf, size_t count); } +273 UNIMPL NOHIDE nosys +274 STD BSD { int lchmod(char *path, mode_t mode); } 
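+; (Illustrative note on the alt columns described in the header: when
+; altname/alttag/altrtyp are given, the row reuses an existing handler under
+; a different syscall name -- the NOPROTO entry at 275 below dispatches to
+; the existing lchown() with lchown_args, but appears as "netbsd_lchown" in
+; syscallnames[] and syscall.h.)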
+275 NOPROTO BSD { int lchown(char *path, uid_t uid, gid_t gid); } netbsd_lchown lchown_args int +276 STD BSD { int lutimes(char *path, struct timeval *tptr); } +277 MNOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync msync_args int +278 STD BSD { int nstat(char *path, struct nstat *ub); } +279 MSTD BSD { int nfstat(int fd, struct nstat *sb); } +280 STD BSD { int nlstat(char *path, struct nstat *ub); } +281 UNIMPL NOHIDE nosys +282 UNIMPL NOHIDE nosys +283 UNIMPL NOHIDE nosys +284 UNIMPL NOHIDE nosys +285 UNIMPL NOHIDE nosys +286 UNIMPL NOHIDE nosys +287 UNIMPL NOHIDE nosys +288 UNIMPL NOHIDE nosys +289 UNIMPL NOHIDE nosys +290 UNIMPL NOHIDE nosys +291 UNIMPL NOHIDE nosys +292 UNIMPL NOHIDE nosys +293 UNIMPL NOHIDE nosys +294 UNIMPL NOHIDE nosys +295 UNIMPL NOHIDE nosys +296 UNIMPL NOHIDE nosys +; XXX 297 is 300 in NetBSD +297 STD BSD { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } +298 STD BSD { int fhopen(const struct fhandle *u_fhp, int flags); } +299 STD BSD { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } +; syscall numbers for FreeBSD +300 MSTD BSD { int modnext(int modid); } +301 MSTD BSD { int modstat(int modid, struct module_stat* stat); } +302 MSTD BSD { int modfnext(int modid); } +303 MSTD BSD { int modfind(const char *name); } +304 MSTD BSD { int kldload(const char *file); } +305 MSTD BSD { int kldunload(int fileid); } +306 MSTD BSD { int kldfind(const char *file); } +307 MSTD BSD { int kldnext(int fileid); } +308 MSTD BSD { int kldstat(int fileid, struct kld_file_stat* stat); } +309 MSTD BSD { int kldfirstmod(int fileid); } +310 MSTD BSD { int getsid(pid_t pid); } +311 MSTD BSD { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } +312 MSTD BSD { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } +313 OBSOL NOHIDE signanosleep +314 NOSTD BSD { int aio_return(struct aiocb *aiocbp); } +315 NOSTD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } +316 NOSTD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); } +317 NOSTD BSD { int aio_error(struct aiocb *aiocbp); } +318 NOSTD BSD { int aio_read(struct aiocb *aiocbp); } +319 NOSTD BSD { int aio_write(struct aiocb *aiocbp); } +320 NOSTD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } +321 MSTD BSD { int yield(void); } +322 OBSOL NOHIDE thr_sleep +323 OBSOL NOHIDE thr_wakeup +324 MSTD BSD { int mlockall(int how); } +325 MSTD BSD { int munlockall(void); } +326 STD BSD { int __getcwd(u_char *buf, u_int buflen); } + +327 MSTD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); } +328 MSTD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); } + +329 MSTD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } +330 MSTD POSIX { int sched_getscheduler (pid_t pid); } + +331 MSTD POSIX { int sched_yield (void); } +332 MSTD POSIX { int sched_get_priority_max (int policy); } +333 MSTD POSIX { int sched_get_priority_min (int policy); } +334 MSTD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); } +335 STD BSD { int utrace(const void *addr, size_t len); } +336 MSTD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ + struct sf_hdtr *hdtr, off_t *sbytes, int flags); } +337 STD BSD { int kldsym(int fileid, int cmd, void *data); } +338 MSTD BSD { int jail(struct jail *jail); } +339 UNIMPL BSD pioctl +340 MSTD POSIX { int sigprocmask(int how, const sigset_t *set, \ + sigset_t *oset); } 
+341 MSTD POSIX { int sigsuspend(const sigset_t *sigmask); } +342 MSTD POSIX { int sigaction(int sig, const struct sigaction *act, \ + struct sigaction *oact); } +343 MSTD POSIX { int sigpending(sigset_t *set); } +344 MSTD BSD { int sigreturn(const struct __ucontext *sigcntxp); } +345 UNIMPL NOHIDE sigtimedwait +346 UNIMPL NOHIDE sigwaitinfo +347 MSTD BSD { int __acl_get_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +348 MSTD BSD { int __acl_set_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +349 MSTD BSD { int __acl_get_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +350 MSTD BSD { int __acl_set_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +351 MSTD BSD { int __acl_delete_file(const char *path, \ + acl_type_t type); } +352 MSTD BSD { int __acl_delete_fd(int filedes, acl_type_t type); } +353 MSTD BSD { int __acl_aclcheck_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +354 MSTD BSD { int __acl_aclcheck_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +355 STD BSD { int extattrctl(const char *path, int cmd, \ + const char *filename, int attrnamespace, \ + const char *attrname); } +356 STD BSD { int extattr_set_file(const char *path, \ + int attrnamespace, const char *attrname, \ + void *data, size_t nbytes); } +357 STD BSD { ssize_t extattr_get_file(const char *path, \ + int attrnamespace, const char *attrname, \ + void *data, size_t nbytes); } +358 STD BSD { int extattr_delete_file(const char *path, \ + int attrnamespace, const char *attrname); } +359 NOSTD BSD { int aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } +360 MSTD BSD { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } +361 MSTD BSD { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } +362 MSTD BSD { int kqueue(void); } +363 MSTD BSD { int kevent(int fd, \ + const struct kevent *changelist, int nchanges, \ + struct kevent *eventlist, int nevents, \ + const struct timespec *timeout); } +364 UNIMPL BSD __cap_get_proc +365 UNIMPL BSD __cap_set_proc +366 UNIMPL BSD __cap_get_fd +367 UNIMPL BSD __cap_get_file +368 UNIMPL BSD __cap_set_fd +369 UNIMPL BSD __cap_set_file +370 NODEF NOHIDE lkmressys lkmressys nosys_args int +371 STD BSD { int extattr_set_fd(int fd, int attrnamespace, \ + const char *attrname, void *data, \ + size_t nbytes); } +372 STD BSD { ssize_t extattr_get_fd(int fd, int attrnamespace, \ + const char *attrname, void *data, size_t nbytes); } +373 STD BSD { int extattr_delete_fd(int fd, int attrnamespace, \ + const char *attrname); } +374 MSTD BSD { int __setugid(int flag); } +375 NOIMPL BSD { int nfsclnt(int flag, caddr_t argp); } +376 STD BSD { int eaccess(char *path, int flags); } +377 UNIMPL BSD afs_syscall +378 STD BSD { int nmount(struct iovec *iovp, unsigned int iovcnt, \ + int flags); } +379 STD BSD { int kse_exit(void); } +380 STD BSD { int kse_wakeup(void); } +381 STD BSD { int kse_new(struct kse_mailbox * mbx, \ + int new_grp_flag); } +382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); } +383 STD BSD { int kse_yield(void); } +384 UNIMPL BSD __mac_get_proc +385 UNIMPL BSD __mac_set_proc +386 UNIMPL BSD __mac_get_fd +387 UNIMPL BSD __mac_get_file +388 UNIMPL BSD __mac_set_fd +389 UNIMPL BSD __mac_set_file +390 STD BSD { int kenv(int what, const char *name, char *value, \ + int len); } +391 STD BSD { int lchflags(const char *path, int flags); } +392 STD BSD { int uuidgen(struct uuid *store, int count); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 
0000000..fc5fd8f --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,97 @@ +/* $FreeBSD$ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/ipc.h> +#include <sys/proc.h> +#include <sys/ucred.h> + +void (*shmfork_hook)(struct proc *, struct proc *) = NULL; +void (*shmexit_hook)(struct proc *) = NULL; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + + if (shmfork_hook != NULL) + shmfork_hook(p1, p2); + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + + if (shmexit_hook != NULL) + shmexit_hook(p); + return; +} + +/* + * Check for ipc permission + */ + +int +ipcperm(td, perm, mode) + struct thread *td; + struct ipc_perm *perm; + int mode; +{ + struct ucred *cred = td->td_ucred; + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (suser(td) == 0 ? 0 : EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode || + suser(td) == 0 ? 0 : EACCES); +} diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..4dd2249 --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1240 @@ +/* $FreeBSD$ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. 
+ * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/msg.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/jail.h> + +static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); + +static void msginit(void); +static int msgunload(void); +static int sysvmsg_modload(struct module *, int, void *); + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +static void msg_freehdr(struct msg *msghdr); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +struct msg { + struct msg *msg_next; /* next msg in the chain */ + long msg_type; /* type of this message */ + /* >0 -> type of this message */ + /* 0 -> free header */ + u_short msg_ts; /* size of this message */ + short msg_spot; /* location of start of msg in buffer */ +}; + + +#ifndef MSGSSZ +#define MSGSSZ 8 /* Each segment must be 2^N long */ +#endif +#ifndef MSGSEG +#define MSGSEG 2048 /* must be less than 32767 */ +#endif +#define MSGMAX (MSGSSZ*MSGSEG) +#ifndef MSGMNB +#define MSGMNB 2048 /* max # of bytes in a queue */ +#endif +#ifndef MSGMNI +#define MSGMNI 40 +#endif +#ifndef MSGTQL +#define MSGTQL 40 +#endif + +/* + * Based on the configuration parameters described in an SVR2 (yes, two) + * config(1m) man page. + * + * Each message is broken up and stored in segments that are msgssz bytes + * long. For efficiency reasons, this should be a power of two. Also, + * it doesn't make sense if it is less than 8 or greater than about 256. + * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of + * two between 8 and 1024 inclusive (and panic's if it isn't). + */ +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; + +/* + * macros to convert between msqid_ds's and msqid's. + * (specific to this implementation) + */ +#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) +#define MSQID_IX(id) ((id) & 0xffff) +#define MSQID_SEQ(id) (((id) >> 16) & 0xffff) + +/* + * The rest of this file is specific to this particular implementation. + */ + +struct msgmap { + short next; /* next segment in buffer */ + /* -1 -> available */ + /* 0..(MSGSEG-1) -> index of next segment */ +}; + +#define MSG_LOCKED 01000 /* Is this msqid_ds locked? 
*/ + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs;/* list of free msg headers */ +static char *msgpool; /* MSGMAX byte long msg buffer pool */ +static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +static struct msg *msghdrs; /* MSGTQL msg headers */ +static struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +static void +msginit() +{ + register int i; + + TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); + TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); + msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; + TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni); + + msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); + if (msgpool == NULL) + panic("msgpool is NULL"); + msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); + if (msgmaps == NULL) + panic("msgmaps is NULL"); + msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); + if (msghdrs == NULL) + panic("msghdrs is NULL"); + msqids = malloc(sizeof(struct msqid_ds) * msginfo.msgmni, M_MSG, M_WAITOK); + if (msqids == NULL) + panic("msqids is NULL"); + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... + */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + msqids[i].msg_perm.mode = 0; + } +} + +static int +msgunload() +{ + struct msqid_ds *msqptr; + int msqid; + + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 || + (msqptr->msg_perm.mode & MSG_LOCKED) != 0) + break; + } + if (msqid != msginfo.msgmni) + return (EBUSY); + + free(msgpool, M_MSG); + free(msgmaps, M_MSG); + free(msghdrs, M_MSG); + free(msqids, M_MSG); + return (0); +} + + +static int +sysvmsg_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + msginit(); + break; + case MOD_UNLOAD: + error = msgunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvmsg_mod = { + "sysvmsg", + &sysvmsg_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(msgsys); +SYSCALL_MODULE_HELPER(msgctl); +SYSCALL_MODULE_HELPER(msgget); +SYSCALL_MODULE_HELPER(msgsnd); +SYSCALL_MODULE_HELPER(msgrcv); + +DECLARE_MODULE(sysvmsg, sysvmsg_mod, + SI_SUB_SYSV_MSG, SI_ORDER_FIRST); +MODULE_VERSION(sysvmsg, 1); + +/* + * Entry point for all MSG calls + * + * MPSAFE + */ +int +msgsys(td, uap) + struct thread *td; + /* XXX actually varargs. */ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*msgcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +/* + * MPSAFE + */ +int +msgctl(td, uap) + struct thread *td; + register struct msgctl_args *uap; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + int rval, error; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + error = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M))) + goto done2; + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + 
msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M))) + goto done2; + if ((error = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + goto done2; + if (msqbuf.msg_qbytes > msqptr->msg_qbytes) { + error = suser(td); + if (error) + goto done2; + } + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + error = EINVAL; /* non-standard errno! */ + goto done2; + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time_second; + break; + + case IPC_STAT: + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + goto done2; + } + error = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + error = EINVAL; + goto done2; + } + + if (error == 0) + td->td_retval[0] = rval; +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgget(td, uap) + struct thread *td; + register struct msgget_args *uap; +{ + int msqid, error = 0; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = td->td_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + error = EEXIST; + goto done2; + } + if ((error = ipcperm(td, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + goto done2; + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + error = ENOSPC; + goto done2; + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time_second; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + error = ENOENT; + goto done2; + } + +found: + /* Construct the unique msqid */ + td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgsnd(td, uap) + struct thread *td; + register struct msgsnd_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, error = 0; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + goto done2; + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + error = EINVAL; + goto done2; + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no 
more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller doesn't want to wait\n"); +#endif + error = EAGAIN; + goto done2; + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request */ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, error=%d\n", error); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + error = EINTR; + goto done2; + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + error = EIDRM; + goto done2; + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); +#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((error = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", error); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + error = EINVAL; + goto done2; + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if 
((error = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying in message segment\n", error); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + goto done2; + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + error = EIDRM; + goto done2; + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = td->td_proc->p_pid; + msqptr->msg_stime = time_second; + + wakeup((caddr_t)msqptr); + td->td_retval[0] = 0; +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgrcv(td, uap) + struct thread *td; + register struct msgrcv_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int error = 0; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + goto done2; + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + error = E2BIG; + goto done2; + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? 
+ * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! + */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + error = E2BIG; + goto done2; + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? + */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ + error = ENOMSG; + goto done2; + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (error=%d)\n", error); +#endif + + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + error = EINTR; + goto done2; + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + error = EIDRM; + goto done2; + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = td->td_proc->p_pid; + msqptr->msg_rtime = time_second; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + error = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", error); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz - len > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz - len; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + error = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + error); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + td->td_retval[0] = msgsz; +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +sysctl_msqids(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, msqids, + sizeof(struct msqid_ds) * msginfo.msgmni)); +} + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RD, &msginfo.msgmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RD, &msginfo.msgmnb, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RD, &msginfo.msgtql, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RD, &msginfo.msgssz, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RD, &msginfo.msgseg, 0, "") +SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD, + NULL, 0, sysctl_msqids, "", "Message queue IDs"); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..af784b8 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,1193 @@ +/* $FreeBSD$ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sem.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/jail.h> + +static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); + +static void seminit(void); +static int sysvsem_modload(struct module *, int, void *); +static int semunload(void); +static void semexit_myhook(struct proc *p); +static int sysctl_sema(SYSCTL_HANDLER_ARGS); + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl(struct thread *td, struct __semctl_args *uap); +struct semget_args; +int semget(struct thread *td, struct semget_args *uap); +struct semop_args; +int semop(struct thread *td, struct semop_args *uap); +#endif + +static struct sem_undo *semu_alloc(struct thread *td); +static int semundo_adjust(struct thread *td, struct sem_undo **supptr, + int semid, int semnum, int adjval); +static void semundo_clear(int semid, int semnum); + +/* XXX casting to (sy_call_t *) is bogus, as usual. 
*/ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop +}; + +static int semtot = 0; +static struct semid_ds *sema; /* semaphore id pool */ +static struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +static int *semu; /* undo structure pool */ + +struct sem { + u_short semval; /* semaphore value */ + pid_t sempid; /* pid of last operation */ + u_short semncnt; /* # awaiting semval > cval */ + u_short semzcnt; /* # awaiting semval = 0 */ +}; + +/* + * Undo structure (one per process) + */ +struct sem_undo { + struct sem_undo *un_next; /* ptr to next active undo structure */ + struct proc *un_proc; /* owner of this structure */ + short un_cnt; /* # of active entries */ + struct undo { + short un_adjval; /* adjust on exit values */ + short un_num; /* semaphore # */ + int un_id; /* semid */ + } un_ent[1]; /* undo entries */ +}; + +/* + * Configuration parameters + */ +#ifndef SEMMNI +#define SEMMNI 10 /* # of semaphore identifiers */ +#endif +#ifndef SEMMNS +#define SEMMNS 60 /* # of semaphores in system */ +#endif +#ifndef SEMUME +#define SEMUME 10 /* max # of undo entries per process */ +#endif +#ifndef SEMMNU +#define SEMMNU 30 /* # of undo structures in system */ +#endif + +/* shouldn't need tuning */ +#ifndef SEMMAP +#define SEMMAP 30 /* # of entries in semaphore map */ +#endif +#ifndef SEMMSL +#define SEMMSL SEMMNS /* max # of semaphores per id */ +#endif +#ifndef SEMOPM +#define SEMOPM 100 /* max # of operations per semop call */ +#endif + +#define SEMVMX 32767 /* semaphore maximum value */ +#define SEMAEM 16384 /* adjust on exit max value */ + +/* + * Due to the way semaphore memory is allocated, we have to ensure that + * SEMUSZ is properly aligned. 
+ */ + +#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) + +/* actual size of an undo structure */ +#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) + +/* + * Macro to find a particular sem_undo vector + */ +#define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) + +/* + * semaphore info struct + */ +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RD, &seminfo.semmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RD, &seminfo.semmns, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RD, &seminfo.semmnu, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RD, &seminfo.semopm, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RD, &seminfo.semume, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RD, &seminfo.semusz, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD, + NULL, 0, sysctl_sema, "", ""); + +static void +seminit(void) +{ + register int i; + + TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); + TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); + TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); + TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); + TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl); + TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm); + TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume); + TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz); + TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx); + TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem); + + sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); + if (sem == NULL) + panic("sem is NULL"); + sema = malloc(sizeof(struct semid_ds) * seminfo.semmni, M_SEM, M_WAITOK); + if (sema == NULL) + panic("sema is NULL"); + semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; + at_exit(semexit_myhook); +} + +static int +semunload(void) +{ + + if (semtot != 0) + return (EBUSY); + + free(sem, M_SEM); + free(sema, M_SEM); + free(semu, M_SEM); + rm_at_exit(semexit_myhook); + return (0); +} + +static int +sysvsem_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + seminit(); + break; + case MOD_UNLOAD: + error = semunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvsem_mod = { + "sysvsem", + &sysvsem_modload, + NULL +}; + 
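[Editorial note, not part of the diff: the semsys/__semctl/semget/semop entry points registered just below implement the SVID semaphore API that userland sees. As a minimal illustrative sketch only — the key source path, permissions, and error handling here are arbitrary assumptions, not taken from this source — a process might drive these syscalls roughly like this:]

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	key_t key;
	int semid;
	struct sembuf op;

	/* Any agreed-upon key; ftok() arguments are illustrative. */
	key = ftok("/tmp", 'S');
	if (key == (key_t)-1) {
		perror("ftok");
		exit(1);
	}

	/* Create (or attach to) a set with one semaphore. */
	semid = semget(key, 1, IPC_CREAT | 0600);
	if (semid == -1) {
		perror("semget");
		exit(1);
	}

	/*
	 * "V" operation: increment semaphore 0.  SEM_UNDO asks the
	 * kernel to record a compensating adjustment so the value is
	 * rolled back if the process exits without releasing.
	 */
	op.sem_num = 0;
	op.sem_op = 1;
	op.sem_flg = SEM_UNDO;
	if (semop(semid, &op, 1) == -1)
		perror("semop");

	/* Remove the set when done. */
	if (semctl(semid, 0, IPC_RMID) == -1)
		perror("semctl");
	return (0);
}

[The SEM_UNDO flag in this sketch is what exercises the undo machinery in this file: semundo_adjust() records the per-process adjustment, and semexit_myhook() applies any outstanding adjustments when the process exits. End of editorial note.]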
+SYSCALL_MODULE_HELPER(semsys); +SYSCALL_MODULE_HELPER(__semctl); +SYSCALL_MODULE_HELPER(semget); +SYSCALL_MODULE_HELPER(semop); + +DECLARE_MODULE(sysvsem, sysvsem_mod, + SI_SUB_SYSV_SEM, SI_ORDER_FIRST); +MODULE_VERSION(sysvsem, 1); + +/* + * Entry point for all SEM calls + * + * MPSAFE + */ +int +semsys(td, uap) + struct thread *td; + /* XXX actually varargs. */ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*semcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(td) + struct thread *td; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. + * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = td->td_proc; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! + */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(td, supptr, semid, semnum, adjval) + register struct thread *td; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + struct proc *p = td->td_proc; + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(td); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). 
+ */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval != 0) { + adjval += sunptr->un_adjval; + if (adjval > seminfo.semaem || adjval < -seminfo.semaem) + return (ERANGE); + } + sunptr->un_adjval = adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (adjval > seminfo.semaem || adjval < -seminfo.semaem) + return (ERANGE); + if (suptr->un_cnt != seminfo.semume) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +/* + * MPSAFE + */ +int +__semctl(td, uap) + struct thread *td; + register struct __semctl_args *uap; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = td->td_ucred; + int i, rval, error; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + u_short usval; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + switch(cmd) { + case SEM_STAT: + if (semid < 0 || semid >= seminfo.semmni) + UGAR(EINVAL); + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ) + UGAR(EINVAL); + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + UGAR(error); + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + UGAR(error); + error = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + rval = IXSEQ_TO_IPCID(semid,semaptr->sem_perm); + if (error == 0) + td->td_retval[0] = rval; + goto done2; + } + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmni) { + error = EINVAL; + goto done2; + } + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EINVAL; + goto done2; + } + + error = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M))) + goto done2; + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + 
wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + if ((error = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) { + goto done2; + } + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time_second; + break; + + case IPC_STAT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + error = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + for (i = 0; i < semaptr->sem_nsems; i++) { + error = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (error != 0) + break; + } + break; + + case GETZCNT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + if (real_arg.val < 0 || real_arg.val > seminfo.semvmx) { + error = ERANGE; + goto done2; + } + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + for (i = 0; i < semaptr->sem_nsems; i++) { + error = copyin(&real_arg.array[i], + (caddr_t)&usval, sizeof(real_arg.array[0])); + if (error != 0) + break; + if (usval > seminfo.semvmx) { + error = ERANGE; + break; + } + semaptr->sem_base[i].semval = usval; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + error = EINVAL; + break; + } + + if (error == 0) + td->td_retval[0] = rval; +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +/* + * MPSAFE + */ +int +semget(td, uap) + struct thread *td; + register struct semget_args *uap; +{ + int semid, error = 0; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = td->td_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, 
semflg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + if (key != IPC_PRIVATE) { + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((error = ipcperm(td, &sema[semid].sem_perm, + semflg & 0700))) { + goto done2; + } + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + error = EINVAL; + goto done2; + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + error = EEXIST; + goto done2; + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + error = EINVAL; + goto done2; + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + error = ENOSPC; + goto done2; + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + error = ENOSPC; + goto done2; + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time_second; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + error = ENOENT; + goto done2; + } + +found: + td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + u_int nsops; +}; +#endif + +/* + * MPSAFE + */ +int +semop(td, uap) + struct thread *td; + register struct semop_args *uap; +{ + int semid = uap->semid; + u_int nsops = uap->nsops; + struct sembuf *sops = NULL; + register struct semid_ds *semaptr; + register struct sembuf *sopptr = 0; + register struct sem *semptr = 0; + struct sem_undo *suptr; + int i, j, error; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmni) { + error = EINVAL; + goto done2; + } + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) { + error = EINVAL; + goto done2; + } + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EINVAL; + goto 
done2; + } + if (nsops > seminfo.semopm) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, + nsops); +#endif + error = E2BIG; + goto done2; + } + + /* Allocate memory for sem_ops */ + sops = malloc(nsops * sizeof(sops[0]), M_SEM, M_WAITOK); + if (!sops) + panic("Failed to allocate %d sem_ops", nsops); + + if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("error = %d from copyin(%08x, %08x, %d)\n", error, + uap->sops, sops, nsops * sizeof(sops[0])); +#endif + goto done2; + } + + /* + * Initial pass thru sops to see what permissions are needed. + * Also perform any checks that don't need repeating on each + * attempt to satisfy the request vector. + */ + j = 0; /* permission needed */ + do_undos = 0; + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + if (sopptr->sem_num >= semaptr->sem_nsems) { + error = EFBIG; + goto done2; + } + if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) + do_undos = 1; + j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; + } + + if ((error = ipcperm(td, &semaptr->sem_perm, j))) { +#ifdef SEM_DEBUG + printf("error = %d from ipaccess\n", error); +#endif + goto done2; + } + + /* + * Loop trying to satisfy the vector of requests. + * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. + * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + for (;;) { + do_wakeup = 0; + error = 0; /* error return if necessary */ + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + } else if (sopptr->sem_op == 0) { + if (semptr->semval != 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else if (semptr->semval + sopptr->sem_op > + seminfo.semvmx) { + error = ERANGE; + break; + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* If we detected an error, return it */ + if (error != 0) + goto done2; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. 
+ */ + if (sopptr->sem_flg & IPC_NOWAIT) { + error = EAGAIN; + goto done2; + } + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + error = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (error=%d)!\n", error); +#endif + + if (error != 0) { + error = EINTR; + goto done2; + } +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EIDRM; + goto done2; + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + suptr = NULL; + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + error = semundo_adjust(td, &suptr, semid, + sops[i].sem_num, -adjval); + if (error == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. This guarantees that we won't run + * out of space as we roll things back out. + */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(td, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("error = %d from semundo_adjust\n", error); +#endif + goto done2; + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's and time */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = td->td_proc->p_pid; + } + semaptr->sem_otime = time_second; + + /* + * Do a wakeup if any semaphore was up'd whilst something was + * sleeping on it. + */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#endif + wakeup((caddr_t)semaptr); +#ifdef SEM_DEBUG + printf("semop: back from wakeup\n"); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + td->td_retval[0] = 0; +done2: + if (sops) + free(sops, M_SEM); + mtx_unlock(&Giant); + return (error); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +static void +semexit_myhook(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. + */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + return; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. 
+ */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + + wakeup((caddr_t)semaptr); +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; +} + +static int +sysctl_sema(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, sema, + sizeof(struct semid_ds) * seminfo.semmni)); +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..85356a0 --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,890 @@ +/* $FreeBSD$ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "opt_compat.h" +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/sysctl.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/jail.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); + +struct oshmctl_args; +static int oshmctl(struct thread *td, struct oshmctl_args *uap); + +static int shmget_allocate_segment(struct thread *td, + struct shmget_args *uap, int mode); +static int shmget_existing(struct thread *td, struct shmget_args *uap, + int mode, int segnum); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed, shmalloced; +static struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment(struct shmid_ds *); +static int shm_find_segment_by_key(key_t); +static struct shmid_ds *shm_find_segment_by_shmid(int); +static struct shmid_ds *shm_find_segment_by_shmidx(int); +static int shm_delete_mapping(struct proc *p, struct shmmap_state *); +static void shmrealloc(void); +static void shminit(void); +static int sysvshm_modload(struct module *, int, void *); +static int shmunload(void); +static void shmexit_myhook(struct proc *p); +static void shmfork_myhook(struct proc *p1, struct proc *p2); +static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); + +/* + * Tuneable values. + */ +#ifndef SHMMAXPGS +#define SHMMAXPGS 8192 /* Note: sysv shared memory is swap backed. 
*/ +#endif +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 192 +#endif +#ifndef SHMSEG +#define SHMSEG 128 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; + +static int shm_use_phys; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RD, &shminfo.shmmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RD, &shminfo.shmseg, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, + &shm_use_phys, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD, + NULL, 0, sysctl_shmsegs, "", ""); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shmalloced; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shmalloced) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static struct shmid_ds * +shm_find_segment_by_shmidx(int segnum) +{ + struct shmid_ds *shmseg; + + if (segnum < 0 || segnum >= shmalloced) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED ) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + GIANT_REQUIRED; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + GIANT_REQUIRED; + + segnum = IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, + shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time_second; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +/* + * MPSAFE + */ +int +shmdt(td, uap) + struct thread *td; + struct shmdt_args *uap; +{ + struct proc *p = td->td_proc; + struct shmmap_state *shmmap_s; + int i; + int error = 0; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + error = EINVAL; + goto done2; + } + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { + if (shmmap_s->shmid != -1 && + 
shmmap_s->va == (vm_offset_t)uap->shmaddr) { + break; + } + } + if (i == shminfo.shmseg) { + error = EINVAL; + goto done2; + } + error = shm_delete_mapping(p, shmmap_s); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +/* + * MPSAFE + */ +int +shmat(td, uap) + struct thread *td; + struct shmat_args *uap; +{ + struct proc *p = td->td_proc; + int i, flags; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + int error = 0; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + error = ipcperm(td, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); + if (error) + goto done2; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) { + error = EMFILE; + goto done2; + } + size = round_page(shmseg->shm_segsz); +#ifdef VM_PROT_READ_IS_EXEC + prot = VM_PROT_READ | VM_PROT_EXECUTE; +#else + prot = VM_PROT_READ; +#endif + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) { + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + } else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) { + attach_va = (vm_offset_t)uap->shmaddr; + } else { + error = EINVAL; + goto done2; + } + } else { + /* + * This is just a hint to vm_map_find() about where to + * put it. + */ + attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr + + maxtsiz + maxdsiz); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + error = ENOMEM; + goto done2; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time_second; + shmseg->shm_nattch++; + td->td_retval[0] = attach_va; +done2: + mtx_unlock(&Giant); + return (error); +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. 
of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +/* + * MPSAFE + */ +static int +oshmctl(td, uap) + struct thread *td; + struct oshmctl_args *uap; +{ +#ifdef COMPAT_43 + int error = 0; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(td, &shmseg->shm_perm, IPC_R); + if (error) + goto done2; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + goto done2; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. */ + error = ((sy_call_t *)shmctl)(td, uap); + break; + } +done2: + mtx_unlock(&Giant); + return (error); +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +/* + * MPSAFE + */ +int +shmctl(td, uap) + struct thread *td; + struct shmctl_args *uap; +{ + int error = 0; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + switch (uap->cmd) { + case IPC_INFO: + error = copyout( (caddr_t)&shminfo, uap->buf, sizeof( shminfo ) ); + if (error) + goto done2; + td->td_retval[0] = shmalloced; + goto done2; + case SHM_INFO: { + struct shm_info shm_info; + shm_info.used_ids = shm_nused; + shm_info.shm_rss = 0; /*XXX where to get from ? */ + shm_info.shm_tot = 0; /*XXX where to get from ? */ + shm_info.shm_swp = 0; /*XXX where to get from ? */ + shm_info.swap_attempts = 0; /*XXX where to get from ? */ + shm_info.swap_successes = 0; /*XXX where to get from ? 
*/ + error = copyout( (caddr_t)&shm_info, uap->buf, sizeof( shm_info ) ); + if (error) + goto done2; + td->td_retval[0] = shmalloced; + goto done2; + } + } + if( (uap->cmd) == SHM_STAT ) + shmseg = shm_find_segment_by_shmidx(uap->shmid); + else + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + switch (uap->cmd) { + case SHM_STAT: + case IPC_STAT: + error = ipcperm(td, &shmseg->shm_perm, IPC_R); + if (error) + goto done2; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + goto done2; + else if( (uap->cmd) == SHM_STAT ) + td->td_retval[0] = IXSEQ_TO_IPCID( uap->shmid, shmseg->shm_perm ); + break; + case IPC_SET: + error = ipcperm(td, &shmseg->shm_perm, IPC_M); + if (error) + goto done2; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + goto done2; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time_second; + break; + case IPC_RMID: + error = ipcperm(td, &shmseg->shm_perm, IPC_M); + if (error) + goto done2; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + error = EINVAL; + break; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(td, uap, mode, segnum) + struct thread *td; + struct shmget_args *uap; + int mode; + int segnum; +{ + struct shmid_ds *shmseg; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + error = ipcperm(td, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(td, uap, mode) + struct thread *td; + struct shmget_args *uap; + int mode; +{ + int i, segnum, shmid, size; + struct ucred *cred = td->td_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + GIANT_REQUIRED; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + shmrealloc(); /* Maybe expand the shmsegs[] array. */ + for (i = 0; i < shmalloced; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shmalloced) + return ENOSPC; + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. 
+ */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. + */ + if (shm_use_phys) { + shm_handle->shm_object = + vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); + } else { + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); + } + vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING); + vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT); + + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = td->td_proc->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time_second; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + td->td_retval[0] = shmid; + return 0; +} + +/* + * MPSAFE + */ +int +shmget(td, uap) + struct thread *td; + struct shmget_args *uap; +{ + int segnum, mode; + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(td, uap, mode, segnum); + if (error == EAGAIN) + goto again; + goto done2; + } + if ((uap->shmflg & IPC_CREAT) == 0) { + error = ENOENT; + goto done2; + } + } + error = shmget_allocate_segment(td, uap, mode); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +shmsys(td, uap) + struct thread *td; + /* XXX actually varargs. 
*/ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*shmcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +static void +shmfork_myhook(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +static void +shmexit_myhook(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + GIANT_REQUIRED; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +static void +shmrealloc(void) +{ + int i; + struct shmid_ds *newsegs; + + if (shmalloced >= shminfo.shmmni) + return; + + newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); + if (newsegs == NULL) + return; + for (i = 0; i < shmalloced; i++) + bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); + for (; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + free(shmsegs, M_SHM); + shmsegs = newsegs; + shmalloced = shminfo.shmmni; +} + +static void +shminit() +{ + int i; + + TUNABLE_INT_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall); + shminfo.shmmax = shminfo.shmall * PAGE_SIZE; + TUNABLE_INT_FETCH("kern.ipc.shmmin", &shminfo.shmmin); + TUNABLE_INT_FETCH("kern.ipc.shmmni", &shminfo.shmmni); + TUNABLE_INT_FETCH("kern.ipc.shmseg", &shminfo.shmseg); + TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys); + + shmalloced = shminfo.shmmni; + shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); + if (shmsegs == NULL) + panic("cannot allocate initial memory for sysvshm"); + for (i = 0; i < shmalloced; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; + shmexit_hook = &shmexit_myhook; + shmfork_hook = &shmfork_myhook; +} + +static int +shmunload() +{ + + if (shm_nused > 0) + return (EBUSY); + + free(shmsegs, M_SHM); + shmexit_hook = NULL; + shmfork_hook = NULL; + return (0); +} + +static int +sysctl_shmsegs(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0]))); +} + +static int +sysvshm_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + shminit(); + break; + case MOD_UNLOAD: + error = shmunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvshm_mod = { + "sysvshm", + &sysvshm_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(shmsys); +SYSCALL_MODULE_HELPER(shmat); +SYSCALL_MODULE_HELPER(shmctl); +SYSCALL_MODULE_HELPER(shmdt); +SYSCALL_MODULE_HELPER(shmget); + +DECLARE_MODULE(sysvshm, sysvshm_mod, + SI_SUB_SYSV_SHM, SI_ORDER_FIRST); +MODULE_VERSION(sysvshm, 1); diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 0000000..b9c5743 --- 
/dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,2660 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Copyright (c) 2002 Networks Associates Technologies, Inc. + * All rights reserved. + * + * Portions of this software were developed for the FreeBSD Project by + * ThinkSec AS and NAI Labs, the Security Research Division of Network + * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 + * ("CBOSS"), as part of the DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. + * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. 
+ * o Now that historical speed conversions are handled here, don't + * do them in drivers. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/sx.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#define TTYDEFCHARS +#include <sys/tty.h> +#undef TTYDEFCHARS +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/dkstat.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); + +static int proc_compare(struct proc *p1, struct proc *p2); +static int ttnread(struct tty *tp); +static void ttyecho(int c, struct tty *tp); +static int ttyoutput(int c, struct tty *tp); +static void ttypend(struct tty *tp); +static void ttyretype(struct tty *tp); +static void ttyrub(int c, struct tty *tp); +static void ttyrubo(struct tty *tp, int cnt); +static void ttyunblock(struct tty *tp); +static int ttywflush(struct tty *tp); +static int filt_ttyread(struct knote *kn, long hint); +static void filt_ttyrdetach(struct knote *kn); +static int filt_ttywrite(struct knote *kn, long hint); +static void filt_ttywdetach(struct knote *kn); + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. */ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +static u_char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? 
*/ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. + */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ + +/* + * list of struct tty where pstat(8) can pick it up with sysctl + */ +static SLIST_HEAD(, tty) tty_list; + +static int drainwait = 5*60; +SYSCTL_INT(_kern, OID_AUTO, drainwait, CTLFLAG_RW, &drainwait, + 0, "Output drain timeout in seconds"); + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(dev_t device, struct tty *tp) +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + /* XXX don't hang forever on output */ + if (tp->t_timeout < 0) + tp->t_timeout = drainwait*hz; + ttsetwater(tp); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. + */ +int +ttyclose(struct tty *tp) +{ + int s; + + funsetown(&tp->t_sigio); + s = spltty(); + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + + tp->t_gen++; + tp->t_line = TTYDISC; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + splx(s); + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) + +/* + * Process input of a single character received on a tty. + */ +int +ttyinput(int c, struct tty *tp) +{ + tcflag_t iflag, lflag; + cc_t *cc; + int i, err; + + /* + * If input is pending take it first. 
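+ *
+ * (PENDIN is set by the TIOCSETA* handling in ttioctl() when ICANON is
+ * switched on, so characters collected under the old flags are pushed
+ * back through this routine by ttypend() under the new ones.)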
+ */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + + /* Handle exceptional conditions (break, parity, framing). */ + cc = tp->t_cc; + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_BI)) { + if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGINT, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + return (0); + else if (ISSET(iflag, PARMRK)) { +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. + */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + } + /* + * Handle start/stop characters. 
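+ *
+ * This is the IXON side of software flow control: with IXON set, the
+ * VSTOP character (^S by default) suspends output and VSTART (^Q)
+ * resumes it.  A minimal userland sketch, where fd is an assumed open
+ * tty descriptor:
+ *
+ *	struct termios t;
+ *	tcgetattr(fd, &t);
+ *	t.c_iflag |= IXON;
+ *	tcsetattr(fd, TCSANOW, &t);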
+ */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + return (0); + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase or erase2 (^H / ^?) + */ + if (CCEQ(cc[VERASE], c) || CCEQ(cc[VERASE2], c) ) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { + if (ISSET(lflag, ISIG) && tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGINFO, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_ohiwat) + (void)ttyoutput(CTRL('g'), tp); + } + goto endcase; + } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. + */ + if (putc(c, &tp->t_rawq) >= 0) { + if (!ISSET(lflag, ICANON)) { + ttwakeup(tp); + ttyecho(c, tp); + goto endcase; + } + if (TTBREAKC(c, lflag)) { + tp->t_rocount = 0; + catq(&tp->t_rawq, &tp->t_canq); + ttwakeup(tp); + } else if (tp->t_rocount++ == 0) + tp->t_rocol = tp->t_column; + if (ISSET(tp->t_state, TS_ERASE)) { + /* + * end of prterase \.../ + */ + CLR(tp->t_state, TS_ERASE); + (void)ttyoutput('/', tp); + } + i = tp->t_column; + ttyecho(c, tp); + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { + /* + * Place the cursor over the '^' of the ^D. 
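+ *
+ * (With ECHO and ECHOCTL set, the EOF character echoes as "^D": ttyecho()
+ * below emits '^' and then c + 'A' - 1 for control characters, so backing
+ * up at most two columns here parks the cursor on the '^'.)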
+ */ + i = imin(2, tp->t_column - i); + while (i > 0) { + (void)ttyoutput('\b', tp); + i--; + } + } + } +endcase: + /* + * IXANY means allow any character to restart output. + */ + if (ISSET(tp->t_state, TS_TTSTOP) && + !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) + return (0); +restartoutput: + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); +startoutput: + return (ttstart(tp)); +} + +/* + * Output a single character on a tty, doing output processing + * as needed (expanding tabs, newline processing, etc.). + * Returns < 0 if succeeds, otherwise returns char to resend. + * Must be recursive. + */ +static int +ttyoutput(int c, struct tty *tp) +{ + tcflag_t oflag; + int col, s; + + oflag = tp->t_oflag; + if (!ISSET(oflag, OPOST)) { + if (ISSET(tp->t_lflag, FLUSHO)) + return (-1); + if (putc(c, &tp->t_outq)) + return (c); + tk_nout++; + tp->t_outcc++; + return (-1); + } + /* + * Do tab expansion if OXTABS is set. Special case if we external + * processing, we don't do the tab expansion because we'll probably + * get it wrong. If tab expansion needs to be done, let it happen + * externally. + */ + CLR(c, ~TTY_CHARMASK); + if (c == '\t' && + ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { + c = 8 - (tp->t_column & 7); + if (!ISSET(tp->t_lflag, FLUSHO)) { + s = spltty(); /* Don't interrupt tabs. */ + c -= b_to_q(" ", c, &tp->t_outq); + tk_nout += c; + tp->t_outcc += c; + splx(s); + } + tp->t_column += c; + return (c ? -1 : '\t'); + } + if (c == CEOT && ISSET(oflag, ONOEOT)) + return (-1); + + /* + * Newline translation: if ONLCR is set, + * translate newline into "\r\n". + */ + if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq)) + return (c); + } + /* If OCRNL is set, translate "\r" into "\n". */ + else if (c == '\r' && ISSET(tp->t_oflag, OCRNL)) + c = '\n'; + /* If ONOCR is set, don't transmit CRs when on column 0. */ + else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0) + return (-1); + + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) + return (c); + + col = tp->t_column; + switch (CCLASS(c)) { + case BACKSPACE: + if (col > 0) + --col; + break; + case CONTROL: + break; + case NEWLINE: + if (ISSET(tp->t_oflag, ONLCR | ONLRET)) + col = 0; + break; + case RETURN: + col = 0; + break; + case ORDINARY: + ++col; + break; + case TAB: + col = (col + 8) & ~7; + break; + } + tp->t_column = col; + return (-1); +} + +/* + * Ioctls for all tty devices. Called after line-discipline specific ioctl + * has been called to do discipline-specific functions and/or reject any + * of these ioctl commands. + */ +/* ARGSUSED */ +int +ttioctl(struct tty *tp, u_long cmd, void *data, int flag) +{ + struct proc *p; + struct thread *td; + struct pgrp *pgrp; + int s, error; + + td = curthread; /* XXX */ + p = td->td_proc; + + /* If the ioctl involves modification, hang if in the background. 
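+ *
+ * The cases listed just below are the "modifying" ioctls: a caller that
+ * is not in the foreground process group, and is neither ignoring nor
+ * blocking SIGTTOU, is sent SIGTTOU and sleeps until it is moved to the
+ * foreground.  This is why, for example, a backgrounded stty(1) (whose
+ * tcsetattr() ends up in the TIOCSET* cases) stops instead of changing
+ * the terminal.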
*/ + switch (cmd) { + case TIOCCBRK: + case TIOCCONS: + case TIOCDRAIN: + case TIOCEXCL: + case TIOCFLUSH: +#ifdef TIOCHPCL + case TIOCHPCL: +#endif + case TIOCNXCL: + case TIOCSBRK: + case TIOCSCTTY: + case TIOCSDRAINWAIT: + case TIOCSETA: + case TIOCSETAF: + case TIOCSETAW: + case TIOCSETD: + case TIOCSPGRP: + case TIOCSTART: + case TIOCSTAT: + case TIOCSTI: + case TIOCSTOP: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + sx_slock(&proctree_lock); + PROC_LOCK(p); + while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) && + !SIGISMEMBER(p->p_sigignore, SIGTTOU) && + !SIGISMEMBER(p->p_sigmask, SIGTTOU)) { + pgrp = p->p_pgrp; + PROC_UNLOCK(p); + if (pgrp->pg_jobc == 0) { + sx_sunlock(&proctree_lock); + return (EIO); + } + PGRP_LOCK(pgrp); + sx_sunlock(&proctree_lock); + pgsignal(pgrp, SIGTTOU, 1); + PGRP_UNLOCK(pgrp); + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) + return (error); + sx_slock(&proctree_lock); + PROC_LOCK(p); + } + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. */ + case FIONREAD: /* get # bytes to read */ + s = spltty(); + *(int *)data = ttnread(tp); + splx(s); + break; + + case FIOSETOWN: + /* + * Policy -- Don't allow FIOSETOWN on someone else's + * controlling tty + */ + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + + error = fsetown(*(int *)data, &tp->t_sigio); + if (error) + return (error); + break; + case FIOGETOWN: + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + *(int *)data = fgetown(tp->t_sigio); + break; + + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + struct nameidata nid; + + if (constty && constty != tp && + ISSET(constty->t_state, TS_CONNECTED)) + return (EBUSY); + + /* Ensure user can open the real console. */ + NDINIT(&nid, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, + "/dev/console", td); + if ((error = namei(&nid)) != 0) + return (error); + NDFREE(&nid, NDF_ONLY_PNBUF); + error = VOP_ACCESS(nid.ni_vp, VREAD, td->td_ucred, td); + vput(nid.ni_vp); + if (error) + return (error); + + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + error = ttywait(tp); + if (error) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + struct termios *t = (struct termios *)data; + + if (t->c_ispeed == 0) + t->c_ispeed = t->c_ospeed; + if (t->c_ispeed == 0) + t->c_ispeed = tp->t_ospeed; + if (t->c_ispeed == 0) + return (EINVAL); + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + error = ttywait(tp); + if (error) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. + */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + if (t->c_ospeed != 0) + tp->t_ospeed = t->c_ospeed; + ttsetwater(tp); + } + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { + catq(&tp->t_rawq, &tp->t_canq); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. + */ + catq(&tp->t_canq, &tp->t_rawq); + } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. 
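+ *
+ * (For reference, the TIOCSETA/TIOCSETAW/TIOCSETAF group handled here is
+ * the ioctl set behind tcsetattr(3)'s optional actions:
+ *
+ *	TCSANOW    -> TIOCSETA	set immediately
+ *	TCSADRAIN  -> TIOCSETAW	drain output, then set
+ *	TCSAFLUSH  -> TIOCSETAF	drain output, flush input, then set
+ *
+ * so the drain and flush behaviour above is what those calls rely on.)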
+ */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if ((flag & FREAD) == 0 && suser(td)) + return (EPERM); + if (!isctty(p, tp) && suser(td)) + return (EACCES); + s = spltty(); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. */ + sx_slock(&proctree_lock); + if (!SESS_LEADER(p) || + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + SESS_LOCK(p->p_session); + p->p_session->s_ttyp = tp; + SESS_UNLOCK(p->p_session); + PROC_LOCK(p); + p->p_flag |= P_CONTROLT; + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + break; + case TIOCSPGRP: { /* set pgrp of tty */ + sx_slock(&proctree_lock); + pgrp = pgfind(*(int *)data); + if (!isctty(p, tp)) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + sx_sunlock(&proctree_lock); + return (ENOTTY); + } + if (pgrp == NULL) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + PGRP_UNLOCK(pgrp); + if (pgrp->pg_session != p->p_session) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + sx_sunlock(&proctree_lock); + tp->t_pgrp = pgrp; + break; + } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGWINCH, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + } + break; + case TIOCSDRAINWAIT: + error = suser(td); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (ENOIOCTL); +#endif + } + return (0); +} + +int +ttypoll(dev_t dev, int events, struct thread *td) +{ + int s; + int revents = 0; + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) /* XXX used to return ENXIO, but that means true! 
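+ *
+ * A consumer's view of this routine, as an illustrative sketch (ttyfd,
+ * buf and n are assumed to exist):
+ *
+ *	struct pollfd pfd;
+ *	pfd.fd = ttyfd;
+ *	pfd.events = POLLIN;
+ *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
+ *		n = read(ttyfd, buf, sizeof(buf));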
*/ + return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) + | POLLHUP); + + s = spltty(); + if (events & (POLLIN | POLLRDNORM)) { + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &tp->t_rsel); + } + if (events & (POLLOUT | POLLWRNORM)) { + if ((tp->t_outq.c_cc <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLOUT | POLLWRNORM); + else + selrecord(td, &tp->t_wsel); + } + splx(s); + return (revents); +} + +static struct filterops ttyread_filtops = + { 1, NULL, filt_ttyrdetach, filt_ttyread }; +static struct filterops ttywrite_filtops = + { 1, NULL, filt_ttywdetach, filt_ttywrite }; + +int +ttykqfilter(dev_t dev, struct knote *kn) +{ + struct tty *tp = dev->si_tty; + struct klist *klist; + int s; + + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &tp->t_rsel.si_note; + kn->kn_fop = &ttyread_filtops; + break; + case EVFILT_WRITE: + klist = &tp->t_wsel.si_note; + kn->kn_fop = &ttywrite_filtops; + break; + default: + return (1); + } + + kn->kn_hook = (caddr_t)dev; + + s = spltty(); + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + splx(s); + + return (0); +} + +static void +filt_ttyrdetach(struct knote *kn) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + int s = spltty(); + + SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); + splx(s); +} + +static int +filt_ttyread(struct knote *kn, long hint) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + + kn->kn_data = ttnread(tp); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + return (1); + } + return (kn->kn_data > 0); +} + +static void +filt_ttywdetach(struct knote *kn) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + int s = spltty(); + + SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); + splx(s); +} + +static int +filt_ttywrite(struct knote *kn, long hint) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + + kn->kn_data = tp->t_outq.c_cc; + if (ISSET(tp->t_state, TS_ZOMBIE)) + return (1); + return (kn->kn_data <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)); +} + +/* + * Must be called at spltty(). + */ +static int +ttnread(struct tty *tp) +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) { + nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } + return (nread); +} + +/* + * Wait for output to drain. + */ +int +ttywait(struct tty *tp) +{ + int error, s; + + error = 0; + s = spltty(); + while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { + (*tp->t_oproc)(tp); + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else + break; + } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; + splx(s); + return (error); +} + +/* + * Flush if successfully wait. + */ +static int +ttywflush(struct tty *tp) +{ + int error; + + if ((error = ttywait(tp)) == 0) + ttyflush(tp, FREAD); + return (error); +} + +/* + * Flush tty read and/or write queues, notifying anyone waiting. 
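+ *
+ * Callers include the TIOCFLUSH case in ttioctl() above (which is how
+ * tcflush(3) is typically implemented), ttylclose(), and the BRKINT and
+ * ISIG paths in ttyinput(); rw is FREAD, FWRITE or both.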
+ */ +void +ttyflush(struct tty *tp, int rw) +{ + int s; + + s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } + (*tp->t_stop)(tp, rw); + if (rw & FREAD) { + FLUSHQ(&tp->t_canq); + FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); + tp->t_rocount = 0; + tp->t_rocol = 0; + CLR(tp->t_state, TS_LOCAL); + ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } + } + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + ttwwakeup(tp); + } + splx(s); +} + +/* + * Copy in the default termios characters. + */ +void +termioschars(struct termios *t) +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. + */ +void +ttychars(struct tty *tp) +{ + + termioschars(&tp->t_termios); +} + +/* + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. + */ +void +ttyblock(struct tty *tp) +{ + + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(struct tty *tp) +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. + */ +void +ttrstrt(void *tp_arg) +{ + struct tty *tp; + int s; + + KASSERT(tp_arg != NULL, ("ttrstrt")); + + tp = tp_arg; + s = spltty(); + + CLR(tp->t_state, TS_TIMEOUT); + ttstart(tp); + + splx(s); +} +#endif + +int +ttstart(struct tty *tp) +{ + + if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ + (*tp->t_oproc)(tp); + return (0); +} + +/* + * "close" a line discipline + */ +int +ttylclose(struct tty *tp, int flag) +{ + + if (flag & FNONBLOCK || ttywflush(tp)) + ttyflush(tp, FREAD | FWRITE); + return (0); +} + +/* + * Handle modem control transition on a tty. + * Flag indicates new state of carrier. + * Returns 0 if the line should be turned off, otherwise 1. 
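+ *
+ * A serial driver typically reaches this through the line discipline's
+ * l_modem entry from its modem-status interrupt; a sketch, where
+ * dcd_asserted is an assumed flag computed by the driver:
+ *
+ *	if ((*linesw[tp->t_line].l_modem)(tp, dcd_asserted) == 0)
+ *		... drop DTR and hang up the line ...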
+ */ +int +ttymodem(struct tty *tp, int flag) +{ + + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { + /* + * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. + */ + if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + } + } else if (flag == 0) { + /* + * Lost carrier. + */ + CLR(tp->t_state, TS_CARR_ON); + if (ISSET(tp->t_state, TS_ISOPEN) && + !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); + if (tp->t_session) { + sx_slock(&proctree_lock); + if (tp->t_session->s_leader) { + struct proc *p; + + p = tp->t_session->s_leader; + PROC_LOCK(p); + psignal(p, SIGHUP); + PROC_UNLOCK(p); + } + sx_sunlock(&proctree_lock); + } + ttyflush(tp, FREAD | FWRITE); + return (0); + } + } else { + /* + * Carrier now on. + */ + SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +static void +ttypend(struct tty *tp) +{ + struct clist tq; + int c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ + tq = tp->t_rawq; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(struct tty *tp, struct uio *uio, int flag) +{ + struct clist *qp; + int c; + tcflag_t lflag; + cc_t *cc = tp->t_cc; + struct proc *p = curproc; + int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ + struct timeval stime; + struct pgrp *pg; + +loop: + s = spltty(); + lflag = tp->t_lflag; + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) { + ttypend(tp); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + splx(s); + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (SIGISMEMBER(p->p_sigignore, SIGTTIN) || + SIGISMEMBER(p->p_sigmask, SIGTTIN) || + (p->p_flag & P_PPWAIT) || p->p_pgrp->pg_jobc == 0) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + return (EIO); + } + pg = p->p_pgrp; + PROC_UNLOCK(p); + PGRP_LOCK(pg); + sx_sunlock(&proctree_lock); + pgsignal(pg, SIGTTIN, 1); + PGRP_UNLOCK(pg); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) + return (error); + goto loop; + } + + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. + * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? 
&tp->t_canq : &tp->t_rawq; + + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { + splx(s); + return (0); + } + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval timecopy; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. */ + splx(s); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } + } +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see tvtohz() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use tvtohz() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); + splx(s); + if (error == EWOULDBLOCK) + error = 0; + else if (error) + return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). + */ + slp = 0; + goto loop; + } +read: + splx(s); + /* + * Input present, check for input mapping and processing. + */ + first = 1; + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. 
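+ *
+ * (The four VMIN/VTIME combinations handled above implement the standard
+ * non-canonical read semantics.  An illustrative userland setup, where fd
+ * is an assumed open tty descriptor:
+ *
+ *	struct termios t;
+ *	tcgetattr(fd, &t);
+ *	t.c_lflag &= ~ICANON;
+ *	t.c_cc[VMIN] = 10;
+ *	t.c_cc[VTIME] = 5;
+ *	tcsetattr(fd, TCSANOW, &t);
+ *
+ * makes read(2) wait for the first byte and then return either when 10
+ * bytes have accumulated or when 0.5 seconds pass without another byte,
+ * VTIME being counted in tenths of a second.)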
+ */ + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if (first) { + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + /* XXX should ungetc(c, qp). */ + break; + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) + break; + first = 0; + } + +out: + /* + * Look to unblock input now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) + ttyunblock(tp); + splx(s); + + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. + */ +int +ttycheckoutq(struct tty *tp, int wait) +{ + int hiwat, s; + sigset_t oldmask; + + hiwat = tp->t_ohiwat; + SIGEMPTYSET(oldmask); + s = spltty(); + if (wait) + oldmask = curproc->p_siglist; + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) + while (tp->t_outq.c_cc > hiwat) { + ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; + if (!(wait && SIGSETEQ(curproc->p_siglist, oldmask))) { + splx(s); + return (0); + } + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); + } + splx(s); + return (1); +} + +/* + * Process a write call on a tty device. + */ +int +ttwrite(struct tty *tp, struct uio *uio, int flag) +{ + char *cp = NULL; + int cc, ce; + struct proc *p; + int i, hiwat, cnt, error, s; + char obuf[OBUFSIZ]; + + hiwat = tp->t_ohiwat; + cnt = uio->uio_resid; + error = 0; + cc = 0; +loop: + s = spltty(); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { + splx(s); + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; + } + splx(s); + /* + * Hang the process if it's in the background. 
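+ *
+ * Unlike the read-side check in ttread(), which signals SIGTTIN
+ * regardless, background writes are stopped only when TOSTOP is set in
+ * c_lflag, e.g. after "stty tostop" in the controlling shell.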
+ */ + p = curproc; + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (isbackground(p, tp) && + ISSET(tp->t_lflag, TOSTOP) && !(p->p_flag & P_PPWAIT) && + !SIGISMEMBER(p->p_sigignore, SIGTTOU) && + !SIGISMEMBER(p->p_sigmask, SIGTTOU)) { + if (p->p_pgrp->pg_jobc == 0) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + error = EIO; + goto out; + } + PROC_UNLOCK(p); + PGRP_LOCK(p->p_pgrp); + sx_sunlock(&proctree_lock); + pgsignal(p->p_pgrp, SIGTTOU, 1); + PGRP_UNLOCK(p->p_pgrp); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) + goto out; + goto loop; + } else { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + } + /* + * Process the user's data in at most OBUFSIZ chunks. Perform any + * output translation. Keep track of high water mark, sleep on + * overflow awaiting device aid in acquiring new space. + */ + while (uio->uio_resid > 0 || cc > 0) { + if (ISSET(tp->t_lflag, FLUSHO)) { + uio->uio_resid = 0; + return (0); + } + if (tp->t_outq.c_cc > hiwat) + goto ovhiwat; + /* + * Grab a hunk of data from the user, unless we have some + * leftover from last time. + */ + if (cc == 0) { + cc = imin(uio->uio_resid, OBUFSIZ); + cp = obuf; + error = uiomove(cp, cc, uio); + if (error) { + cc = 0; + break; + } + } + /* + * If nothing fancy need be done, grab those characters we + * can handle without any of ttyoutput's processing and + * just transfer them to the output q. For those chars + * which require special processing (as indicated by the + * bits in char_type), call ttyoutput. After processing + * a hunk of data, look for FLUSHO so ^O's will take effect + * immediately. + */ + while (cc > 0) { + if (!ISSET(tp->t_oflag, OPOST)) + ce = cc; + else { + ce = cc - scanc((u_int)cc, (u_char *)cp, + char_type, CCLASSMASK); + /* + * If ce is zero, then we're processing + * a special character through ttyoutput. + */ + if (ce == 0) { + tp->t_rocount = 0; + if (ttyoutput(*cp, tp) >= 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; + goto loop; + } + cp++; + cc--; + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + goto ovhiwat; + continue; + } + } + /* + * A bunch of normal characters have been found. + * Transfer them en masse to the output queue and + * continue processing at the top of the loop. + * If there are any further characters in this + * <= OBUFSIZ chunk, the first should be a character + * requiring special handling by ttyoutput. + */ + tp->t_rocount = 0; + i = b_to_q(cp, ce, &tp->t_outq); + ce -= i; + tp->t_column += ce; + cp += ce, cc -= ce, tk_nout += ce; + tp->t_outcc += ce; + if (i > 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; + goto loop; + } + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + break; + } + ttstart(tp); + } +out: + /* + * If cc is nonzero, we leave the uio structure inconsistent, as the + * offset and iov pointers have moved forward, but it doesn't matter + * (the call will either return short or restart with a new uio). + */ + uio->uio_resid += cc; + return (error); + +ovhiwat: + ttstart(tp); + s = spltty(); + /* + * This can only occur if FLUSHO is set in t_lflag, + * or if ttstart/oproc is synchronous (or very fast). 
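+ *
+ * The matching wakeup for the sleep below is in ttwwakeup(): once the
+ * driver has drained t_outq to t_olowat it clears TS_SO_OLOWAT and wakes
+ * TSA_OLOWAT(tp), and the loop above is retried.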
+ */ + if (tp->t_outq.c_cc <= hiwat) { + splx(s); + goto loop; + } + if (flag & IO_NDELAY) { + splx(s); + uio->uio_resid += cc; + return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); + } + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); + splx(s); + if (error == EWOULDBLOCK) + error = EIO; + if (error) + goto out; + goto loop; +} + +/* + * Rubout one character from the rawq of tp + * as cleanly as possible. + */ +static void +ttyrub(int c, struct tty *tp) +{ + char *cp; + int savecol; + int tabc, s; + + if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) + return; + CLR(tp->t_lflag, FLUSHO); + if (ISSET(tp->t_lflag, ECHOE)) { + if (tp->t_rocount == 0) { + /* + * Screwed by ttwrite; retype + */ + ttyretype(tp); + return; + } + if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) + ttyrubo(tp, 2); + else { + CLR(c, ~TTY_CHARMASK); + switch (CCLASS(c)) { + case ORDINARY: + ttyrubo(tp, 1); + break; + case BACKSPACE: + case CONTROL: + case NEWLINE: + case RETURN: + case VTAB: + if (ISSET(tp->t_lflag, ECHOCTL)) + ttyrubo(tp, 2); + break; + case TAB: + if (tp->t_rocount < tp->t_rawq.c_cc) { + ttyretype(tp); + return; + } + s = spltty(); + savecol = tp->t_column; + SET(tp->t_state, TS_CNTTB); + SET(tp->t_lflag, FLUSHO); + tp->t_column = tp->t_rocol; + cp = tp->t_rawq.c_cf; + if (cp) + tabc = *cp; /* XXX FIX NEXTC */ + for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) + ttyecho(tabc, tp); + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_CNTTB); + splx(s); + + /* savecol will now be length of the tab. */ + savecol -= tp->t_column; + tp->t_column += savecol; + if (savecol > 8) + savecol = 8; /* overflow screw */ + while (--savecol >= 0) + (void)ttyoutput('\b', tp); + break; + default: /* XXX */ +#define PANICSTR "ttyrub: would panic c = %d, val = %d\n" + (void)printf(PANICSTR, c, CCLASS(c)); +#ifdef notdef + panic(PANICSTR, c, CCLASS(c)); +#endif + } + } + } else if (ISSET(tp->t_lflag, ECHOPRT)) { + if (!ISSET(tp->t_state, TS_ERASE)) { + SET(tp->t_state, TS_ERASE); + (void)ttyoutput('\\', tp); + } + ttyecho(c, tp); + } else { + ttyecho(tp->t_cc[VERASE], tp); + /* + * This code may be executed not only when an ERASE key + * is pressed, but also when ^U (KILL) or ^W (WERASE) are. + * So, I didn't think it was worthwhile to pass the extra + * information (which would need an extra parameter, + * changing every call) needed to distinguish the ERASE2 + * case from the ERASE. + */ + } + --tp->t_rocount; +} + +/* + * Back over cnt characters, erasing them. + */ +static void +ttyrubo(struct tty *tp, int cnt) +{ + + while (cnt-- > 0) { + (void)ttyoutput('\b', tp); + (void)ttyoutput(' ', tp); + (void)ttyoutput('\b', tp); + } +} + +/* + * ttyretype -- + * Reprint the rawq line. Note, it is assumed that c_cc has already + * been checked. + */ +static void +ttyretype(struct tty *tp) +{ + char *cp; + int s, c; + + /* Echo the reprint character. */ + if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) + ttyecho(tp->t_cc[VREPRINT], tp); + + (void)ttyoutput('\n', tp); + + /* + * XXX + * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE + * BIT OF FIRST CHAR. + */ + s = spltty(); + for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) + ttyecho(c, tp); + for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? 
*cp : 0); + cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) + ttyecho(c, tp); + CLR(tp->t_state, TS_ERASE); + splx(s); + + tp->t_rocount = tp->t_rawq.c_cc; + tp->t_rocol = 0; +} + +/* + * Echo a typed character to the terminal. + */ +static void +ttyecho(int c, struct tty *tp) +{ + + if (!ISSET(tp->t_state, TS_CNTTB)) + CLR(tp->t_lflag, FLUSHO); + if ((!ISSET(tp->t_lflag, ECHO) && + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || + ISSET(tp->t_lflag, EXTPROC)) + return; + if (ISSET(tp->t_lflag, ECHOCTL) && + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || + ISSET(c, TTY_CHARMASK) == 0177)) { + (void)ttyoutput('^', tp); + CLR(c, ~TTY_CHARMASK); + if (c == 0177) + c = '?'; + else + c += 'A' - 1; + } + (void)ttyoutput(c, tp); +} + +/* + * Wake up any readers on a tty. + */ +void +ttwakeup(struct tty *tp) +{ + + if (SEL_WAITING(&tp->t_rsel)) + selwakeup(&tp->t_rsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); + wakeup(TSA_HUP_OR_INPUT(tp)); + KNOTE(&tp->t_rsel.si_note, 0); +} + +/* + * Wake up any writers on a tty. + */ +void +ttwwakeup(struct tty *tp) +{ + + if (SEL_WAITING(&tp->t_wsel) && tp->t_outq.c_cc <= tp->t_olowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_olowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } + KNOTE(&tp->t_wsel.si_note, 0); +} + +/* + * Look up a code for a specified speed in a conversion table; + * used by drivers to map software speed values to hardware parameters. + */ +int +ttspeedtab(int speed, struct speedtab *table) +{ + + for ( ; table->sp_speed != -1; table++) + if (table->sp_speed == speed) + return (table->sp_code); + return (-1); +} + +/* + * Set input and output watermarks and buffer sizes. For input, the + * high watermark is about one second's worth of input above empty, the + * low watermark is slightly below high water, and the buffer size is a + * driver-dependent amount above high water. For output, the watermarks + * are near the ends of the buffer, with about 1 second's worth of input + * between them. All this only applies to the standard line discipline. + */ +void +ttsetwater(struct tty *tp) +{ + int cps, ttmaxhiwat, x; + + /* Input. */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + switch (tp->t_ispeedwat) { + case (speed_t)-1: + cps = tp->t_ispeed / 10; + break; + case 0: + /* + * This case is for old drivers that don't know about + * t_ispeedwat. Arrange for them to get the old buffer + * sizes and watermarks. + */ + cps = TTYHOG - 2 * 256; + tp->t_ififosize = 2 * 256; + break; + default: + cps = tp->t_ispeedwat / 10; + break; + } + tp->t_ihiwat = cps; + tp->t_ilowat = 7 * cps / 8; + x = cps + tp->t_ififosize; + clist_alloc_cblocks(&tp->t_rawq, x, x); + + /* Output. */ + switch (tp->t_ospeedwat) { + case (speed_t)-1: + cps = tp->t_ospeed / 10; + ttmaxhiwat = 2 * TTMAXHIWAT; + break; + case 0: + cps = tp->t_ospeed / 10; + ttmaxhiwat = TTMAXHIWAT; + break; + default: + cps = tp->t_ospeedwat / 10; + ttmaxhiwat = 8 * TTMAXHIWAT; + break; + } +#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? 
l : (x)) + tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); + x += cps; + x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ + tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ + x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ + x += OBUFSIZ + 100; + clist_alloc_cblocks(&tp->t_outq, x, x); +#undef CLAMP +} + +/* + * Report on state of foreground process group. + */ +void +ttyinfo(struct tty *tp) +{ + struct proc *p, *pick; + struct timeval utime, stime; + const char *stmp; + long ltmp; + int tmp; + struct thread *td; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); + + if (tp->t_session == NULL) + ttyprintf(tp, "not a controlling terminal\n"); + else if (tp->t_pgrp == NULL) + ttyprintf(tp, "no foreground process group\n"); + else { + PGRP_LOCK(tp->t_pgrp); + if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == 0) { + PGRP_UNLOCK(tp->t_pgrp); + ttyprintf(tp, "empty foreground process group\n"); + } else { + mtx_lock_spin(&sched_lock); + + /* Pick interesting process. */ + for (pick = NULL; p != 0; p = LIST_NEXT(p, p_pglist)) + if (proc_compare(pick, p)) + pick = p; + PGRP_UNLOCK(tp->t_pgrp); + + td = FIRST_THREAD_IN_PROC(pick); + stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */ + pick->p_stat == SMTX ? td->td_mtxname : + td->td_wmesg ? td->td_wmesg : "iowait"; + calcru(pick, &utime, &stime, NULL); + ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT || + pick->p_stat == SZOMB ? 0 : + pgtok(vmspace_resident_count(pick->p_vmspace)); + mtx_unlock_spin(&sched_lock); + + ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, + pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp); + + /* Print user time. */ + ttyprintf(tp, "%ld.%02ldu ", + utime.tv_sec, utime.tv_usec / 10000); + + /* Print system time. */ + ttyprintf(tp, "%ld.%02lds ", + (long)stime.tv_sec, stime.tv_usec / 10000); + + /* Print percentage cpu, resident set size. */ + ttyprintf(tp, "%d%% %ldk\n", tmp / 100, ltmp); + + } + } + tp->t_rocount = 0; /* so pending input will be retyped if BS */ +} + +/* + * Returns 1 if p2 is "better" than p1 + * + * The algorithm for picking the "interesting" process is thus: + * + * 1) Only foreground processes are eligible - implied. + * 2) Runnable processes are favored over anything else. The runner + * with the highest cpu utilization is picked (p_estcpu). Ties are + * broken by picking the highest pid. + * 3) The sleeper with the shortest sleep time is next. With ties, + * we pick out just "short-term" sleepers (P_SINTR == 0). + * 4) Further ties are broken by picking the highest pid. 
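+ *
+ * As an illustration of the helper macros below: TESTAB(a, b) packs
+ * the two test results into two bits, so BOTH means both tests held,
+ * ONLYA only the first (p1's), and ONLYB only the second (p2's).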
+ */ +#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define TESTAB(a, b) ((a)<<1 | (b)) +#define ONLYA 2 +#define ONLYB 1 +#define BOTH 3 + +static int +proc_compare(struct proc *p1, struct proc *p2) +{ + + int esta, estb; + struct ksegrp *kg; + mtx_assert(&sched_lock, MA_OWNED); + if (p1 == NULL) + return (1); + + /* + * see if at least one of them is runnable + */ + switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + case ONLYA: + return (0); + case ONLYB: + return (1); + case BOTH: + /* + * tie - favor one with highest recent cpu utilization + */ + esta = estb = 0; + FOREACH_KSEGRP_IN_PROC(p1,kg) { + esta += kg->kg_estcpu; + } + FOREACH_KSEGRP_IN_PROC(p2,kg) { + estb += kg->kg_estcpu; + } + if (estb > esta) + return (1); + if (esta > estb) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * weed out zombies + */ + switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + +#if 0 /* XXXKSE */ + /* + * pick the one with the smallest sleep time + */ + if (p2->p_slptime > p1->p_slptime) + return (0); + if (p1->p_slptime > p2->p_slptime) + return (1); + /* + * favor one sleeping in a non-interruptible sleep + */ + if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) + return (1); + if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) + return (0); +#endif + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ +} + +/* + * Output char to tty; console putchar style. + */ +int +tputchar(int c, struct tty *tp) +{ + int s; + + s = spltty(); + if (!ISSET(tp->t_state, TS_CONNECTED)) { + splx(s); + return (-1); + } + if (c == '\n') + (void)ttyoutput('\r', tp); + (void)ttyoutput(c, tp); + ttstart(tp); + splx(s); + return (0); +} + +/* + * Sleep on chan, returning ERESTART if tty changed while we napped and + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If + * the tty is revoked, restarting a pending call will redo validation done + * at the start of the call. + */ +int +ttysleep(struct tty *tp, void *chan, int pri, char *wmesg, int timo) +{ + int error; + int gen; + + gen = tp->t_gen; + error = tsleep(chan, pri, wmesg, timo); + if (error) + return (error); + return (tp->t_gen == gen ? 0 : ERESTART); +} + +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc(struct tty *tp) +{ + + if (tp) + return(tp); + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO); + ttyregister(tp); + return (tp); +} + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). 
+ */ +void +ttyfree(struct tty *tp) +{ + free(tp, M_TTYS); +} +#endif /* 0 */ + +void +ttyregister(struct tty *tp) +{ + tp->t_timeout = -1; + SLIST_INSERT_HEAD(&tty_list, tp, t_list); +} + +static int +sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) +{ + struct tty *tp; + struct xtty xt; + int error; + + SLIST_FOREACH(tp, &tty_list, t_list) { + bzero(&xt, sizeof xt); + xt.xt_size = sizeof xt; +#define XT_COPY(field) xt.xt_##field = tp->t_##field + xt.xt_rawcc = tp->t_rawq.c_cc; + xt.xt_cancc = tp->t_canq.c_cc; + xt.xt_outcc = tp->t_outq.c_cc; + XT_COPY(line); + if (tp->t_dev) + xt.xt_dev = dev2udev(tp->t_dev); + XT_COPY(state); + XT_COPY(flags); + XT_COPY(timeout); + if (tp->t_pgrp) + xt.xt_pgid = tp->t_pgrp->pg_id; + if (tp->t_session) + xt.xt_sid = tp->t_session->s_sid; + XT_COPY(termios); + XT_COPY(winsize); + XT_COPY(column); + XT_COPY(rocount); + XT_COPY(rocol); + XT_COPY(ififosize); + XT_COPY(ihiwat); + XT_COPY(ilowat); + XT_COPY(ispeedwat); + XT_COPY(ohiwat); + XT_COPY(olowat); + XT_COPY(ospeedwat); +#undef XT_COPY + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_ttys, "S,xtty", "All ttys"); +SYSCTL_LONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, + &tk_nin, 0, "Total TTY in characters"); +SYSCTL_LONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, + &tk_nout, 0, "Total TTY out characters"); + +void +nottystop(struct tty *tp, int rw) +{ + + return; +} + +int +ttyread(dev_t dev, struct uio *uio, int flag) +{ + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) + return (ENODEV); + return ((*linesw[tp->t_line].l_read)(tp, uio, flag)); +} + +int +ttywrite(dev_t dev, struct uio *uio, int flag) +{ + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) + return (ENODEV); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c new file mode 100644 index 0000000..01628ff --- /dev/null +++ b/sys/kern/tty_compat.c @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +/* + * mapping routines for old line discipline (yuck) + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl_compat.h> +#include <sys/tty.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int ttcompatgetflags(struct tty *tp); +static void ttcompatsetflags(struct tty *tp, struct termios *t); +static void ttcompatsetlflags(struct tty *tp, struct termios *t); +static int ttcompatspeedtab(int speed, struct speedtab *table); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); + +static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, + { 38400, 15 }, + { 19200, 14 }, + { 9600, 13 }, + { 4800, 12 }, + { 2400, 11 }, + { 1800, 10 }, + { 1200, 9 }, + { 600, 8 }, + { 300, 7 }, + { 200, 6 }, + { 150, 5 }, + { 134, 4 }, + { 110, 3 }, + { 75, 2 }, + { 50, 1 }, + { 0, 0 }, + { -1, -1 }, +}; +static int compatspcodes[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, +}; + +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) + register struct tty *tp; + u_long *com; + caddr_t data; + struct termios *term; +{ + switch (*com) { + case TIOCSETP: + case TIOCSETN: { + register struct sgttyb *sg = (struct sgttyb *)data; + int speed; + + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; + else + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; + else + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
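+		    /* old TIOCSETP drained output and flushed input, hence
+		     * TIOCSETAF; TIOCSETN maps to plain TIOCSETA */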
TIOCSETAF : TIOCSETA; + break; + } + case TIOCSETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VINTR] = tc->t_intrc; + cc[VQUIT] = tc->t_quitc; + cc[VSTART] = tc->t_startc; + cc[VSTOP] = tc->t_stopc; + cc[VEOF] = tc->t_eofc; + cc[VEOL] = tc->t_brkc; + if (tc->t_brkc == -1) + cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; + break; + } + case TIOCSLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VSUSP] = ltc->t_suspc; + cc[VDSUSP] = ltc->t_dsuspc; + cc[VREPRINT] = ltc->t_rprntc; + cc[VDISCARD] = ltc->t_flushc; + cc[VWERASE] = ltc->t_werasc; + cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + u_long com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; + break; + } + case TIOCGLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc = tp->t_cc; + + ltc->t_suspc = cc[VSUSP]; + ltc->t_dsuspc = cc[VDSUSP]; + ltc->t_rprntc = cc[VREPRINT]; + ltc->t_flushc = cc[VDISCARD]; + ltc->t_werasc = cc[VWERASE]; + ltc->t_lnextc = cc[VLNEXT]; + break; + } + case TIOCLGET: + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; + if (ttydebug) + printf("CLGET: returning %x\n", *(int *)data); + break; + + case OTIOCGETD: + *(int *)data = tp->t_line ? tp->t_line : 2; + break; + + case OTIOCSETD: { + int ldisczero = 0; + + return (ttioctl(tp, TIOCSETD, + *(int *)data == 2 ? 
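+		    /* the old NTTYDISC (2) folds into the standard
+		     * termios discipline, slot 0 */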
(caddr_t)&ldisczero : data, flag)); + } + + case OTIOCCONS: + *(int *)data = 1; + return (ttioctl(tp, TIOCCONS, data, flag)); + + default: + return (ENOIOCTL); + } + return (0); +} + +static int +ttcompatgetflags(tp) + register struct tty *tp; +{ + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; + register int flags = 0; + + if (iflag&IXOFF) + flags |= TANDEM; + if (iflag&ICRNL || oflag&ONLCR) + flags |= CRMOD; + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { + if (iflag&INPCK) { + if (cflag&PARODD) + flags |= ODDP; + else + flags |= EVENP; + } else + flags |= EVENP | ODDP; + } + + if ((lflag&ICANON) == 0) { + /* fudge */ + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || (cflag&(CSIZE|PARENB)) != CS8) + flags |= CBREAK; + else + flags |= RAW; + } + if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8) + flags |= LITOUT; + if (cflag&MDMBUF) + flags |= MDMBUF; + if ((cflag&HUPCL) == 0) + flags |= NOHANG; + if (oflag&OXTABS) + flags |= XTABS; + if (lflag&ECHOE) + flags |= CRTERA|CRTBS; + if (lflag&ECHOKE) + flags |= CRTKIL|CRTBS; + if (lflag&ECHOPRT) + flags |= PRTERA; + if (lflag&ECHOCTL) + flags |= CTLECH; + if ((iflag&IXANY) == 0) + flags |= DECCTQ; + flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); + if (ttydebug) + printf("getflags: %x\n", flags); + return (flags); +} + +static void +ttcompatsetflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + if (flags & RAW) { + iflag = IGNBRK; + lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); + } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + iflag |= BRKINT|IXON|IMAXBEL; + lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ + if (flags & XTABS) + oflag |= OXTABS; + else + oflag &= ~OXTABS; + if (flags & CBREAK) + lflag &= ~ICANON; + else + lflag |= ICANON; + if (flags&CRMOD) { + iflag |= ICRNL; + oflag |= ONLCR; + } else { + iflag &= ~ICRNL; + oflag &= ~ONLCR; + } + } + if (flags&ECHO) + lflag |= ECHO; + else + lflag &= ~ECHO; + + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + /* XXX don't set INPCK if RAW or PASS8? 
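+	 * Note that EVENP|ODDP both set (what ttcompatgetflags() returns
+	 * when input parity checking is off) falls through to clearing INPCK.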
*/ + if ((flags&(EVENP|ODDP)) == EVENP) { + iflag |= INPCK; + cflag &= ~PARODD; + } else if ((flags&(EVENP|ODDP)) == ODDP) { + iflag |= INPCK; + cflag |= PARODD; + } else + iflag &= ~INPCK; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} + +static void +ttcompatsetlflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + if (flags&CRTERA) + lflag |= ECHOE; + else + lflag &= ~ECHOE; + if (flags&CRTKIL) + lflag |= ECHOKE; + else + lflag &= ~ECHOKE; + if (flags&PRTERA) + lflag |= ECHOPRT; + else + lflag &= ~ECHOPRT; + if (flags&CTLECH) + lflag |= ECHOCTL; + else + lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + if (flags & MDMBUF) + cflag |= MDMBUF; + else + cflag &= ~MDMBUF; + if (flags&NOHANG) + cflag &= ~HUPCL; + else + cflag |= HUPCL; + lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); + lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. + */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c new file mode 100644 index 0000000..0609dc9 --- /dev/null +++ b/sys/kern/tty_conf.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/conf.h> + +#ifndef MAXLDISC +#define MAXLDISC 9 +#endif + +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), +#endif + NODISC(3), /* loadable */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* NETGRAPHDISC */ + NODISC(7), /* loadable */ + NODISC(8), /* loadable */ +}; + +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); + +#define LOADABLE_LDISC 7 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. + */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; +{ + int slot = -1; + + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } + + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; + + return slot; +} + +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. + * + * discipline: Index for discipline to unload. 
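+ *
+ * A loadable discipline would typically pair the two calls; an
+ * illustrative sketch ("foo_disc" is hypothetical):
+ *
+ *	slot = ldisc_register(LDISC_LOAD, &foo_disc);
+ *	...
+ *	ldisc_deregister(slot);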
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} + +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} + +/* + * Do nothing specific version of line + * discipline specific ioctl command. + */ +int +l_nullioctl(tp, cmd, data, flags, td) + struct tty *tp; + u_long cmd; + char *data; + int flags; + struct thread *td; +{ + + return (ENOIOCTL); +} diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c new file mode 100644 index 0000000..91713c1 --- /dev/null +++ b/sys/kern/tty_cons.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/tty.h> +#include <sys/uio.h> +#include <sys/vnode.h> + +#include <ddb/ddb.h> + +#include <machine/cpu.h> + +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_poll_t cnpoll; +static d_kqfilter_t cnkqfilter; + +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = { + /* open */ cnopen, + /* close */ cnclose, + /* read */ cnread, + /* write */ cnwrite, + /* ioctl */ cnioctl, + /* poll */ cnpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "console", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ cnkqfilter, +}; + +struct cn_device { + STAILQ_ENTRY(cn_device) cnd_next; + char cnd_name[16]; + struct vnode *cnd_vp; + struct consdev *cnd_cn; +}; + +#define CNDEVPATHMAX 32 +#define CNDEVTAB_SIZE 4 +static struct cn_device cn_devtab[CNDEVTAB_SIZE]; +static STAILQ_HEAD(, cn_device) cn_devlist = + STAILQ_HEAD_INITIALIZER(cn_devlist); + +#define CND_INVALID(cnd, td) \ + (cnd == NULL || cnd->cnd_vp == NULL || \ + (cnd->cnd_vp->v_type == VBAD && !cn_devopen(cnd, td, 1))) + +static udev_t cn_udev_t; +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD, + &cn_udev_t, sizeof cn_udev_t, "T,dev_t", ""); + +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ +static int cn_mute; +static int openflag; /* how /dev/console was opened */ +static int cn_is_open; +static dev_t cn_devfsdev; /* represents the device private info */ +static u_char console_pausing; /* pause after each line during probe */ +static char *console_pausestr= +"<pause; press any key to proceed to next line or '.' to end pause mode>"; + +void cndebug(char *); + +CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +SET_DECLARE(cons_set, struct consdev); + +void +cninit(void) +{ + struct consdev *best_cn, *cn, **list; + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * Find the first console with the highest priority. + */ + best_cn = NULL; + SET_FOREACH(list, cons_set) { + cn = *list; + if (cn->cn_probe == NULL) + continue; + cn->cn_probe(cn); + if (cn->cn_pri == CN_DEAD) + continue; + if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri) + best_cn = cn; + if (boothowto & RB_MULTIPLE) { + /* + * Initialize console, and attach to it. + */ + cnadd(cn); + cn->cn_init(cn); + } + } + if (best_cn == NULL) + return; + if ((boothowto & RB_MULTIPLE) == 0) { + cnadd(best_cn); + best_cn->cn_init(best_cn); + } + if (boothowto & RB_PAUSE) + console_pausing = 1; + /* + * Make the best console the preferred console. 
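+	 * (cnselect() moves it to the head of cn_devlist, so it is the
+	 * device that cnread(), cnwrite() and cnioctl() reach first.)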
+ */ + cnselect(best_cn); +} + +void +cninit_finish() +{ + console_pausing = 0; +} + +/* add a new physical console to back the virtual console */ +int +cnadd(struct consdev *cn) +{ + struct cn_device *cnd; + int i; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + if (cnd->cnd_cn == cn) + return (0); + for (i = 0; i < CNDEVTAB_SIZE; i++) { + cnd = &cn_devtab[i]; + if (cnd->cnd_cn == NULL) + break; + } + if (cnd->cnd_cn != NULL) + return (ENOMEM); + cnd->cnd_cn = cn; + STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next); + return (0); +} + +void +cnremove(struct consdev *cn) +{ + struct cn_device *cnd; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if (cnd->cnd_cn != cn) + continue; + STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); + if (cnd->cnd_vp != NULL) + vn_close(cnd->cnd_vp, openflag, NOCRED, NULL); + cnd->cnd_vp = NULL; + cnd->cnd_cn = NULL; + cnd->cnd_name[0] = '\0'; +#if 0 + /* + * XXX + * syscons gets really confused if console resources are + * freed after the system has initialized. + */ + if (cn->cn_term != NULL) + cn->cn_term(cn); +#endif + return; + } +} + +void +cnselect(struct consdev *cn) +{ + struct cn_device *cnd; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if (cnd->cnd_cn != cn) + continue; + if (cnd == STAILQ_FIRST(&cn_devlist)) + return; + STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); + STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next); + return; + } +} + +void +cndebug(char *str) +{ + int i, len; + + len = strlen(str); + cnputc('>'); cnputc('>'); cnputc('>'); cnputc(' '); + for (i = 0; i < len; i++) + cnputc(str[i]); + cnputc('\n'); +} + +static int +sysctl_kern_console(SYSCTL_HANDLER_ARGS) +{ + struct cn_device *cnd; + struct consdev *cp, **list; + char *name, *p; + int delete, len, error; + + len = 2; + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev != NULL) + len += strlen(devtoname(cp->cn_dev)) + 1; + } + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + len += strlen(devtoname(cnd->cnd_cn->cn_dev)) + 1; + len = len > CNDEVPATHMAX ? len : CNDEVPATHMAX; + MALLOC(name, char *, len, M_TEMP, M_WAITOK | M_ZERO); + p = name; + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + p += sprintf(p, "%s,", devtoname(cnd->cnd_cn->cn_dev)); + *p++ = '/'; + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev != NULL) + p += sprintf(p, "%s,", devtoname(cp->cn_dev)); + } + error = sysctl_handle_string(oidp, name, len, req); + if (error == 0 && req->newptr != NULL) { + p = name; + error = ENXIO; + delete = 0; + if (*p == '-') { + delete = 1; + p++; + } + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev == NULL || + strcmp(p, devtoname(cp->cn_dev)) != 0) + continue; + if (delete) { + cnremove(cp); + error = 0; + } else { + error = cnadd(cp); + if (error == 0) + cnselect(cp); + } + break; + } + } + FREE(name, M_TEMP); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING|CTLFLAG_RW, + 0, 0, sysctl_kern_console, "A", "Console device control"); + +/* + * User has changed the state of the console muting. + * This may require us to open or close the device in question. 
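+ * (Typically driven from userland with "sysctl kern.consmute=1".)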
+ */ +static int +sysctl_kern_consmute(SYSCTL_HANDLER_ARGS) +{ + int error; + int ocn_mute; + + ocn_mute = cn_mute; + error = sysctl_handle_int(oidp, &cn_mute, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (ocn_mute && !cn_mute && cn_is_open) + error = cnopen(NODEV, openflag, 0, curthread); + else if (!ocn_mute && cn_mute && cn_is_open) { + error = cnclose(NODEV, openflag, 0, curthread); + cn_is_open = 1; /* XXX hack */ + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof(cn_mute), sysctl_kern_consmute, "I", ""); + +static int +cn_devopen(struct cn_device *cnd, struct thread *td, int forceopen) +{ + char path[CNDEVPATHMAX]; + struct nameidata nd; + struct vnode *vp; + dev_t dev; + int error; + + if ((vp = cnd->cnd_vp) != NULL) { + if (!forceopen && vp->v_type != VBAD) { + dev = vp->v_rdev; + return ((*devsw(dev)->d_open)(dev, openflag, 0, td)); + } + cnd->cnd_vp = NULL; + vn_close(vp, openflag, td->td_ucred, td); + } + if (cnd->cnd_name[0] == '\0') + strncpy(cnd->cnd_name, devtoname(cnd->cnd_cn->cn_dev), + sizeof(cnd->cnd_name)); + snprintf(path, sizeof(path), "/dev/%s", cnd->cnd_name); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td); + error = vn_open(&nd, &openflag, 0); + if (error == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type == VCHR) + cnd->cnd_vp = nd.ni_vp; + else + vn_close(nd.ni_vp, openflag, td->td_ucred, td); + } + return (cnd->cnd_vp != NULL); +} + +static int +cnopen(dev_t dev, int flag, int mode, struct thread *td) +{ + struct cn_device *cnd; + + openflag = flag | FWRITE; /* XXX */ + cn_is_open = 1; /* console is logically open */ + if (cn_mute) + return (0); + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + cn_devopen(cnd, td, 0); + return (0); +} + +static int +cnclose(dev_t dev, int flag, int mode, struct thread *td) +{ + struct cn_device *cnd; + struct vnode *vp; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if ((vp = cnd->cnd_vp) == NULL) + continue; + cnd->cnd_vp = NULL; + vn_close(vp, openflag, td->td_ucred, td); + } + cn_is_open = 0; + return (0); +} + +static int +cnread(dev_t dev, struct uio *uio, int flag) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + return (0); + dev = cnd->cnd_vp->v_rdev; + return ((*devsw(dev)->d_read)(dev, uio, flag)); +} + +static int +cnwrite(dev_t dev, struct uio *uio, int flag) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + goto done; + if (constty) + dev = constty->t_dev; + else + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) { + log_console(uio); + return ((*devsw(dev)->d_write)(dev, uio, flag)); + } +done: + uio->uio_resid = 0; /* dump the data */ + return (0); +} + +static int +cnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct cn_device *cnd; + int error; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, td)) + return (0); + /* + * Superuser can always use this to wrest control of console + * output from the "virtual" console. 
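+	 * (A privileged TIOCCONS here simply clears constty, undoing an
+	 * earlier redirection of console output to another tty.)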
+ */ + if (cmd == TIOCCONS && constty) { + error = suser(td); + if (error) + return (error); + constty = NULL; + return (0); + } + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_ioctl)(dev, cmd, data, flag, td)); + return (0); +} + +/* + * XXX + * poll/kqfilter do not appear to be correct + */ +static int +cnpoll(dev_t dev, int events, struct thread *td) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, td)) + return (0); + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_poll)(dev, events, td)); + return (0); +} + +static int +cnkqfilter(dev_t dev, struct knote *kn) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + return (1); + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_kqfilter)(dev, kn)); + return (1); +} + +/* + * Low level console routines. + */ +int +cngetc(void) +{ + int c; + + if (cn_mute) + return (-1); + while ((c = cncheckc()) == -1) + ; + if (c == '\r') + c = '\n'; /* console input is always ICRNL */ + return (c); +} + +int +cncheckc(void) +{ + struct cn_device *cnd; + struct consdev *cn; + int c; + + if (cn_mute) + return (-1); + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + c = cn->cn_checkc(cn->cn_dev); + if (c != -1) { + return (c); + } + } + return (-1); +} + +void +cnputc(int c) +{ + struct cn_device *cnd; + struct consdev *cn; + char *cp; + + if (cn_mute || c == '\0') + return; + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + if (c == '\n') + cn->cn_putc(cn->cn_dev, '\r'); + cn->cn_putc(cn->cn_dev, c); + } +#ifdef DDB + if (console_pausing && !db_active && (c == '\n')) { +#else + if (console_pausing && (c == '\n')) { +#endif + for (cp = console_pausestr; *cp != '\0'; cp++) + cnputc(*cp); + if (cngetc() == '.') + console_pausing = 0; + cnputc('\r'); + for (cp = console_pausestr; *cp != '\0'; cp++) + cnputc(' '); + cnputc('\r'); + } +} + +void +cndbctl(int on) +{ + struct cn_device *cnd; + struct consdev *cn; + static int refcount; + + if (!on) + refcount--; + if (refcount == 0) + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + if (cn->cn_dbctl != NULL) + cn->cn_dbctl(cn->cn_dev, on); + } + if (on) + refcount++; +} + +static void +cn_drvinit(void *unused) +{ + + cn_devfsdev = make_dev(&cn_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "console"); +} + +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c new file mode 100644 index 0000000..7d6e736 --- /dev/null +++ b/sys/kern/tty_pty.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $FreeBSD$ + */ + +/* + * Pseudo-teletype Driver + * (Actually two drivers, requiring two entries in 'cdevsw') + */ +#include "opt_compat.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/malloc.h> + +static MALLOC_DEFINE(M_PTY, "ptys", "pty data structures"); + +static void ptsstart(struct tty *tp); +static void ptsstop(struct tty *tp, int rw); +static void ptcwakeup(struct tty *tp, int flag); +static dev_t ptyinit(dev_t cdev); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_poll_t ptcpoll; + +#define CDEV_MAJOR_S 5 +static struct cdevsw pts_cdevsw = { + /* open */ ptsopen, + /* close */ ptsclose, + /* read */ ptsread, + /* write */ ptswrite, + /* ioctl */ ptyioctl, + /* poll */ ttypoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "pts", + /* maj */ CDEV_MAJOR_S, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ ttykqfilter, +}; + +#define CDEV_MAJOR_C 6 +static struct cdevsw ptc_cdevsw = { + /* open */ ptcopen, + /* close */ ptcclose, + /* read */ ptcread, + /* write */ ptcwrite, + /* ioctl */ ptyioctl, + /* poll */ ptcpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ptc", + /* maj */ CDEV_MAJOR_C, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ ttykqfilter, +}; + +#define BUFSIZ 100 /* Chunk size iomoved to/from user */ + +struct pt_ioctl { + int pt_flags; + struct selinfo pt_selr, pt_selw; + u_char pt_send; + u_char pt_ucntl; + struct tty pt_tty; + dev_t devs, devc; + struct prison *pt_prison; +}; + +#define PF_PKT 0x08 /* packet mode */ +#define PF_STOPPED 0x10 /* user told stopped */ +#define PF_REMOTE 0x20 /* remote and flow controlled input */ +#define PF_NOSTOP 0x40 +#define PF_UCNTL 0x80 /* user control mode */ + +static char *names = "pqrsPQRS"; +/* + * This function creates and initializes a pts/ptc pair + * + * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == 
/dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * + * XXX: define and add mapping of upper minor bits to allow more + * than 256 ptys. + */ +static dev_t +ptyinit(dev_t devc) +{ + dev_t devs; + struct pt_ioctl *pt; + int n; + + n = minor(devc); + /* For now we only map the lower 8 bits of the minor */ + if (n & ~0xff) + return (NODEV); + + devc->si_flags &= ~SI_CHEAPCLONE; + + pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO); + pt->devs = devs = make_dev(&pts_cdevsw, n, + UID_ROOT, GID_WHEEL, 0666, "tty%c%r", names[n / 32], n % 32); + pt->devc = devc; + + devs->si_drv1 = devc->si_drv1 = pt; + devs->si_tty = devc->si_tty = &pt->pt_tty; + pt->pt_tty.t_dev = devs; + ttyregister(&pt->pt_tty); + return (devc); +} + +/*ARGSUSED*/ +static int +ptsopen(dev, flag, devtype, td) + dev_t dev; + int flag, devtype; + struct thread *td; +{ + register struct tty *tp; + int error; + struct pt_ioctl *pti; + + if (!dev->si_drv1) + return(ENXIO); + pti = dev->si_drv1; + tp = dev->si_tty; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + } else if (tp->t_state & TS_XCLUDE && suser(td)) { + return (EBUSY); + } else if (pti->pt_prison != td->td_ucred->cr_prison) { + return (EBUSY); + } + if (tp->t_oproc) /* Ctrlr still around. */ + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + while ((tp->t_state & TS_CARR_ON) == 0) { + if (flag&FNONBLOCK) + break; + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +static int +ptsclose(dev, flag, mode, td) + dev_t dev; + int flag, mode; + struct thread *td; +{ + register struct tty *tp; + int err; + + tp = dev->si_tty; + err = (*linesw[tp->t_line].l_close)(tp, flag); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); + return (err); +} + +static int +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + register struct tty *tp = dev->si_tty; + register struct pt_ioctl *pti = dev->si_drv1; + struct pgrp *pg; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (SIGISMEMBER(p->p_sigignore, SIGTTIN) || + SIGISMEMBER(p->p_sigmask, SIGTTIN) || + p->p_pgrp->pg_jobc == 0 || p->p_flag & P_PPWAIT) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + return (EIO); + } + pg = p->p_pgrp; + PROC_UNLOCK(p); + PGRP_LOCK(pg); + sx_sunlock(&proctree_lock); + pgsignal(pg, SIGTTIN, 1); + PGRP_UNLOCK(pg); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. 
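+ * Data goes through the line discipline onto t_outq, where ptcread()
+ * on the master side picks it up.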
+ * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. + */ +static int +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = dev->si_tty; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. + */ +static void +ptsstart(tp) + struct tty *tp; +{ + register struct pt_ioctl *pti = tp->t_dev->si_drv1; + + if (tp->t_state & TS_TTSTOP) + return; + if (pti->pt_flags & PF_STOPPED) { + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send = TIOCPKT_START; + } + ptcwakeup(tp, FREAD); +} + +static void +ptcwakeup(tp, flag) + struct tty *tp; + int flag; +{ + struct pt_ioctl *pti = tp->t_dev->si_drv1; + + if (flag & FREAD) { + selwakeup(&pti->pt_selr); + wakeup(TSA_PTC_READ(tp)); + } + if (flag & FWRITE) { + selwakeup(&pti->pt_selw); + wakeup(TSA_PTC_WRITE(tp)); + } +} + +static int +ptcopen(dev, flag, devtype, td) + dev_t dev; + int flag, devtype; + struct thread *td; +{ + register struct tty *tp; + struct pt_ioctl *pti; + + if (!dev->si_drv1) + ptyinit(dev); + if (!dev->si_drv1) + return(ENXIO); + tp = dev->si_tty; + if (tp->t_oproc) + return (EIO); + tp->t_timeout = -1; + tp->t_oproc = ptsstart; + tp->t_stop = ptsstop; + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + tp->t_lflag &= ~EXTPROC; + pti = dev->si_drv1; + pti->pt_prison = td->td_ucred->cr_prison; + pti->pt_flags = 0; + pti->pt_send = 0; + pti->pt_ucntl = 0; + return (0); +} + +static int +ptcclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + register struct tty *tp; + + tp = dev->si_tty; + (void)(*linesw[tp->t_line].l_modem)(tp, 0); + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. + */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + + tp->t_oproc = 0; /* mark closed */ + return (0); +} + +static int +ptcread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp = dev->si_tty; + struct pt_ioctl *pti = dev->si_drv1; + char buf[BUFSIZ]; + int error = 0, cc; + + /* + * We want to block until the slave + * is open, and there's something to read; + * but if we lost the slave or we're NBIO, + * then return the appropriate error instead. 
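+	 * In packet mode (TIOCPKT) or user-control mode (TIOCUCNTL), each
+	 * read is prefixed with a status byte: zero for plain data,
+	 * otherwise the pending pt_send or pt_ucntl value.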
+ */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove((caddr_t)&tp->t_termios, cc, + uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state & TS_CONNECTED) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + ttwwakeup(tp); + return (error); +} + +static void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = tp->t_dev->si_drv1; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +static int +ptcpoll(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + register struct tty *tp = dev->si_tty; + struct pt_ioctl *pti = dev->si_drv1; + int revents = 0; + int s; + + if ((tp->t_state & TS_CONNECTED) == 0) + return (seltrue(dev, events, td) | POLLHUP); + + /* + * Need to block timeouts (ttrstart). + */ + s = spltty(); + + if (events & (POLLIN | POLLRDNORM)) + if ((tp->t_state & TS_ISOPEN) && + ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) || + ((pti->pt_flags & PF_PKT) && pti->pt_send) || + ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (tp->t_state & TS_ISOPEN && + ((pti->pt_flags & PF_REMOTE) ? 
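+		    /* remote mode: writable only while the canq is empty */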
+ (tp->t_canq.c_cc == 0) : + ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) || + (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON))))) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & POLLHUP) + if ((tp->t_state & TS_CARR_ON) == 0) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) + selrecord(td, &pti->pt_selr); + + if (events & (POLLOUT | POLLWRNORM)) + selrecord(td, &pti->pt_selw); + } + splx(s); + + return (revents); +} + +static int +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = dev->si_tty; + register u_char *cp = 0; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = dev->si_drv1; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; + return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. + */ + if (cc > 0) + break; + } + } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup(TSA_PTS_READ(tp)); + return (0); + } + while (uio->uio_resid > 0 || cc > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || !(tp->t_lflag&ICANON))) { + wakeup(TSA_HUP_OR_INPUT(tp)); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq, or an empty canq. + */ + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +/*ARGSUSED*/ +static int +ptyioctl(dev, cmd, data, flag, td) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct thread *td; +{ + register struct tty *tp = dev->si_tty; + register struct pt_ioctl *pti = dev->si_drv1; + register u_char *cc = tp->t_cc; + int stop, error; + + if (devsw(dev)->d_open == ptcopen) { + switch (cmd) { + + case TIOCGPGRP: + /* + * We avoid calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. 
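+			 * (ttioctl() only answers TIOCGPGRP on the caller's
+			 * controlling terminal, which the master is not, so
+			 * answer directly here.)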
+ */ + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + } + + /* + * The rest of the ioctls shouldn't be called until + * the slave is open. + */ + if ((tp->t_state & TS_ISOPEN) == 0) + return (EAGAIN); + + switch (cmd) { +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in + * the outq. + */ + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG || + *(unsigned int *)data == 0) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + } + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_lflag & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, td); + if (error == ENOIOCTL) + error = ttioctl(tp, cmd, data, flag); + if (error == ENOIOCTL) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
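+	 * (The TIOCPKT_IOCTL status byte tells the master to re-read the
+	 * termios state; see the matching code in ptcread().)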
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} + + +static void ptc_drvinit(void *unused); + +static void pty_clone(void *arg, char *name, int namelen, dev_t *dev); + +static void +pty_clone(arg, name, namelen, dev) + void *arg; + char *name; + int namelen; + dev_t *dev; +{ + int u; + + if (*dev != NODEV) + return; + if (bcmp(name, "pty", 3) != 0) + return; + if (name[5] != '\0') + return; + switch (name[3]) { + case 'p': u = 0; break; + case 'q': u = 32; break; + case 'r': u = 64; break; + case 's': u = 96; break; + case 'P': u = 128; break; + case 'Q': u = 160; break; + case 'R': u = 192; break; + case 'S': u = 224; break; + default: return; + } + if (name[4] >= '0' && name[4] <= '9') + u += name[4] - '0'; + else if (name[4] >= 'a' && name[4] <= 'v') + u += name[4] - 'a' + 10; + else + return; + *dev = make_dev(&ptc_cdevsw, u, + UID_ROOT, GID_WHEEL, 0666, "pty%c%r", names[u / 32], u % 32); + (*dev)->si_flags |= SI_CHEAPCLONE; + return; +} + +static void +ptc_drvinit(unused) + void *unused; +{ + EVENTHANDLER_REGISTER(dev_clone, pty_clone, 0, 1000); + cdevsw_add(&pts_cdevsw); + cdevsw_add(&ptc_cdevsw); +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c new file mode 100644 index 0000000..78bb231 --- /dev/null +++ b/sys/kern/tty_subr.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init(void *); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc(void); +static void cblock_alloc_cblocks(int number); +static void cblock_free(struct cblock *cblockp); +static void cblock_free_cblocks(int number); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + int cbsize = CBSIZE; + + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount, + cfreecount - cslushcount * cbsize, cslushcount * cbsize); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). 
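+ *
+ * As a hedged illustration (not part of this change), a caller such as a
+ * tty open routine would typically reserve space for each queue here and
+ * release it again on final close with clist_free_cblocks(); the sizes
+ * below are assumptions chosen only for the example:
+ *
+ *	s = spltty();
+ *	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_outq, TTYMAXHIWAT + 200, TTYMAXHIWAT + 200);
+ *	splx(s);
+ *
+ * See ttyopen() in tty.c for the reservations actually used there.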
+ */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. 
+ */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. 
+ */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a separate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. 
+ */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 0000000..e1e03bd --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +/* + * Indirect driver for controlling tty. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/proc.h> +#include <sys/ttycom.h> +#include <sys/vnode.h> + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_poll_t cttypoll; + +#define CDEV_MAJOR 1 + +static struct cdevsw ctty_cdevsw = { + /* open */ cttyopen, + /* close */ nullclose, + /* read */ cttyread, + /* write */ cttywrite, + /* ioctl */ cttyioctl, + /* poll */ cttypoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ctty", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY, +}; + +#define cttyvp(td) ((td)->td_proc->p_flag & P_CONTROLT ? 
(td)->td_proc->p_session->s_ttyvp : NULL) + +/*ARGSUSED*/ +static int +cttyopen(dev, flag, mode, td) + dev_t dev; + int flag, mode; + struct thread *td; +{ + struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (ENXIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_OPEN(ttyvp, flag, NOCRED, td); + VOP_UNLOCK(ttyvp, 0, td); + return (error); +} + +/*ARGSUSED*/ +static int +cttyread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = uio->uio_td; + register struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_READ(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, td); + return (error); +} + +/*ARGSUSED*/ +static int +cttywrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = uio->uio_td; + struct vnode *ttyvp; + struct mount *mp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + mp = NULL; + if (ttyvp->v_type != VCHR && + (error = vn_start_write(ttyvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_WRITE(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, td); + vn_finished_write(mp); + return (error); +} + +/*ARGSUSED*/ +static int +cttyioctl(dev, cmd, addr, flag, td) + dev_t dev; + u_long cmd; + caddr_t addr; + int flag; + struct thread *td; +{ + struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ + if (cmd == TIOCNOTTY) { + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + error = 0; + if (!SESS_LEADER(td->td_proc)) + td->td_proc->p_flag &= ~P_CONTROLT; + else + error = EINVAL; + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + return (error); + } + return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, td)); +} + +/*ARGSUSED*/ +static int +cttypoll(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + struct vnode *ttyvp; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + /* try operation to get EOF/failure */ + return (seltrue(dev, events, td)); + return (VOP_POLL(ttyvp, events, td->td_ucred, td)); +} + +static void ctty_clone(void *arg, char *name, int namelen, dev_t *dev); + +static dev_t ctty; + +static void +ctty_clone(void *arg, char *name, int namelen, dev_t *dev) +{ + struct vnode *vp; + + if (*dev != NODEV) + return; + if (strcmp(name, "tty")) + return; + vp = cttyvp(curthread); + if (vp == NULL) { + if (ctty) + *dev = ctty; + } else + *dev = vp->v_rdev; +} + + +static void ctty_drvinit(void *unused); +static void +ctty_drvinit(unused) + void *unused; +{ + + if (devfs_present) { + EVENTHANDLER_REGISTER(dev_clone, 
ctty_clone, 0, 1000); + ctty = make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "ctty"); + } else { + make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "tty"); + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c new file mode 100644 index 0000000..b31026a --- /dev/null +++ b/sys/kern/uipc_accf.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2000 Paycounter, Inc. + * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> + +static SLIST_HEAD(, accept_filter) accept_filtlsthd = + SLIST_HEAD_INITIALIZER(&accept_filtlsthd); + +MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); + +static int unloadable = 0; + +SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */ +SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); +SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, + "Allow unload of accept filters (not recommended)"); + +/* + * must be passed a malloc'd structure so we don't explode if the kld + * is unloaded, we leak the struct on deallocation to deal with this, + * but if a filter is loaded with the same name as a leaked one we re-use + * the entry. 
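+ *
+ * For illustration only (not part of this change), a filter module is
+ * expected to hand its structure to accept_filt_generic_mod_event() via
+ * the module system; the filter name and callback below are hypothetical:
+ *
+ *	static struct accept_filter accf_example_filter = {
+ *		"exampleready",			/- accf_name -/
+ *		accf_example_callback,		/- accf_callback -/
+ *		NULL,				/- accf_create -/
+ *		NULL				/- accf_destroy -/
+ *	};
+ *	static moduledata_t accf_example_mod = {
+ *		"accf_example",
+ *		accept_filt_generic_mod_event,
+ *		&accf_example_filter
+ *	};
+ *	DECLARE_MODULE(accf_example, accf_example_mod, SI_SUB_DRIVERS,
+ *	    SI_ORDER_MIDDLE);
+ *
+ * (The field notes above use /- -/ in place of nested comment markers.)
+ * See accf_data(9) and accf_http(9) for the real in-tree filters.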
+ */ +int +accept_filt_add(struct accept_filter *filt) +{ + struct accept_filter *p; + + SLIST_FOREACH(p, &accept_filtlsthd, accf_next) + if (strcmp(p->accf_name, filt->accf_name) == 0) { + if (p->accf_callback != NULL) { + return (EEXIST); + } else { + p->accf_callback = filt->accf_callback; + FREE(filt, M_ACCF); + return (0); + } + } + + if (p == NULL) + SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); + return (0); +} + +int +accept_filt_del(char *name) +{ + struct accept_filter *p; + + p = accept_filt_get(name); + if (p == NULL) + return (ENOENT); + + p->accf_callback = NULL; + return (0); +} + +struct accept_filter * +accept_filt_get(char *name) +{ + struct accept_filter *p; + + SLIST_FOREACH(p, &accept_filtlsthd, accf_next) + if (strcmp(p->accf_name, name) == 0) + return (p); + + return (NULL); +} + +int +accept_filt_generic_mod_event(module_t mod, int event, void *data) +{ + struct accept_filter *p; + struct accept_filter *accfp = (struct accept_filter *) data; + int s, error; + + switch (event) { + case MOD_LOAD: + MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, M_WAITOK); + bcopy(accfp, p, sizeof(*p)); + s = splnet(); + error = accept_filt_add(p); + splx(s); + break; + + case MOD_UNLOAD: + /* + * Do not support unloading yet. we don't keep track of refcounts + * and unloading an accept filter callback and then having it called + * is a bad thing. A simple fix would be to track the refcount + * in the struct accept_filter. + */ + if (unloadable != 0) { + s = splnet(); + error = accept_filt_del(accfp->accf_name); + splx(s); + } else + error = EOPNOTSUPP; + break; + + case MOD_SHUTDOWN: + error = 0; + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} diff --git a/sys/kern/uipc_cow.c b/sys/kern/uipc_cow.c new file mode 100644 index 0000000..239e7c5 --- /dev/null +++ b/sys/kern/uipc_cow.c @@ -0,0 +1,181 @@ +/*- + * Copyright (c) 1997, Duke University + * All rights reserved. + * + * Author: + * Andrew Gallatin <gallatin@cs.duke.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by Duke University + * 4. The name of Duke University may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This is a set of routines for enabling and disabling copy on write + * protection for data written into sockets. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/mbuf.h> +#include <sys/socketvar.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#if 0 +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <vm/swap_pager.h> +#endif + + +struct netsend_cow_stats { + int attempted; + int fail_not_mapped; + int fail_wired; + int fail_not_anon; + int fail_pmap_cow; + int fail_pg_error; + int fail_kva; + int free_post_exit; + int success; + int iodone; + int freed; +}; + +static struct netsend_cow_stats socow_stats = {0,0,0,0,0,0,0,0,0,0,0}; + +extern struct sf_buf *sf_bufs; +extern vm_offset_t sf_base; +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) +void sf_buf_free(caddr_t addr, void *args); +struct sf_buf *sf_buf_alloc(void); +static void socow_iodone(caddr_t addr, void *args); + +static void +socow_iodone(caddr_t addr, void *args) +{ + int s; + struct sf_buf *sf; + + vm_offset_t paddr; + vm_page_t pp; + + sf = dtosf(addr); + paddr = vtophys((vm_offset_t)addr); + pp = PHYS_TO_VM_PAGE(paddr); + s = splvm(); + /* remove COW mapping */ + vm_page_cowclear(pp); + vm_object_deallocate(pp->object); + splx(s); + /* note that sf_buf_free() unwires the page for us*/ + sf_buf_free(addr, NULL); + socow_stats.iodone++; +} + +int +socow_setup(struct mbuf *m0, struct uio *uio) +{ + struct sf_buf *sf; + vm_page_t pp; + vm_offset_t pa; + struct iovec *iov; + struct vmspace *vmspace; + struct vm_map *map; + vm_offset_t uva; + int s; + + vmspace = curproc->p_vmspace;; + map = &vmspace->vm_map; + uva = (vm_offset_t) uio->uio_iov->iov_base; + + s = splvm(); + + /* + * verify page is mapped & not already wired for i/o + */ + socow_stats.attempted++; + pa=pmap_extract(map->pmap, uva); + if(!pa) { + socow_stats.fail_not_mapped++; + splx(s); + return(0); + } + pp = PHYS_TO_VM_PAGE(pa); + + sf = sf_buf_alloc(); + sf->m = pp; + pmap_qenter(sf->kva, &pp, 1); + + /* + * set up COW + */ + vm_page_cowsetup(pp); + + /* + * wire the page for I/O + */ + vm_page_wire(pp); + + /* + * prevent the process from exiting on us. 
+ */ + vm_object_reference(pp->object); + + /* + * attach to mbuf + */ + m0->m_data = (caddr_t)sf->kva; + m0->m_len = PAGE_SIZE; + MEXTADD(m0, sf->kva, PAGE_SIZE, socow_iodone, NULL, 0, EXT_SFBUF); + socow_stats.success++; + + iov = uio->uio_iov; + iov->iov_base += PAGE_SIZE; + iov->iov_len -= PAGE_SIZE; + uio->uio_resid -= PAGE_SIZE; + uio->uio_offset += PAGE_SIZE; + if (iov->iov_len == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + } + + splx(s); + return(1); +} diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c new file mode 100644 index 0000000..b8321eb --- /dev/null +++ b/sys/kern/uipc_domain.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socketvar.h> +#include <sys/systm.h> +#include <vm/uma.h> + +/* + * System initialization + * + * Note: domain initialization takes place on a per domain basis + * as a result of traversing a SYSINIT linker set. Most likely, + * each domain would want to call DOMAIN_SET(9) itself, which + * would cause the domain to be added just after domaininit() + * is called during startup. + * + * See DOMAIN_SET(9) for details on its use. 
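+ *
+ * Purely as an illustration (not part of this change), a protocol family
+ * would typically declare itself along these lines; AF_EXAMPLE, examplesw
+ * and exampledomain are hypothetical names:
+ *
+ *	static struct protosw examplesw[] = { ... };
+ *
+ *	struct domain exampledomain = {
+ *		AF_EXAMPLE, "example", NULL, NULL, NULL,
+ *		examplesw,
+ *		&examplesw[sizeof(examplesw) / sizeof(examplesw[0])]
+ *	};
+ *	DOMAIN_SET(example);
+ *
+ * DOMAIN_SET(9) arranges for net_add_domain() below to be run for the
+ * domain during startup (or at module load).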
+ */ + +static void domaininit(void *); +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) + +static struct callout pffast_callout; +static struct callout pfslow_callout; + +static void pffasttimo(void *); +static void pfslowtimo(void *); + +struct domain *domains; + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +static void +net_init_domain(struct domain *dp) +{ + register struct protosw *pr; + int s; + + s = splnet(); + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ + if (pr->pr_usrreqs == 0) + panic("domaininit: %ssw[%d] has no usrreqs!", + dp->dom_name, + (int)(pr - dp->dom_protosw)); + if (pr->pr_init) + (*pr->pr_init)(); + } + /* + * update global informatio about maximums + */ + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + splx(s); +} + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +void +net_add_domain(void *data) +{ + int s; + struct domain *dp; + + dp = (struct domain *)data; + s = splnet(); + dp->dom_next = domains; + domains = dp; + splx(s); + net_init_domain(dp); +} + +/* ARGSUSED*/ +static void +domaininit(void *dummy) +{ + /* + * Before we do any setup, make sure to initialize the + * zone allocator we get struct sockets from. + */ + + socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(socket_zone, maxsockets); + + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; + + callout_init(&pffast_callout, 0); + callout_init(&pfslow_callout, 0); + + callout_reset(&pffast_callout, 1, pffasttimo, NULL); + callout_reset(&pfslow_callout, 1, pfslowtimo, NULL); +} + + +struct protosw * +pffindtype(family, type) + int family; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} + +struct protosw * +pffindproto(family, protocol, type) + int family; + int protocol; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { + if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) + return (pr); + + if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && + pr->pr_protocol == 0 && maybe == (struct protosw *)0) + maybe = pr; + } + return (maybe); +} + +void +pfctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, (void *)0); +} + +void +pfctlinput2(cmd, sa, ctlparam) + int cmd; + struct sockaddr *sa; + void *ctlparam; +{ + struct domain *dp; + struct protosw *pr; + + if (!sa) + return; + for (dp = domains; dp; dp = dp->dom_next) { + /* + * the check must be made by xx_ctlinput() anyways, to + * make sure we 
use data item pointed to by ctlparam in + * correct way. the following check is made just for safety. + */ + if (dp->dom_family != sa->sa_family) + continue; + + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, ctlparam); + } +} + +static void +pfslowtimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_slowtimo) + (*pr->pr_slowtimo)(); + callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL); +} + +static void +pffasttimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + callout_reset(&pffast_callout, hz/5, pffasttimo, NULL); +} diff --git a/sys/kern/uipc_jumbo.c b/sys/kern/uipc_jumbo.c new file mode 100644 index 0000000..4625752 --- /dev/null +++ b/sys/kern/uipc_jumbo.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 1997, Duke University + * All rights reserved. + * + * Author: + * Andrew Gallatin <gallatin@cs.duke.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by Duke University + * 4. The name of Duke University may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This is a set of routines for allocating large-sized mbuf payload + * areas, and is primarily intended for use in receive side mbuf + * allocation. 
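+ *
+ * A rough usage sketch (illustrative only, not part of this change):
+ * after a successful jumbo_vm_init(), a driver filling its receive ring
+ * would take a page, derive its kernel virtual address from the pindex,
+ * and attach it to an mbuf as external storage with jumbo_freem() as the
+ * free routine:
+ *
+ *	pg = jumbo_pg_alloc();
+ *	if (pg == NULL)
+ *		return (ENOBUFS);
+ *	buf = (caddr_t)(jumbo_basekva + ptoa(pg->pindex));
+ *	MEXTADD(m, buf, PAGE_SIZE, jumbo_freem, NULL, 0, EXT_DISPOSABLE);
+ *
+ * The EXT_DISPOSABLE storage type above is an assumption; the intent is
+ * whatever external-storage type marks pages that may later be reclaimed
+ * with jumbo_pg_steal().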
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/sockio.h> +#include <sys/uio.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_map.h> +#include <vm/vm_param.h> +#include <vm/vm_pageout.h> +#include <sys/vmmeter.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <sys/proc.h> +#include <sys/jumbo.h> + +/* + * XXX this may be too high or too low. + */ +#define JUMBO_MAX_PAGES 3072 + +struct jumbo_kmap { + vm_offset_t kva; + SLIST_ENTRY(jumbo_kmap) entries; /* Singly-linked List. */ +}; + +static SLIST_HEAD(jumbo_kmap_head, jumbo_kmap) jumbo_kmap_free, + jumbo_kmap_inuse; + +static struct mtx jumbo_mutex; +MTX_SYSINIT(jumbo_lock, &jumbo_mutex, "jumbo mutex", MTX_DEF); + +static struct vm_object *jumbo_vm_object; +static unsigned long jumbo_vmuiomove_pgs_freed = 0; +#if 0 +static int jumbo_vm_wakeup_wanted = 0; +#endif +vm_offset_t jumbo_basekva; + +int +jumbo_vm_init(void) +{ + int i; + struct jumbo_kmap *entry; + + mtx_lock(&jumbo_mutex); + + if (jumbo_vm_object != NULL) { + mtx_unlock(&jumbo_mutex); + return (1); + } + + /* allocate our object */ + jumbo_vm_object = vm_object_allocate_wait(OBJT_DEFAULT, JUMBO_MAX_PAGES, + M_NOWAIT); + + if (jumbo_vm_object == NULL) { + mtx_unlock(&jumbo_mutex); + return (0); + } + + SLIST_INIT(&jumbo_kmap_free); + SLIST_INIT(&jumbo_kmap_inuse); + + /* grab some kernel virtual address space */ + jumbo_basekva = kmem_alloc_pageable(kernel_map, + PAGE_SIZE * JUMBO_MAX_PAGES); + if (jumbo_basekva == 0) { + vm_object_deallocate(jumbo_vm_object); + jumbo_vm_object = NULL; + mtx_unlock(&jumbo_mutex); + return 0; + } + for (i = 0; i < JUMBO_MAX_PAGES; i++) { + entry = malloc(sizeof(struct jumbo_kmap), M_TEMP, M_NOWAIT); + if (!entry && !i) { + mtx_unlock(&jumbo_mutex); + panic("jumbo_vm_init: unable to allocated kvas"); + } else if (!entry) { + printf("warning: jumbo_vm_init allocated only %d kva\n", + i); + mtx_unlock(&jumbo_mutex); + return 1; + } + entry->kva = jumbo_basekva + (vm_offset_t)i * PAGE_SIZE; + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + } + mtx_unlock(&jumbo_mutex); + return 1; +} + +void +jumbo_freem(caddr_t addr, void *args) +{ + vm_page_t frame; + + frame = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)addr)); + + /* + * Need giant for looking at the hold count below. Convert this + * to the vm mutex once the VM code has been moved out from under + * giant. 
+ */ + GIANT_REQUIRED; + + if (frame->hold_count == 0) + jumbo_pg_free((vm_offset_t)addr); + else + printf("jumbo_freem: hold count for %p is %d!!??\n", + frame, frame->hold_count); +} + +void +jumbo_pg_steal(vm_page_t pg) +{ + vm_offset_t addr; + struct jumbo_kmap *entry; + + addr = ptoa(pg->pindex) + jumbo_basekva; + + if (pg->object != jumbo_vm_object) + panic("stealing a non jumbo_vm_object page"); + vm_page_remove(pg); + + mtx_lock(&jumbo_mutex); + + pmap_qremove(addr,1); + entry = SLIST_FIRST(&jumbo_kmap_inuse); + entry->kva = addr; + SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + + mtx_unlock(&jumbo_mutex); + +#if 0 + if (jumbo_vm_wakeup_wanted) + wakeup(jumbo_vm_object); +#endif +} + + +vm_page_t +jumbo_pg_alloc(void) +{ + vm_page_t pg; + vm_pindex_t pindex; + struct jumbo_kmap *entry; + + pg = NULL; + mtx_lock(&jumbo_mutex); + + entry = SLIST_FIRST(&jumbo_kmap_free); + if (entry != NULL){ + pindex = atop(entry->kva - jumbo_basekva); + pg = vm_page_alloc(jumbo_vm_object, pindex, VM_ALLOC_INTERRUPT); + if (pg != NULL) { + SLIST_REMOVE_HEAD(&jumbo_kmap_free, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_inuse, entry, entries); + pmap_qenter(entry->kva, &pg, 1); + } + } + mtx_unlock(&jumbo_mutex); + return(pg); +} + +void +jumbo_pg_free(vm_offset_t addr) +{ + struct jumbo_kmap *entry; + vm_offset_t paddr; + vm_page_t pg; + + paddr = pmap_kextract((vm_offset_t)addr); + pg = PHYS_TO_VM_PAGE(paddr); + + if (pg->object != jumbo_vm_object) { + jumbo_vmuiomove_pgs_freed++; +/* if(vm_page_lookup(jumbo_vm_object, atop(addr - jumbo_basekva))) + panic("vm_page_rename didn't"); + printf("freeing uiomoved pg:\t pindex = %d, padd = 0x%lx\n", + atop(addr - jumbo_basekva), paddr); +*/ + } else { + vm_page_busy(pg); /* vm_page_free wants pages to be busy*/ + vm_page_free(pg); + } + + mtx_lock(&jumbo_mutex); + + pmap_qremove(addr,1); + entry = SLIST_FIRST(&jumbo_kmap_inuse); + entry->kva = addr; + SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + + mtx_unlock(&jumbo_mutex); + +#if 0 + if (jumbo_vm_wakeup_wanted) + wakeup(jumbo_vm_object); +#endif +} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 0000000..27ca156 --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +/* + * sysctl(8) exported objects + */ +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); + +/* + * struct mbuf * + * m_getm(m, len, how, type) + * + * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits + * best) and return a pointer to the top of the allocated chain. If m is + * non-null, then we assume that it is a single mbuf or an mbuf chain to + * which we want len bytes worth of mbufs and/or clusters attached, and so + * if we succeed in allocating it, we will just return a pointer to m. + * + * If we happen to fail at any point during the allocation, we will free + * up everything we have already allocated and return NULL. + * + */ +struct mbuf * +m_getm(struct mbuf *m, int len, int how, int type) +{ + struct mbuf *top, *tail, *mp, *mtail = NULL; + + KASSERT(len >= 0, ("len is < 0 in m_getm")); + + MGET(mp, how, type); + if (mp == NULL) + return (NULL); + else if (len > MINCLSIZE) { + MCLGET(mp, how); + if ((mp->m_flags & M_EXT) == 0) { + m_free(mp); + return (NULL); + } + } + mp->m_len = 0; + len -= M_TRAILINGSPACE(mp); + + if (m != NULL) + for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); + else + m = mp; + + top = tail = mp; + while (len > 0) { + MGET(mp, how, type); + if (mp == NULL) + goto failed; + + tail->m_next = mp; + tail = mp; + if (len > MINCLSIZE) { + MCLGET(mp, how); + if ((mp->m_flags & M_EXT) == 0) + goto failed; + } + + mp->m_len = 0; + len -= M_TRAILINGSPACE(mp); + } + + if (mtail != NULL) + mtail->m_next = top; + return (m); + +failed: + m_freem(top); + return (NULL); +} + +void +m_freem(struct mbuf *m) +{ + while (m) { + m = m_free(m); + } +} + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
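+ *
+ * Callers normally go through the M_PREPEND() macro, which only falls
+ * back to this routine when the first mbuf has no leading space left.
+ * A hedged sketch of typical protocol usage (struct examplehdr is
+ * hypothetical):
+ *
+ *	M_PREPEND(m, sizeof(struct examplehdr), M_DONTWAIT);
+ *	if (m == NULL)
+ *		return (ENOBUFS);
+ *	eh = mtod(m, struct examplehdr *);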
+ */ +struct mbuf * +m_prepend(struct mbuf *m, int len, int how) +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == NULL) { + m_freem(m); + return (NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller. + * Note that the copy is read-only, because clusters are not copied, + * only their reference counts are incremented. + */ +struct mbuf * +m_copym(struct mbuf *m, int off0, int len, int wait) +{ + struct mbuf *n, **np; + int off = off0; + struct mbuf *top; + int copyhdr = 0; + + KASSERT(off >= 0, ("m_copym, negative off %d", off)); + KASSERT(len >= 0, ("m_copym, negative len %d", len)); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = ⊤ + top = 0; + while (len > 0) { + if (m == NULL) { + KASSERT(len == M_COPYALL, + ("m_copym, length > size of mbuf chain")); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == NULL) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == NULL) + mbstat.m_mcfail++; /* XXX: No consistency. */ + + return (top); +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + * Note that the copy is read-only, because clusters are not copied, + * only their reference counts are incremented. + * Preserve alignment of the first mbuf so if the creator has left + * some room at the beginning (e.g. for inserting protocol headers) + * the copies still have the room available. + */ +struct mbuf * +m_copypacket(struct mbuf *m, int how) +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (n == NULL) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else { + n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (o == NULL) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. 
*/ + return (NULL); +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. + */ +void +m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) +{ + unsigned count; + + KASSERT(off >= 0, ("m_copydata, negative off %d", off)); + KASSERT(len >= 0, ("m_copydata, negative len %d", len)); + while (off > 0) { + KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Copy a packet header mbuf chain into a completely new chain, including + * copying any mbuf clusters. Use this instead of m_copypacket() when + * you need a writable copy of an mbuf chain. + */ +struct mbuf * +m_dup(struct mbuf *m, int how) +{ + struct mbuf **p, *top = NULL; + int remain, moff, nsize; + + /* Sanity check */ + if (m == NULL) + return (NULL); + KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__)); + + /* While there's more data, get a new mbuf, tack it on, and fill it */ + remain = m->m_pkthdr.len; + moff = 0; + p = ⊤ + while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ + struct mbuf *n; + + /* Get the next new mbuf */ + MGET(n, how, m->m_type); + if (n == NULL) + goto nospace; + if (top == NULL) { /* first one, must be PKTHDR */ + M_COPY_PKTHDR(n, m); + nsize = MHLEN; + } else /* not the first one */ + nsize = MLEN; + if (remain >= MINCLSIZE) { + MCLGET(n, how); + if ((n->m_flags & M_EXT) == 0) { + (void)m_free(n); + goto nospace; + } + nsize = MCLBYTES; + } + n->m_len = 0; + + /* Link it into the new chain */ + *p = n; + p = &n->m_next; + + /* Copy data from original mbuf(s) into new mbuf */ + while (n->m_len < nsize && m != NULL) { + int chunk = min(nsize - n->m_len, m->m_len - moff); + + bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); + moff += chunk; + n->m_len += chunk; + remain -= chunk; + if (moff == m->m_len) { + m = m->m_next; + moff = 0; + } + } + + /* Check correct total mbuf length */ + KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), + ("%s: bogus m_pkthdr.len", __func__)); + } + return (top); + +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +void +m_cat(struct mbuf *m, struct mbuf *n) +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} + +void +m_adj(struct mbuf *mp, int req_len) +{ + int len = req_len; + struct mbuf *m; + int count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m->m_next) + (m = m->m_next) ->m_len = 0; + } +} + +/* + * Rearange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + */ +struct mbuf * +m_pullup(struct mbuf *n, int len) +{ + struct mbuf *m; + int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. + */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == NULL) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + mbstat.m_mpfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. + * + * Note that the resulting mbufs might be read-only, because the new + * mbuf can end up sharing an mbuf cluster with the original mbuf if + * the "breaking point" happens to lie within a cluster mbuf. Use the + * M_WRITABLE() macro to check for this case. 
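+ *
+ * Illustrative sketch only (the caller, "tail" and "hdrlen" below are
+ * hypothetical, not part of this file): a consumer that intends to
+ * modify the returned tail would typically guard the write with
+ * M_WRITABLE(), e.g.
+ *
+ *	tail = m_split(m0, hdrlen, M_DONTWAIT);
+ *	if (tail != NULL && !M_WRITABLE(tail))
+ *		...obtain a private copy first (for instance via
+ *		   m_dup()) before touching the data...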
+ */ +struct mbuf * +m_split(struct mbuf *m0, int len0, int wait) +{ + struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == NULL) + return (NULL); + remain = m->m_len - len; + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == NULL) + return (NULL); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == NULL) { + (void) m_free(n); + return (NULL); + } else { + n->m_len = 0; + return (n); + } + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { + n = m->m_next; + m->m_next = NULL; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == NULL) + return (NULL); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + MEXT_ADD_REF(m); + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = NULL; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. + * Note that `off' argument is offset into first mbuf of target chain from + * which to begin copying the data to. + */ +struct mbuf * +m_devget(char *buf, int totlen, int off, struct ifnet *ifp, + void (*copy)(char *from, caddr_t to, u_int len)) +{ + struct mbuf *m; + struct mbuf *top = 0, **mp = ⊤ + int len; + + if (off < 0 || off > MHLEN) + return (NULL); + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + len = MHLEN; + + while (totlen > 0) { + if (top) { + MGET(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + m_freem(top); + return (NULL); + } + len = MLEN; + } + if (totlen + off >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + len = MCLBYTES; + } else { + /* + * Place initial small packet/header at end of mbuf. + */ + if (top == NULL && totlen + off + max_linkhdr <= len) { + m->m_data += max_linkhdr; + len -= max_linkhdr; + } + } + if (off) { + m->m_data += off; + len -= off; + off = 0; + } + m->m_len = len = min(totlen, len); + if (copy) + copy(buf, mtod(m, caddr_t), (unsigned)len); + else + bcopy(buf, mtod(m, caddr_t), (unsigned)len); + buf += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + } + return (top); +} + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. 
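+ *
+ * Minimal usage sketch (the caller, "somehdr" and "offset" below are
+ * hypothetical):
+ *
+ *	struct somehdr hdr;
+ *	...
+ *	m_copyback(m0, offset, sizeof(hdr), (caddr_t)&hdr);
+ *
+ * Because any mbufs needed to extend the chain are allocated with
+ * M_DONTWAIT and the routine returns void, a failed allocation simply
+ * truncates the store; callers must not assume the full write happened.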
+ */ +void +m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) +{ + int mlen; + struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == NULL) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == NULL) { + n = m_get_clrd(M_DONTWAIT, m->m_type); + if (n == NULL) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == NULL) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == NULL) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} + +void +m_print(const struct mbuf *m) +{ + int len; + const struct mbuf *m2; + + len = m->m_pkthdr.len; + m2 = m; + while (len) { + printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-"); + len -= m2->m_len; + m2 = m2->m_next; + } + return; +} diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c new file mode 100644 index 0000000..37ee53e --- /dev/null +++ b/sys/kern/uipc_mbuf2.c @@ -0,0 +1,404 @@ +/* $FreeBSD$ */ +/* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */ +/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 + */ + +/*#define PULLDOWN_DEBUG*/ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> + +/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */ +static struct mbuf *m_dup1(struct mbuf *, int, int, int); + +/* + * ensure that [off, off + len) is contiguous on the mbuf chain "m". + * packet chain before "off" is kept untouched. + * if offp == NULL, the target will start at <retval, 0> on resulting chain. + * if offp != NULL, the target will start at <retval, *offp> on resulting chain. + * + * on error return (NULL return value), original "m" will be freed. + * + * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf. + */ +struct mbuf * +m_pulldown(struct mbuf *m, int off, int len, int *offp) +{ + struct mbuf *n, *o; + int hlen, tlen, olen; + int writable; + + /* check invalid arguments. */ + if (m == NULL) + panic("m == NULL in m_pulldown()"); + if (len > MCLBYTES) { + m_freem(m); + return NULL; /* impossible */ + } + +#ifdef PULLDOWN_DEBUG + { + struct mbuf *t; + printf("before:"); + for (t = m; t; t = t->m_next) + printf(" %d", t->m_len); + printf("\n"); + } +#endif + n = m; + while (n != NULL && off > 0) { + if (n->m_len > off) + break; + off -= n->m_len; + n = n->m_next; + } + /* be sure to point non-empty mbuf */ + while (n != NULL && n->m_len == 0) + n = n->m_next; + if (!n) { + m_freem(m); + return NULL; /* mbuf chain too short */ + } + + /* + * XXX: This code is flawed because it considers a "writable" mbuf + * data region to require all of the following: + * (i) mbuf _has_ to have M_EXT set; if it is just a regular + * mbuf, it is still not considered "writable." + * (ii) since mbuf has M_EXT, the ext_type _has_ to be + * EXT_CLUSTER. Anything else makes it non-writable. + * (iii) M_WRITABLE() must evaluate true. + * Ideally, the requirement should only be (iii). + * + * If we're writable, we're sure we're writable, because the ref. 
count + * cannot increase from 1, as that would require posession of mbuf + * n by someone else (which is impossible). However, if we're _not_ + * writable, we may eventually become writable )if the ref. count drops + * to 1), but we'll fail to notice it unless we re-evaluate + * M_WRITABLE(). For now, we only evaluate once at the beginning and + * live with this. + */ + /* + * XXX: This is dumb. If we're just a regular mbuf with no M_EXT, + * then we're not "writable," according to this code. + */ + writable = 0; + if ((n->m_flags & M_EXT) == 0 || + (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n))) + writable = 1; + + /* + * the target data is on <n, off>. + * if we got enough data on the mbuf "n", we're done. + */ + if ((off == 0 || offp) && len <= n->m_len - off && writable) + goto ok; + + /* + * when len <= n->m_len - off and off != 0, it is a special case. + * len bytes from <n, off> sits in single mbuf, but the caller does + * not like the starting position (off). + * chop the current mbuf into two pieces, set off to 0. + */ + if (len <= n->m_len - off) { + o = m_dup1(n, off, n->m_len - off, M_DONTWAIT); + if (o == NULL) { + m_freem(m); + return NULL; /* ENOBUFS */ + } + n->m_len = off; + o->m_next = n->m_next; + n->m_next = o; + n = n->m_next; + off = 0; + goto ok; + } + + /* + * we need to take hlen from <n, off> and tlen from <n->m_next, 0>, + * and construct contiguous mbuf with m_len == len. + * note that hlen + tlen == len, and tlen > 0. + */ + hlen = n->m_len - off; + tlen = len - hlen; + + /* + * ensure that we have enough trailing data on mbuf chain. + * if not, we can do nothing about the chain. + */ + olen = 0; + for (o = n->m_next; o != NULL; o = o->m_next) + olen += o->m_len; + if (hlen + olen < len) { + m_freem(m); + return NULL; /* mbuf chain too short */ + } + + /* + * easy cases first. + * we need to use m_copydata() to get data from <n->m_next, 0>. + */ + if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen + && writable) { + m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len); + n->m_len += tlen; + m_adj(n->m_next, tlen); + goto ok; + } + if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen + && writable) { + n->m_next->m_data -= hlen; + n->m_next->m_len += hlen; + bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); + n->m_len -= hlen; + n = n->m_next; + off = 0; + goto ok; + } + + /* + * now, we need to do the hard way. don't m_copy as there's no room + * on both end. + */ + MGET(o, M_DONTWAIT, m->m_type); + if (o && len > MLEN) { + MCLGET(o, M_DONTWAIT); + if ((o->m_flags & M_EXT) == 0) { + m_free(o); + o = NULL; + } + } + if (!o) { + m_freem(m); + return NULL; /* ENOBUFS */ + } + /* get hlen from <n, off> into <o, 0> */ + o->m_len = hlen; + bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen); + n->m_len -= hlen; + /* get tlen from <n->m_next, 0> into <o, hlen> */ + m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len); + o->m_len += tlen; + m_adj(n->m_next, tlen); + o->m_next = n->m_next; + n->m_next = o; + n = o; + off = 0; + +ok: +#ifdef PULLDOWN_DEBUG + { + struct mbuf *t; + printf("after:"); + for (t = m; t; t = t->m_next) + printf("%c%d", t == n ? 
'*' : ' ', t->m_len); + printf(" (off=%d)\n", off); + } +#endif + if (offp) + *offp = off; + return n; +} + +static struct mbuf * +m_dup1(struct mbuf *m, int off, int len, int wait) +{ + struct mbuf *n; + int l; + int copyhdr; + + if (len > MCLBYTES) + return NULL; + if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { + copyhdr = 1; + MGETHDR(n, wait, m->m_type); + l = MHLEN; + } else { + copyhdr = 0; + MGET(n, wait, m->m_type); + l = MLEN; + } + if (n && len > l) { + MCLGET(n, wait); + if ((n->m_flags & M_EXT) == 0) { + m_free(n); + n = NULL; + } + } + if (!n) + return NULL; + + if (copyhdr) + M_COPY_PKTHDR(n, m); + m_copydata(m, off, len, mtod(n, caddr_t)); + return n; +} + +/* + * pkthdr.aux chain manipulation. + * we don't allow clusters at this moment. + */ +struct mbuf * +m_aux_add2(struct mbuf *m, int af, int type, void *p) +{ + struct mbuf *n; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return NULL; + + n = m_aux_find(m, af, type); + if (n) + return n; + + MGET(n, M_DONTWAIT, m->m_type); + if (n == NULL) + return NULL; + + t = mtod(n, struct mauxtag *); + bzero(t, sizeof(*t)); + t->af = af; + t->type = type; + t->p = p; + n->m_data += sizeof(struct mauxtag); + n->m_len = 0; + n->m_next = m->m_pkthdr.aux; + m->m_pkthdr.aux = n; + return n; +} + +struct mbuf * +m_aux_find2(struct mbuf *m, int af, int type, void *p) +{ + struct mbuf *n; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return NULL; + + for (n = m->m_pkthdr.aux; n; n = n->m_next) { + t = (struct mauxtag *)n->m_dat; + if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { + printf("m_aux_find: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); + continue; + } + if (t->af == af && t->type == type && t->p == p) + return n; + } + return NULL; +} + +struct mbuf * +m_aux_find(struct mbuf *m, int af, int type) +{ + + return m_aux_find2(m, af, type, NULL); +} + +struct mbuf * +m_aux_add(struct mbuf *m, int af, int type) +{ + + return m_aux_add2(m, af, type, NULL); +} + +void +m_aux_delete(struct mbuf *m, struct mbuf *victim) +{ + struct mbuf *n, *prev, *next; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return; + + prev = NULL; + n = m->m_pkthdr.aux; + while (n) { + t = (struct mauxtag *)n->m_dat; + next = n->m_next; + if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { + printf("m_aux_delete: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); + prev = n; + n = next; + continue; + } + if (n == victim) { + if (prev) + prev->m_next = n->m_next; + else + m->m_pkthdr.aux = n->m_next; + n->m_next = NULL; + m_free(n); + return; + } else + prev = n; + n = next; + } +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 0000000..74dab78 --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/un.h> + +#include <net/raw_cb.h> + +/* + * Definitions of protocols supported in the LOCAL domain. + */ + +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, &uipc_ctloutput, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ 0, 0, 0, 0, + 0, 0, raw_ctlinput, 0, + 0, + raw_init, 0, 0, 0, + &raw_usrreqs +} +}; + +struct domain localdomain = + { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; +DOMAIN_SET(local); + +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..1e68f83 --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,983 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/aio.h> /* for aio_swake proto */ +#include <sys/domain.h> +#include <sys/event.h> +#include <sys/file.h> /* for maxfiles */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +int maxsockets; + +void (*aio_swake)(struct socket *, struct sockbuf *); + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_incomp for connections in progress + * and so_comp for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_incomp by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_comp, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_incomp or so_comp, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
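+ *
+ * Sketch of the expected call sequence from a connection-oriented
+ * protocol (illustrative only; error paths omitted):
+ *
+ *	soisconnecting(so);	   connect() request is in progress
+ *	...			   protocol handshake completes
+ *	soisconnected(so);	   embryonic socket moves to so_comp
+ *	...
+ *	soisdisconnecting(so);	   disconnect() has been initiated
+ *	soisdisconnected(so);	   connection to the peer fully severed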
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + struct socket *so; +{ + struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + so->so_upcall = head->so_accf->so_accept_filter->accf_callback; + so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_options &= ~SO_ACCEPTFILTER; + so->so_upcall(so, so->so_upcallarg, 0); + return; + } + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + head->so_qlen++; + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * note: the ref count on the socket is 0 on return + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + if ((head->so_options & SO_ACCEPTFILTER) != 0) + connstatus = 0; + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_cred = crhold(head->so_cred); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || + (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sotryfree(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + head->so_qlen++; + } else { + if (head->so_incqlen > head->so_qlimit) { + struct socket *sp; + sp = TAILQ_FIRST(&head->so_incomp); + (void) soabort(sp); + } + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + if (connstatus) { + sorwakeup(head); + wakeup(&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). 
Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep(&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep(&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup(&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_AIO) + aio_swake(so, sb); + KNOTE(&sb->sb_sel.si_note, 0); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. 
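+ *
+ * The usual pairing, as soreserve() below does (sketch only; error
+ * handling abbreviated):
+ *
+ *	if (sbreserve(&so->so_snd, sndcc, so, td) == 0)
+ *		return (ENOBUFS);
+ *	...
+ *	sbrelease(&so->so_snd, so);	   on socket teardown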
+ */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + struct thread *td = curthread; + + if (sbreserve(&so->so_snd, sndcc, so, td) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd, so); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc, so, td) + struct sockbuf *sb; + u_long cc; + struct socket *so; + struct thread *td; +{ + + /* + * td will only be NULL when we're in an interrupt + * (e.g. in tcp_input()) + */ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, + td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) { + return (0); + } + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb, so) + struct sockbuf *sb; + struct socket *so; +{ + + sbflush(sb); + (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, + RLIM_INFINITY); + sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
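+ *
+ * Typical datagram-protocol usage (sketch; "from" is a hypothetical
+ * sockaddr filled in by the protocol):
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from,
+ *	    m, control) == 0) {
+ *		m_freem(m);	   no room: drop the datagram
+ *		if (control)
+ *			m_freem(control);
+ *	} else
+ *		sorwakeup(so);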
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + + if (m0 && (m0->m_flags & M_PKTHDR) == 0) + panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy(asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & M_EOR) == 0 && + M_WRITABLE(n) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n) && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt) { + /* + * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: + * we would loop forever. Panic instead. + */ + if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) + break; + sbdrop(sb, (int)sb->sb_cc); + } + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + m = m_free(m); + } + while (m && m->m_len == 0) { + sbfree(sb, m); + m = m_free(m); + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + m = m_free(m); + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if (CMSG_SPACE((u_int)size) > MCLBYTES) + return ((struct mbuf *) NULL); + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + if (CMSG_SPACE((u_int)size) > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ((struct mbuf *) NULL); + } + } + cp = mtod(m, struct cmsghdr *); + m->m_len = 0; + KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), + ("sbcreatecontrol: short mbuf")); + if (p != NULL) + (void)memcpy(CMSG_DATA(cp), p, size); + m->m_len = CMSG_SPACE(size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
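+ *
+ * Typical consumer (sketch): a sysctl handler copies the external form
+ * out to userland instead of exposing struct socket directly, e.g.
+ *
+ *	struct xsocket xso;
+ *
+ *	sotoxsocket(so, &xso);
+ *	error = SYSCTL_OUT(req, &xso, sizeof(xso));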
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_cred->cr_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, + &sb_max, 0, "Maximum socket buffer size"); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, + &maxsockets, 0, "Maximum number of sockets avaliable"); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); + +/* + * Initialise maxsockets + */ +static void init_maxsockets(void *ignored) +{ + TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); + maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); +} +SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 0000000..d596294 --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1792 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_zero.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/domain.h> +#include <sys/file.h> /* for struct knote */ +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/event.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <sys/jail.h> + +#include <vm/uma.h> + +#include <machine/limits.h> + +#ifdef INET +static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); +#endif + +static void filt_sordetach(struct knote *kn); +static int filt_soread(struct knote *kn, long hint); +static void filt_sowdetach(struct knote *kn); +static int filt_sowrite(struct knote *kn, long hint); +static int filt_solisten(struct knote *kn, long hint); + +static struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +uma_zone_t socket_zone; +so_gen_t so_gencnt; /* generation count for sockets */ + +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); +MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); + +SYSCTL_DECL(_kern_ipc); + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, + &somaxconn, 0, "Maximum pending socket connection queue size"); +static int numopensockets; +SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, + &numopensockets, 0, "Number of open sockets"); +#ifdef ZERO_COPY_SOCKETS +/* These aren't static because they're used in other files. */ +int so_zero_copy_send = 1; +int so_zero_copy_receive = 1; +SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, + "Zero copy controls"); +SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, + &so_zero_copy_receive, 0, "Enable zero copy receive"); +SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, + &so_zero_copy_send, 0, "Enable zero copy send"); +#endif /* ZERO_COPY_SOCKETS */ + + +/* + * Socket operation routines. + * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. + */ + +/* + * Get a socket structure from our zone, and initialize it. 
+ * Note that it would probably be better to allocate socket + * and PCB at the same time, but I'm not convinced that all + * the protocols can be easily modified to do this. + * + * soalloc() returns a socket with a ref count of 0. + */ +struct socket * +soalloc(waitok) + int waitok; +{ + struct socket *so; + int flag; + + if (waitok == 1) + flag = M_WAITOK; + else + flag = M_NOWAIT; + flag |= M_ZERO; + so = uma_zalloc(socket_zone, flag); + if (so) { + /* XXX race condition for reentrant kernel */ + so->so_gencnt = ++so_gencnt; + /* sx_init(&so->so_sxlock, "socket sxlock"); */ + TAILQ_INIT(&so->so_aiojobq); + ++numopensockets; + } + return so; +} + +/* + * socreate returns a socket with a ref count of 1. The socket should be + * closed with soclose(). + */ +int +socreate(dom, aso, type, proto, cred, td) + int dom; + struct socket **aso; + register int type; + int proto; + struct ucred *cred; + struct thread *td; +{ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + + if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) + return (EPROTONOSUPPORT); + + if (jailed(td->td_ucred) && jail_socket_unixiproute_only && + prp->pr_domain->dom_family != PF_LOCAL && + prp->pr_domain->dom_family != PF_INET && + prp->pr_domain->dom_family != PF_ROUTE) { + return (EPROTONOSUPPORT); + } + + if (prp->pr_type != type) + return (EPROTOTYPE); + so = soalloc(M_NOWAIT); + if (so == NULL) + return (ENOBUFS); + + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); + so->so_type = type; + so->so_cred = crhold(cred); + so->so_proto = prp; + soref(so); + error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + if (error) { + so->so_state |= SS_NOFDREF; + sorele(so); + return (error); + } + *aso = so; + return (0); +} + +int +sobind(so, nam, td) + struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + int s = splnet(); + int error; + + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + splx(s); + return (error); +} + +static void +sodealloc(struct socket *so) +{ + + KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); + so->so_gencnt = ++so_gencnt; + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); +#ifdef INET + if (so->so_accf != NULL) { + if (so->so_accf->so_accept_filter != NULL && + so->so_accf->so_accept_filter->accf_destroy != NULL) { + so->so_accf->so_accept_filter->accf_destroy(so); + } + if (so->so_accf->so_accept_filter_str != NULL) + FREE(so->so_accf->so_accept_filter_str, M_ACCF); + FREE(so->so_accf, M_ACCF); + } +#endif + crfree(so->so_cred); + /* sx_destroy(&so->so_sxlock); */ + uma_zfree(socket_zone, so); + --numopensockets; +} + +int +solisten(so, backlog, td) + register struct socket *so; + int backlog; + struct thread *td; +{ + int s, error; + + s = splnet(); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); + if (error) { + splx(s); + return (error); + } + if (TAILQ_EMPTY(&so->so_comp)) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; + splx(s); + return (0); +} + +void +sofree(so) + register struct socket *so; +{ + struct socket *head = so->so_head; + + KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + 
return; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + /* + * We must not decommission a socket that's + * on the accept(2) queue. If we do, then + * accept(2) may hang after select(2) indicated + * that the listening socket was ready. + */ + return; + } else { + panic("sofree: not queued"); + } + so->so_state &= ~SS_INCOMP; + so->so_head = NULL; + } + sbrelease(&so->so_snd, so); + sorflush(so); + sodealloc(so); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. + * + * This function will sorele() the socket. Note that soclose() may be + * called prior to the ref count reaching zero. The actual socket + * structure will not be freed until the ref count reaches zero. + */ +int +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + funsetown(&so->so_sigio); + if (so->so_options & SO_ACCEPTCONN) { + struct socket *sp, *sonext; + + sp = TAILQ_FIRST(&so->so_incomp); + for (; sp != NULL; sp = sonext) { + sonext = TAILQ_NEXT(sp, so_list); + (void) soabort(sp); + } + for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) { + sonext = TAILQ_NEXT(sp, so_list); + /* Dequeue from so_comp since sofree() won't do it */ + TAILQ_REMOVE(&so->so_comp, sp, so_list); + so->so_qlen--; + sp->so_state &= ~SS_COMP; + sp->so_head = NULL; + (void) soabort(sp); + } + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) { + error = tsleep(&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger * hz); + if (error) + break; + } + } + } +drop: + if (so->so_pcb) { + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sorele(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... + */ +int +soabort(so) + struct socket *so; +{ + int error; + + error = (*so->so_proto->pr_usrreqs->pru_abort)(so); + if (error) { + sotryfree(so); /* note: does not decrement the ref count */ + return error; + } + return (0); +} + +int +soaccept(so, nam) + register struct socket *so; + struct sockaddr **nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); + splx(s); + return (error); +} + +int +soconnect(so, nam, td) + register struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. 
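+ *
+ * (Userland sketch: for a datagram socket, calling connect(2) a second
+ * time with a new address simply re-targets the socket, because the
+ * implicit sodisconnect() here runs first; connecting to a null or
+ * AF_UNSPEC address is the traditional way to dissolve the
+ * association.)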
+ */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + splx(s); + return (error); +} + +int +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); + splx(s); + return (error); +} + +int +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. + * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. + * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. + */ + +#ifdef ZERO_COPY_SOCKETS +struct so_zerocopy_stats{ + int size_ok; + int align_ok; + int found_ifp; +}; +struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; +#include <netinet/in.h> +#include <net/route.h> +#include <netinet/in_pcb.h> +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#endif /*ZERO_COPY_SOCKETS*/ + +int +sosend(so, addr, uio, top, control, flags, td) + register struct socket *so; + struct sockaddr *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; + struct thread *td; +{ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; +#ifdef ZERO_COPY_SOCKETS + int cow_send; +#endif /* ZERO_COPY_SOCKETS */ + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. + * + * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM + * type sockets since that's an error. 
+ */ + if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { + error = EINVAL; + goto out; + } + + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + if (td) + td->td_proc->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto release; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? + ENOTCONN : EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = ⊤ + space -= clen; + do { + if (uio == NULL) { + /* + * Data is prepackaged in "top". + */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { +#ifdef ZERO_COPY_SOCKETS + cow_send = 0; +#endif /* ZERO_COPY_SOCKETS */ + if (top == 0) { + MGETHDR(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + mlen = MLEN; + } + if (resid >= MINCLSIZE) { +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_send && + resid>=PAGE_SIZE && + space>=PAGE_SIZE && + uio->uio_iov->iov_len>=PAGE_SIZE) { + so_zerocp_stats.size_ok++; + if (!((vm_offset_t) + uio->uio_iov->iov_base & PAGE_MASK)){ + so_zerocp_stats.align_ok++; + cow_send = socow_setup(m, uio); + } + } + if (!cow_send){ +#endif /* ZERO_COPY_SOCKETS */ + MCLGET(m, M_TRYWAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; + len = min(min(mlen, resid), space); + } else { +#ifdef ZERO_COPY_SOCKETS + len = PAGE_SIZE; + } + + } else { +#endif /* ZERO_COPY_SOCKETS */ +nopages: + len = min(min(mlen, resid), space); + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. 
+ */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + space -= len; +#ifdef ZERO_COPY_SOCKETS + if (cow_send) + error = 0; + else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + /* + * XXX all the SS_CANTSENDMORE checks previously + * done could be out of date. We could have recieved + * a reset packet in an interrupt or maybe we slept + * while doing page faults in uiomove() etc. We could + * probably recheck again inside the splnet() protection + * here, but there are probably other places that this + * also happens. We must rethink this. + */ + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? + PRUS_EOF : + /* If there is more to send set PRUS_MORETOCOME */ + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, + top, addr, control, td); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = ⊤ + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. 
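+ *
+ * A caller that wants the data left in mbuf form rather than copied out
+ * might therefore do something like (sketch only; the names are
+ * illustrative):
+ *
+ *	auio.uio_resid = len;			only the count is used
+ *	error = soreceive(so, NULL, &auio, &m, NULL, NULL);
+ *
+ * and then walk the returned chain "m" directly.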
+ */ +int +soreceive(so, psa, uio, mp0, controlp, flagsp) + register struct socket *so; + struct sockaddr **psa; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type = 0; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (psa) + *psa = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_TRYWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); + if (error) + goto bad; + do { +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_receive) { + vm_page_t pg; + int disposable; + + if ((m->m_flags & M_EXT) + && (m->m_ext.ext_type == EXT_DISPOSABLE)) + disposable = 1; + else + disposable = 0; + + pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); + if (uio->uio_offset == -1) + uio->uio_offset =IDX_TO_OFF(pg->pindex); + + error = uiomoveco(mtod(m, caddr_t), + min(uio->uio_resid, m->m_len), + uio, pg->object, + disposable); + } else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreqs->pru_rcvd)(so, 0); + +restart: + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. 
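+	 * For example, on a stream socket with 1k of data queued, a
+	 * low-water mark of 1 and a 4k request, a plain read returns the
+	 * 1k immediately (a short count), whereas MSG_WAITALL makes the
+	 * caller sleep until the full 4k, an error, or end of file arrives.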
+ */ + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + KASSERT(m != 0 || !so->so_rcv.sb_cc, + ("receive: m == %p so->so_rcv.sb_cc == %lu", + m, so->so_rcv.sb_cc)); + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_td) + uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { + KASSERT(m->m_type == MT_SONAME, + ("m->m_type == %d", m->m_type)); + orig_resid = 0; + if (psa) + *psa = dup_sockaddr(mtod(m, struct sockaddr *), + mp0 == 0); + if (flags & MSG_PEEK) { + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m->m_next; + m->m_next = NULL; + if (pr->pr_domain->dom_externalize) + error = + (*pr->pr_domain->dom_externalize)(m, controlp); + else if (controlp) + *controlp = m; + else + m_freem(m); + m = so->so_rcv.sb_mb; + } + if (controlp) { + orig_resid = 0; + do + controlp = &(*controlp)->m_next; + while (*controlp != NULL); + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; + else + KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, + ("m->m_type == %d", m->m_type)); + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_receive) { + vm_page_t pg; + int disposable; + + if ((m->m_flags & M_EXT) + && (m->m_ext.ext_type == EXT_DISPOSABLE)) + disposable = 1; + else + disposable = 0; + + pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + + moff)); + + if (uio->uio_offset == -1) + uio->uio_offset =IDX_TO_OFF(pg->pindex); + + error = uiomoveco(mtod(m, caddr_t) + moff, + (int)len, uio,pg->object, + disposable); + } else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + if (error) + goto release; + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_TRYWAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. + */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + /* + * Notify the protocol that some data has been + * drained before blocking. 
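+			 * Stream protocols that set PR_WANTRCVD (TCP, for
+			 * example) use the pru_rcvd() call below to update
+			 * the advertised receive window now that the copy
+			 * above has drained part of the buffer.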
+ */ + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + m = so->so_rcv.sb_mb; + if (m) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +int +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) + return (EINVAL); + + if (how != SHUT_WR) + sorflush(so); + if (how != SHUT_RD) + return ((*pr->pr_usrreqs->pru_shutdown)(so)); + return (0); +} + +void +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero(sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb, so); +} + +#ifdef INET +static int +do_setopt_accept_filter(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct accept_filter_arg *afap = NULL; + struct accept_filter *afp; + struct so_accf *af = so->so_accf; + int error = 0; + + /* do not set/remove accept filters on non listen sockets */ + if ((so->so_options & SO_ACCEPTCONN) == 0) { + error = EINVAL; + goto out; + } + + /* removing the filter */ + if (sopt == NULL) { + if (af != NULL) { + if (af->so_accept_filter != NULL && + af->so_accept_filter->accf_destroy != NULL) { + af->so_accept_filter->accf_destroy(so); + } + if (af->so_accept_filter_str != NULL) { + FREE(af->so_accept_filter_str, M_ACCF); + } + FREE(af, M_ACCF); + so->so_accf = NULL; + } + so->so_options &= ~SO_ACCEPTFILTER; + return (0); + } + /* adding a filter */ + /* must remove previous filter first */ + if (af != NULL) { + error = EINVAL; + goto out; + } + /* don't put large objects on the kernel stack */ + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); + afap->af_name[sizeof(afap->af_name)-1] = '\0'; + afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; + if (error) + goto out; + afp = accept_filt_get(afap->af_name); + if (afp == NULL) { + error = ENOENT; + goto out; + } + MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); + if (afp->accf_create != NULL) { + if (afap->af_name[0] != '\0') { + int len = strlen(afap->af_name) + 1; + + MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); + strcpy(af->so_accept_filter_str, afap->af_name); + } + af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (af->so_accept_filter_arg == NULL) { + FREE(af->so_accept_filter_str, M_ACCF); + FREE(af, M_ACCF); + so->so_accf = NULL; + error = EINVAL; + goto out; + } + } + af->so_accept_filter = afp; + so->so_accf = 
af; + so->so_options |= SO_ACCEPTFILTER; +out: + if (afap != NULL) + FREE(afap, M_TEMP); + return (error); +} +#endif /* INET */ + +/* + * Perhaps this routine, and sooptcopyout(), below, ought to come in + * an additional variant to handle the case where the option value needs + * to be some kind of integer, but not a specific size. + * In addition to their use here, these functions are also called by the + * protocol-level pr_ctloutput() routines. + */ +int +sooptcopyin(sopt, buf, len, minlen) + struct sockopt *sopt; + void *buf; + size_t len; + size_t minlen; +{ + size_t valsize; + + /* + * If the user gives us more than we wanted, we ignore it, + * but if we don't get the minimum length the caller + * wants, we return EINVAL. On success, sopt->sopt_valsize + * is set to however much we actually retrieved. + */ + if ((valsize = sopt->sopt_valsize) < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + + if (sopt->sopt_td != 0) + return (copyin(sopt->sopt_val, buf, valsize)); + + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +int +sosetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + u_long val; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + error = ENOPROTOOPT; + } else { + switch (sopt->sopt_name) { +#ifdef INET + case SO_ACCEPTFILTER: + error = do_setopt_accept_filter(so, sopt); + if (error) + goto bad; + break; +#endif + case SO_LINGER: + error = sooptcopyin(sopt, &l, sizeof l, sizeof l); + if (error) + goto bad; + + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + if (optval) + so->so_options |= sopt->sopt_name; + else + so->so_options &= ~sopt->sopt_name; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + if (optval < 1) { + error = EINVAL; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(sopt->sopt_name == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, (u_long)optval, + so, curthread) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + /* + * Make sure the low-water is never greater than + * the high-water. + */ + case SO_SNDLOWAT: + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? 
+ so->so_rcv.sb_hiwat : optval; + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + error = sooptcopyin(sopt, &tv, sizeof tv, + sizeof tv); + if (error) + goto bad; + + /* assert(hz > 0); */ + if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || + tv.tv_usec < 0 || tv.tv_usec >= 1000000) { + error = EDOM; + goto bad; + } + /* assert(tick > 0); */ + /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ + val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; + if (val > SHRT_MAX) { + error = EDOM; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } + } +bad: + return (error); +} + +/* Helper routine for getsockopt */ +int +sooptcopyout(sopt, buf, len) + struct sockopt *sopt; + void *buf; + size_t len; +{ + int error; + size_t valsize; + + error = 0; + + /* + * Documented get behavior is that we always return a value, + * possibly truncated to fit in the user's buffer. + * Traditional behavior is that we always tell the user + * precisely how much we copied, rather than something useful + * like the total amount we had available for her. + * Note that this interface is not idempotent; the entire answer must + * generated ahead of time. + */ + valsize = min(len, sopt->sopt_valsize); + sopt->sopt_valsize = valsize; + if (sopt->sopt_val != 0) { + if (sopt->sopt_td != 0) + error = copyout(buf, sopt->sopt_val, valsize); + else + bcopy(buf, sopt->sopt_val, valsize); + } + return error; +} + +int +sogetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; +#ifdef INET + struct accept_filter_arg *afap; +#endif + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) { + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } else + return (ENOPROTOOPT); + } else { + switch (sopt->sopt_name) { +#ifdef INET + case SO_ACCEPTFILTER: + if ((so->so_options & SO_ACCEPTCONN) == 0) + return (EINVAL); + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), + M_TEMP, M_WAITOK | M_ZERO); + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); + if (so->so_accf->so_accept_filter_str != NULL) + strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + } + error = sooptcopyout(sopt, afap, sizeof(*afap)); + FREE(afap, M_TEMP); + break; +#endif + + case SO_LINGER: + l.l_onoff = so->so_options & SO_LINGER; + l.l_linger = so->so_linger; + error = sooptcopyout(sopt, &l, sizeof l); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: + optval = so->so_options & sopt->sopt_name; +integer: + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case SO_TYPE: + optval = so->so_type; + goto integer; + + case SO_ERROR: + optval = so->so_error; + so->so_error = 0; + goto integer; + + case SO_SNDBUF: + optval = so->so_snd.sb_hiwat; + goto integer; + + case SO_RCVBUF: + optval = so->so_rcv.sb_hiwat; + goto integer; + + case SO_SNDLOWAT: + optval = so->so_snd.sb_lowat; + goto integer; + + case SO_RCVLOWAT: + optval = so->so_rcv.sb_lowat; + goto integer; + + case SO_SNDTIMEO: + case 
SO_RCVTIMEO: + optval = (sopt->sopt_name == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + error = sooptcopyout(sopt, &tv, sizeof tv); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); + } +} + +/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ +int +soopt_getm(struct sockopt *sopt, struct mbuf **mp) +{ + struct mbuf *m, *m_prev; + int sopt_size = sopt->sopt_valsize; + + MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); + if (m == 0) + return ENOBUFS; + if (sopt_size > MLEN) { + MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ENOBUFS; + } + m->m_len = min(MCLBYTES, sopt_size); + } else { + m->m_len = min(MLEN, sopt_size); + } + sopt_size -= m->m_len; + *mp = m; + m_prev = m; + + while (sopt_size) { + MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(*mp); + return ENOBUFS; + } + if (sopt_size > MLEN) { + MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(*mp); + return ENOBUFS; + } + m->m_len = min(MCLBYTES, sopt_size); + } else { + m->m_len = min(MLEN, sopt_size); + } + sopt_size -= m->m_len; + m_prev->m_next = m; + m_prev = m; + } + return 0; +} + +/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ +int +soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) +{ + struct mbuf *m0 = m; + + if (sopt->sopt_val == NULL) + return 0; + while (m != NULL && sopt->sopt_valsize >= m->m_len) { + if (sopt->sopt_td != NULL) { + int error; + + error = copyin(sopt->sopt_val, mtod(m, char *), + m->m_len); + if (error != 0) { + m_freem(m0); + return(error); + } + } else + bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); + sopt->sopt_valsize -= m->m_len; + (caddr_t)sopt->sopt_val += m->m_len; + m = m->m_next; + } + if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ + panic("ip6_sooptmcopyin"); + return 0; +} + +/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
*/ +int +soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) +{ + struct mbuf *m0 = m; + size_t valsize = 0; + + if (sopt->sopt_val == NULL) + return 0; + while (m != NULL && sopt->sopt_valsize >= m->m_len) { + if (sopt->sopt_td != NULL) { + int error; + + error = copyout(mtod(m, char *), sopt->sopt_val, + m->m_len); + if (error != 0) { + m_freem(m0); + return(error); + } + } else + bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); + sopt->sopt_valsize -= m->m_len; + (caddr_t)sopt->sopt_val += m->m_len; + valsize += m->m_len; + m = m->m_next; + } + if (m != NULL) { + /* enough soopt buffer should be given from user-land */ + m_freem(m0); + return(EINVAL); + } + sopt->sopt_valsize = valsize; + return 0; +} + +void +sohasoutofband(so) + register struct socket *so; +{ + if (so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGURG, 0); + selwakeup(&so->so_rcv.sb_sel); +} + +int +sopoll(struct socket *so, int events, struct ucred *cred, struct thread *td) +{ + int revents = 0; + int s = splnet(); + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & POLLINIGNEOF) + if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || + !TAILQ_EMPTY(&so->so_comp) || so->so_error) + revents |= POLLINIGNEOF; + + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + if (revents == 0) { + if (events & + (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | + POLLRDBAND)) { + selrecord(td, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + } + } + + splx(s); + return (revents); +} + +int +sokqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct sockbuf *sb; + int s; + + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + + s = splnet(); + SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + splx(s); + return (0); +} + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) + so->so_rcv.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) + so->so_snd.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, 
long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_qlen; + return (! TAILQ_EMPTY(&so->so_comp)); +} + +int +socheckuid(struct socket *so, uid_t uid) +{ + + if (so == NULL) + return (EPERM); + if (so->so_cred->cr_uid == uid) + return (0); + return (EPERM); +} diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c new file mode 100644 index 0000000..1e68f83 --- /dev/null +++ b/sys/kern/uipc_socket2.c @@ -0,0 +1,983 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/aio.h> /* for aio_swake proto */ +#include <sys/domain.h> +#include <sys/event.h> +#include <sys/file.h> /* for maxfiles */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +int maxsockets; + +void (*aio_swake)(struct socket *, struct sockbuf *); + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_incomp for connections in progress + * and so_comp for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_incomp by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_comp, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_incomp or so_comp, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
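+ *
+ * In short, an actively opened connection moves through
+ *
+ *	connect(2) -> soisconnecting() -> ... -> soisconnected()
+ *
+ * and is torn down via
+ *
+ *	disconnect -> soisdisconnecting() -> ... -> soisdisconnected()
+ *
+ * while a passively accepted socket is created by sonewconn() on
+ * so_incomp and moved to so_comp by soisconnected().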
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + struct socket *so; +{ + struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + so->so_upcall = head->so_accf->so_accept_filter->accf_callback; + so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_options &= ~SO_ACCEPTFILTER; + so->so_upcall(so, so->so_upcallarg, 0); + return; + } + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + head->so_qlen++; + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * note: the ref count on the socket is 0 on return + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + if ((head->so_options & SO_ACCEPTFILTER) != 0) + connstatus = 0; + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_cred = crhold(head->so_cred); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || + (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sotryfree(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + head->so_qlen++; + } else { + if (head->so_incqlen > head->so_qlimit) { + struct socket *sp; + sp = TAILQ_FIRST(&head->so_incomp); + (void) soabort(sp); + } + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + if (connstatus) { + sorwakeup(head); + wakeup(&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). 
Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep(&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep(&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup(&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_AIO) + aio_swake(so, sb); + KNOTE(&sb->sb_sel.si_note, 0); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. 
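+ *
+ * A typical life cycle is therefore (sketch only):
+ *
+ *	soreserve(so, sndcc, rcvcc);		reserves both buffers
+ *	... sbappend*() and sbdrop() as data flows ...
+ *	sbrelease(&so->so_snd, so);		on teardown
+ *	sbrelease(&so->so_rcv, so);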
+ */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + struct thread *td = curthread; + + if (sbreserve(&so->so_snd, sndcc, so, td) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd, so); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc, so, td) + struct sockbuf *sb; + u_long cc; + struct socket *so; + struct thread *td; +{ + + /* + * td will only be NULL when we're in an interrupt + * (e.g. in tcp_input()) + */ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, + td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) { + return (0); + } + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb, so) + struct sockbuf *sb; + struct socket *so; +{ + + sbflush(sb); + (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, + RLIM_INFINITY); + sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
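+ *
+ * Datagram protocols typically call this from their input path, e.g.
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from, m, opts) == 0)
+ *		goto drop;			no room: free the mbufs
+ *	sorwakeup(so);
+ *
+ * where "from" and "opts" are illustrative names.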
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + + if (m0 && (m0->m_flags & M_PKTHDR) == 0) + panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy(asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & M_EOR) == 0 && + M_WRITABLE(n) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n) && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt) { + /* + * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: + * we would loop forever. Panic instead. + */ + if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) + break; + sbdrop(sb, (int)sb->sb_cc); + } + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + m = m_free(m); + } + while (m && m->m_len == 0) { + sbfree(sb, m); + m = m_free(m); + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + m = m_free(m); + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if (CMSG_SPACE((u_int)size) > MCLBYTES) + return ((struct mbuf *) NULL); + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + if (CMSG_SPACE((u_int)size) > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ((struct mbuf *) NULL); + } + } + cp = mtod(m, struct cmsghdr *); + m->m_len = 0; + KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), + ("sbcreatecontrol: short mbuf")); + if (p != NULL) + (void)memcpy(CMSG_DATA(cp), p, size); + m->m_len = CMSG_SPACE(size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
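+ *
+ * Consumers include the protocol pcblist sysctls, which embed an xsocket
+ * in each exported record so that user-level tools such as netstat(1)
+ * can report socket state without knowing the layout of struct socket.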
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_cred->cr_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, + &sb_max, 0, "Maximum socket buffer size"); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, + &maxsockets, 0, "Maximum number of sockets avaliable"); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); + +/* + * Initialise maxsockets + */ +static void init_maxsockets(void *ignored) +{ + TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); + maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); +} +SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c new file mode 100644 index 0000000..1e9c5fa --- /dev/null +++ b/sys/kern/uipc_syscalls.c @@ -0,0 +1,1945 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * sendfile(2) and related extensions: + * Copyright (c) 1998, David Greenman. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/event.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/uio.h> +#include <sys/vnode.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +static void sf_buf_init(void *arg); +SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) +struct sf_buf *sf_buf_alloc(void); +void sf_buf_free(void *addr, void *args); + +static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); +static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); + +static int accept1(struct thread *td, struct accept_args *uap, int compat); +static int getsockname1(struct thread *td, struct getsockname_args *uap, + int compat); +static int getpeername1(struct thread *td, struct getpeername_args *uap, + int compat); + +/* + * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the + * sf_freelist head with the sf_lock mutex. + */ +static struct { + SLIST_HEAD(, sf_buf) sf_head; + struct mtx sf_lock; +} sf_freelist; + +vm_offset_t sf_base; +struct sf_buf *sf_bufs; +u_int sf_buf_alloc_want; + +/* + * System call interface to the socket abstraction. 
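+ *
+ * Most of the entry points below resolve their descriptor argument to a
+ * socket with fgetsock(), carry out the request through the protocol-
+ * independent so*() routines, and drop that reference with fputsock()
+ * before returning.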
+ */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +/* + * MPSAFE + */ +int +socket(td, uap) + struct thread *td; + register struct socket_args /* { + int domain; + int type; + int protocol; + } */ *uap; +{ + struct filedesc *fdp; + struct socket *so; + struct file *fp; + int fd, error; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + error = falloc(td, &fp, &fd); + if (error) + goto done2; + fhold(fp); + error = socreate(uap->domain, &so, uap->type, uap->protocol, + td->td_ucred, td); + FILEDESC_LOCK(fdp); + if (error) { + if (fdp->fd_ofiles[fd] == fp) { + fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + } else { + fp->f_data = so; /* already has ref count */ + fp->f_flag = FREAD|FWRITE; + fp->f_ops = &socketops; + fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = fd; + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +bind(td, uap) + struct thread *td; + register struct bind_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct socket *so; + struct sockaddr *sa; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) + goto done1; + error = sobind(so, sa, td); + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +listen(td, uap) + struct thread *td; + register struct listen_args /* { + int s; + int backlog; + } */ *uap; +{ + struct socket *so; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + error = solisten(so, uap->backlog, td); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * accept1() + * MPSAFE + */ +static int +accept1(td, uap, compat) + struct thread *td; + register struct accept_args /* { + int s; + caddr_t name; + int *anamelen; + } */ *uap; + int compat; +{ + struct filedesc *fdp; + struct file *nfp = NULL; + struct sockaddr *sa; + int namelen, error, s; + struct socket *head, *so; + int fd; + u_int fflag; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + if (uap->name) { + error = copyin(uap->anamelen, &namelen, sizeof (namelen)); + if(error) + goto done2; + } + error = fgetsock(td, uap->s, &head, &fflag); + if (error) + goto done2; + s = splnet(); + if ((head->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + error = EINVAL; + goto done; + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { + splx(s); + error = EWOULDBLOCK; + goto done; + } + while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { + if (head->so_state & SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; + break; + } + error = tsleep(&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { + splx(s); + goto done; + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + splx(s); + goto done; + } + + /* + * At this point we know that there is at least one connection + * ready to be accepted. Remove it from the queue prior to + * allocating the file descriptor for it since falloc() may + * block allowing another process to accept the connection + * instead. + */ + so = TAILQ_FIRST(&head->so_comp); + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + + error = falloc(td, &nfp, &fd); + if (error) { + /* + * Probably ran out of file descriptors. 
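+ * (falloc() fails with EMFILE or ENFILE when the descriptor or file
+ * tables are full.)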
Put the + * unaccepted connection back onto the queue and + * do another wakeup so some other process might + * have a chance at it. + */ + TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); + head->so_qlen++; + wakeup_one(&head->so_timeo); + splx(s); + goto done; + } + fhold(nfp); + td->td_retval[0] = fd; + + /* connection has been removed from the listen queue */ + KNOTE(&head->so_rcv.sb_sel.si_note, 0); + + so->so_state &= ~SS_COMP; + so->so_head = NULL; + if (head->so_sigio != NULL) + fsetown(fgetown(head->so_sigio), &so->so_sigio); + + FILE_LOCK(nfp); + soref(so); /* file descriptor reference */ + nfp->f_data = so; /* nfp has ref count from falloc */ + nfp->f_flag = fflag; + nfp->f_ops = &socketops; + nfp->f_type = DTYPE_SOCKET; + FILE_UNLOCK(nfp); + sa = 0; + error = soaccept(so, &sa); + if (error) { + /* + * return a namelen of zero for older code which might + * ignore the return value from accept. + */ + if (uap->name != NULL) { + namelen = 0; + (void) copyout(&namelen, + uap->anamelen, sizeof(*uap->anamelen)); + } + goto noconnection; + } + if (sa == NULL) { + namelen = 0; + if (uap->name) + goto gotnoname; + splx(s); + error = 0; + goto done; + } + if (uap->name) { + /* check sa_len before it is destroyed */ + if (namelen > sa->sa_len) + namelen = sa->sa_len; +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, uap->name, (u_int)namelen); + if (!error) +gotnoname: + error = copyout(&namelen, + uap->anamelen, sizeof (*uap->anamelen)); + } +noconnection: + if (sa) + FREE(sa, M_SONAME); + + /* + * close the new descriptor, assuming someone hasn't ripped it + * out from under us. + */ + if (error) { + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[fd] == nfp) { + fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(nfp, td); + } else { + FILEDESC_UNLOCK(fdp); + } + } + splx(s); + + /* + * Release explicitly held references before returning. 
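+ * (The extra hold on the new file from fhold() and the reference on the
+ * listening socket taken by fgetsock() are both dropped below.)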
+ */ +done: + if (nfp != NULL) + fdrop(nfp, td); + fputsock(head); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE (accept1() is MPSAFE) + */ +int +accept(td, uap) + struct thread *td; + struct accept_args *uap; +{ + + return (accept1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE (accept1() is MPSAFE) + */ +int +oaccept(td, uap) + struct thread *td; + struct accept_args *uap; +{ + + return (accept1(td, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +connect(td, uap) + struct thread *td; + register struct connect_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct socket *so; + struct sockaddr *sa; + int error, s; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + error = EALREADY; + goto done1; + } + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + goto done1; + error = soconnect(so, sa, td); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + FREE(sa, M_SONAME); + error = EINPROGRESS; + goto done1; + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0); + if (error) + break; + } + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + FREE(sa, M_SONAME); + if (error == ERESTART) + error = EINTR; +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +socketpair(td, uap) + struct thread *td; + register struct socketpair_args /* { + int domain; + int type; + int protocol; + int *rsv; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + mtx_lock(&Giant); + error = socreate(uap->domain, &so1, uap->type, uap->protocol, + td->td_ucred, td); + if (error) + goto done2; + error = socreate(uap->domain, &so2, uap->type, uap->protocol, + td->td_ucred, td); + if (error) + goto free1; + error = falloc(td, &fp1, &fd); + if (error) + goto free2; + fhold(fp1); + sv[0] = fd; + fp1->f_data = so1; /* so1 already has ref count */ + error = falloc(td, &fp2, &fd); + if (error) + goto free3; + fhold(fp2); + fp2->f_data = so2; /* so2 already has ref count */ + sv[1] = fd; + error = soconnect2(so1, so2); + if (error) + goto free4; + if (uap->type == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. 
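+ * Connecting so1 to so2 does not implicitly connect so2 back to so1,
+ * so the reverse connection has to be established explicitly as well.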
+ */ + error = soconnect2(so2, so1); + if (error) + goto free4; + } + FILE_LOCK(fp1); + fp1->f_flag = FREAD|FWRITE; + fp1->f_ops = &socketops; + fp1->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp1); + FILE_LOCK(fp2); + fp2->f_flag = FREAD|FWRITE; + fp2->f_ops = &socketops; + fp2->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp2); + error = copyout(sv, uap->rsv, 2 * sizeof (int)); + fdrop(fp1, td); + fdrop(fp2, td); + goto done2; +free4: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[sv[1]] == fp2) { + fdp->fd_ofiles[sv[1]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp2, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(fp2, td); +free3: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[sv[0]] == fp1) { + fdp->fd_ofiles[sv[0]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp1, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(fp1, td); +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +sendit(td, s, mp, flags) + register struct thread *td; + int s; + register struct msghdr *mp; + int flags; +{ + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *control; + struct sockaddr *to = NULL; + int len, error; + struct socket *so; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; + int iovlen; +#endif + + if ((error = fgetsock(td, s, &so, NULL)) != 0) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) { + error = EINVAL; + goto bad; + } + } + if (mp->msg_name) { + error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); + if (error) + goto bad; + } + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_TRYWAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else { + control = 0; + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_GENIO)) { + iovlen = auio.uio_iovcnt * sizeof (struct iovec); + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy(auio.uio_iov, ktriov, iovlen); + ktruio = auio; + } +#endif + len = auio.uio_resid; + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + flags, td); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* Generation of SIGPIPE can be controlled per socket */ + if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + if (error == 0) + td->td_retval[0] = len - auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = td->td_retval[0]; + ktrgenio(s, UIO_WRITE, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif +bad: + fputsock(so); + if (to) + FREE(to, M_SONAME); + return (error); +} + +/* + * MPSAFE + */ +int +sendto(td, 
uap) + struct thread *td; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + mtx_lock(&Giant); + error = sendit(td, uap->s, &msg, uap->flags); + mtx_unlock(&Giant); + return (error); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +osend(td, uap) + struct thread *td; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 0; + mtx_lock(&Giant); + error = sendit(td, uap->s, &msg, uap->flags); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +osendmsg(td, uap) + struct thread *td; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(td, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif + +/* + * MPSAFE + */ +int +sendmsg(td, uap) + struct thread *td; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (msg)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + if (msg.msg_iovlen && + (error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(td, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +recvit(td, s, mp, namelenp) + register struct thread *td; + int s; + register struct msghdr *mp; + void *namelenp; +{ + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *m, *control = 0; + caddr_t ctlbuf; + struct socket *so; + struct sockaddr *fromsa = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; + int iovlen; +#endif + + if ((error = fgetsock(td, s, &so, NULL)) != 0) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + 
auio.uio_td = td; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) { + fputsock(so); + return (EINVAL); + } + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_GENIO)) { + iovlen = auio.uio_iovcnt * sizeof (struct iovec); + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy(auio.uio_iov, ktriov, iovlen); + ktruio = auio; + } +#endif + len = auio.uio_resid; + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, + &mp->msg_flags); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = len - auio.uio_resid; + ktrgenio(s, UIO_READ, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + td->td_retval[0] = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || fromsa == 0) + len = 0; + else { +#ifndef MIN +#define MIN(a,b) ((a)>(b)?(b):(a)) +#endif + /* save sa_len before it is destroyed by MSG_COMPAT */ + len = MIN(len, fromsa->sa_len); +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + ((struct osockaddr *)fromsa)->sa_family = + fromsa->sa_family; +#endif + error = copyout(fromsa, mp->msg_name, (unsigned)len); + if (error) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout(&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. 
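+ * That is, only SOL_SOCKET/SCM_RIGHTS control data is passed through
+ * to old binaries, and even then the cmsghdr itself is stripped so
+ * they see only the bare descriptor array.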
+ */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + m = control; + mp->msg_controllen = 0; + ctlbuf = mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + if ((error = copyout(mtod(m, caddr_t), + ctlbuf, tocopy)) != 0) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; + } + mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; + } +out: + fputsock(so); + if (fromsa) + FREE(fromsa, M_SONAME); + if (control) + m_freem(control); + return (error); +} + +/* + * MPSAFE + */ +int +recvfrom(td, uap) + struct thread *td; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + mtx_lock(&Giant); + if (uap->fromlenaddr) { + error = copyin(uap->fromlenaddr, + &msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + goto done2; + } else { + msg.msg_namelen = 0; + } + msg.msg_name = uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + error = recvit(td, uap->s, &msg, uap->fromlenaddr); +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +orecvfrom(td, uap) + struct thread *td; + struct recvfrom_args *uap; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(td, uap)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +orecv(td, uap) + struct thread *td; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + mtx_lock(&Giant); + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + error = recvit(td, uap->s, &msg, NULL); + mtx_unlock(&Giant); + return (error); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. 
+ * + * MPSAFE + */ +int +orecvmsg(td, uap) + struct thread *td; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); + if (error) + return (error); + + mtx_lock(&Giant); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); + + if (msg.msg_controllen && error == 0) + error = copyout(&msg.msg_controllen, + &uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif + +/* + * MPSAFE + */ +int +recvmsg(td, uap) + struct thread *td; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (msg)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin(uiov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(td, uap->s, &msg, NULL); + if (!error) { + msg.msg_iov = uiov; + error = copyout(&msg, uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +shutdown(td, uap) + struct thread *td; + register struct shutdown_args /* { + int s; + int how; + } */ *uap; +{ + struct socket *so; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + error = soshutdown(so, uap->how); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setsockopt(td, uap) + struct thread *td; + register struct setsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int valsize; + } */ *uap; +{ + struct socket *so; + struct sockopt sopt; + int error; + + if (uap->val == 0 && uap->valsize != 0) + return (EFAULT); + if (uap->valsize < 0) + return (EINVAL); + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = uap->valsize; + sopt.sopt_td = td; + error = sosetopt(so, &sopt); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getsockopt(td, uap) + struct thread *td; + register struct getsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int *avalsize; + } */ *uap; +{ + int valsize, error; + struct socket *so; + struct sockopt sopt; + + mtx_lock(&Giant); 
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if (uap->val) { + error = copyin(uap->avalsize, &valsize, sizeof (valsize)); + if (error) + goto done1; + if (valsize < 0) { + error = EINVAL; + goto done1; + } + } else { + valsize = 0; + } + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ + sopt.sopt_td = td; + + error = sogetopt(so, &sopt); + if (error == 0) { + valsize = sopt.sopt_valsize; + error = copyout(&valsize, uap->avalsize, sizeof (valsize)); + } +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * getsockname1() - Get socket name. + * + * MPSAFE + */ +/* ARGSUSED */ +static int +getsockname1(td, uap, compat) + struct thread *td; + register struct getsockname_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct socket *so; + struct sockaddr *sa; + int len, error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) + goto done2; + error = copyin(uap->alen, &len, sizeof (len)); + if (error) + goto done1; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = sa->sa_family; +#endif + error = copyout(sa, uap->asa, (u_int)len); + if (error == 0) +gotnothing: + error = copyout(&len, uap->alen, sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +getsockname(td, uap) + struct thread *td; + struct getsockname_args *uap; +{ + + return (getsockname1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +ogetsockname(td, uap) + struct thread *td; + struct getsockname_args *uap; +{ + + return (getsockname1(td, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * getpeername1() - Get name of peer for connected socket. + * + * MPSAFE + */ +/* ARGSUSED */ +static int +getpeername1(td, uap, compat) + struct thread *td; + register struct getpeername_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct socket *so; + struct sockaddr *sa; + int len, error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) + goto done2; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { + error = ENOTCONN; + goto done1; + } + error = copyin(uap->alen, &len, sizeof (len)); + if (error) + goto done1; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, uap->asa, (u_int)len); + if (error) + goto bad; +gotnothing: + error = copyout(&len, uap->alen, sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +getpeername(td, uap) + struct thread *td; + struct getpeername_args *uap; +{ + + return (getpeername1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +ogetpeername(td, uap) + struct thread *td; + struct ogetpeername_args *uap; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
*/ + return (getpeername1(td, (struct getpeername_args *)uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + if (type == MT_SONAME && (u_int)buflen <= 112) + buflen = MLEN; /* unix domain compat. hack */ + else +#endif + return (EINVAL); + } + m = m_get(M_TRYWAIT, type); + if (m == NULL) + return (ENOBUFS); + m->m_len = buflen; + error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + if (error) + (void) m_free(m); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); + +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = buflen; + } + } + return (error); +} + +int +getsockaddr(namp, uaddr, len) + struct sockaddr **namp; + caddr_t uaddr; + size_t len; +{ + struct sockaddr *sa; + int error; + + if (len > SOCK_MAXADDRLEN) + return ENAMETOOLONG; + MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); + error = copyin(uaddr, sa, len); + if (error) { + FREE(sa, M_SONAME); + } else { +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = len; + *namp = sa; + } + return error; +} + +/* + * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) + * XXX - The sf_buf functions are currently private to sendfile(2), so have + * been made static, but may be useful in the future for doing zero-copy in + * other parts of the networking code. + */ +static void +sf_buf_init(void *arg) +{ + int i; + + mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); + mtx_lock(&sf_freelist.sf_lock); + SLIST_INIT(&sf_freelist.sf_head); + sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); + sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, + M_NOWAIT | M_ZERO); + for (i = 0; i < nsfbufs; i++) { + sf_bufs[i].kva = sf_base + i * PAGE_SIZE; + SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); + } + sf_buf_alloc_want = 0; + mtx_unlock(&sf_freelist.sf_lock); +} + +/* + * Get an sf_buf from the freelist. Will block if none are available. + */ +struct sf_buf * +sf_buf_alloc() +{ + struct sf_buf *sf; + int error; + + mtx_lock(&sf_freelist.sf_lock); + while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) { + sf_buf_alloc_want++; + error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH, + "sfbufa", 0); + sf_buf_alloc_want--; + + /* + * If we got a signal, don't risk going back to sleep. + */ + if (error) + break; + } + if (sf != NULL) + SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list); + mtx_unlock(&sf_freelist.sf_lock); + return (sf); +} + +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) + +/* + * Detatch mapped page and release resources back to the system. + */ +void +sf_buf_free(void *addr, void *args) +{ + struct sf_buf *sf; + struct vm_page *m; + + GIANT_REQUIRED; + + sf = dtosf(addr); + pmap_qremove((vm_offset_t)addr, 1); + m = sf->m; + vm_page_unwire(m, 0); + /* + * Check for the object going away on us. This can + * happen since we don't hold a reference to it. + * If so, we're responsible for freeing the page. 
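+ * (A page that is no longer wired and has no owning object would
+ * otherwise be leaked.)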
+ */ + if (m->wire_count == 0 && m->object == NULL) + vm_page_free(m); + sf->m = NULL; + mtx_lock(&sf_freelist.sf_lock); + SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list); + if (sf_buf_alloc_want > 0) + wakeup_one(&sf_freelist); + mtx_unlock(&sf_freelist.sf_lock); +} + +/* + * sendfile(2) + * + * MPSAFE + * + * int sendfile(int fd, int s, off_t offset, size_t nbytes, + * struct sf_hdtr *hdtr, off_t *sbytes, int flags) + * + * Send a file specified by 'fd' and starting at 'offset' to a socket + * specified by 's'. Send only 'nbytes' of the file or until EOF if + * nbytes == 0. Optionally add a header and/or trailer to the socket + * output. If specified, write the total number of bytes sent into *sbytes. + * + */ +int +sendfile(struct thread *td, struct sendfile_args *uap) +{ + struct vnode *vp; + struct vm_object *obj; + struct socket *so = NULL; + struct mbuf *m; + struct sf_buf *sf; + struct vm_page *pg; + struct writev_args nuap; + struct sf_hdtr hdtr; + off_t off, xfsize, hdtr_size, sbytes = 0; + int error, s; + + mtx_lock(&Giant); + + hdtr_size = 0; + + /* + * The descriptor must be a regular file and have a backing VM object. + */ + if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) + goto done; + if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { + error = EINVAL; + goto done; + } + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done; + if (so->so_type != SOCK_STREAM) { + error = EINVAL; + goto done; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto done; + } + if (uap->offset < 0) { + error = EINVAL; + goto done; + } + + /* + * If specified, get the pointer to the sf_hdtr struct for + * any headers/trailers. + */ + if (uap->hdtr != NULL) { + error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); + if (error) + goto done; + /* + * Send any headers. Wimp out and use writev(2). + */ + if (hdtr.headers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.headers; + nuap.iovcnt = hdtr.hdr_cnt; + error = writev(td, &nuap); + if (error) + goto done; + hdtr_size += td->td_retval[0]; + } + } + + /* + * Protect against multiple writers to the socket. + */ + (void) sblock(&so->so_snd, M_WAITOK); + + /* + * Loop through the pages in the file, starting with the requested + * offset. Get a file page (do I/O if necessary), map the file page + * into an sf_buf, attach an mbuf header to the sf_buf, and queue + * it on the socket. + */ + for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { + vm_pindex_t pindex; + vm_offset_t pgoff; + + pindex = OFF_TO_IDX(off); +retry_lookup: + /* + * Calculate the amount to transfer. Not to exceed a page, + * the EOF, or the passed in nbytes. + */ + xfsize = obj->un_pager.vnp.vnp_size - off; + if (xfsize > PAGE_SIZE) + xfsize = PAGE_SIZE; + pgoff = (vm_offset_t)(off & PAGE_MASK); + if (PAGE_SIZE - pgoff < xfsize) + xfsize = PAGE_SIZE - pgoff; + if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) + xfsize = uap->nbytes - sbytes; + if (xfsize <= 0) + break; + /* + * Optimize the non-blocking case by looking at the socket space + * before going to the extra work of constituting the sf_buf. + */ + if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { + if (so->so_state & SS_CANTSENDMORE) + error = EPIPE; + else + error = EAGAIN; + sbunlock(&so->so_snd); + goto done; + } + /* + * Attempt to look up the page. + * + * Allocate if not found + * + * Wait and loop if busy. 
+ */ + pg = vm_page_lookup(obj, pindex); + + if (pg == NULL) { + pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); + if (pg == NULL) { + VM_WAIT; + goto retry_lookup; + } + vm_page_wakeup(pg); + } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { + goto retry_lookup; + } + + /* + * Wire the page so it does not get ripped out from under + * us. + */ + + vm_page_wire(pg); + + /* + * If page is not valid for what we need, initiate I/O + */ + + if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { + int bsize; + + /* + * Ensure that our page is still around when the I/O + * completes. + */ + vm_page_io_start(pg); + + /* + * Get the page from backing store. + */ + bsize = vp->v_mount->mnt_stat.f_iosize; + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); + error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, + trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | + IO_VMIO | ((MAXBSIZE / bsize) << 16), + td->td_ucred, NULL, td); + VOP_UNLOCK(vp, 0, td); + vm_page_flag_clear(pg, PG_ZERO); + vm_page_io_finish(pg); + if (error) { + vm_page_unwire(pg, 0); + /* + * See if anyone else might know about this page. + * If not and it is not valid, then free it. + */ + if (pg->wire_count == 0 && pg->valid == 0 && + pg->busy == 0 && !(pg->flags & PG_BUSY) && + pg->hold_count == 0) { + vm_page_busy(pg); + vm_page_free(pg); + } + sbunlock(&so->so_snd); + goto done; + } + } + + + /* + * Get a sendfile buf. We usually wait as long as necessary, + * but this wait can be interrupted. + */ + if ((sf = sf_buf_alloc()) == NULL) { + vm_page_unwire(pg, 0); + if (pg->wire_count == 0 && pg->object == NULL) + vm_page_free(pg); + sbunlock(&so->so_snd); + error = EINTR; + goto done; + } + + /* + * Allocate a kernel virtual page and insert the physical page + * into it. + */ + sf->m = pg; + pmap_qenter(sf->kva, &pg, 1); + /* + * Get an mbuf header and set it up as having external storage. + */ + MGETHDR(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + sf_buf_free((void *)sf->kva, NULL); + sbunlock(&so->so_snd); + goto done; + } + /* + * Setup external storage for mbuf. + */ + MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, + EXT_SFBUF); + m->m_data = (char *) sf->kva + pgoff; + m->m_pkthdr.len = m->m_len = xfsize; + /* + * Add the buffer to the socket buffer chain. + */ + s = splnet(); +retry_space: + /* + * Make sure that the socket is still able to take more data. + * CANTSENDMORE being true usually means that the connection + * was closed. so_error is true when an error was sensed after + * a previous send. + * The state is checked after the page mapping and buffer + * allocation above since those operations may block and make + * any socket checks stale. From this point forward, nothing + * blocks before the pru_send (or more accurately, any blocking + * results in a loop back to here to re-check). + */ + if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + } else { + error = so->so_error; + so->so_error = 0; + } + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + /* + * Wait for socket space to become available. We do this just + * after checking the connection state above in order to avoid + * a race condition with sbwait(). + */ + if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { + if (so->so_state & SS_NBIO) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + error = EAGAIN; + goto done; + } + error = sbwait(&so->so_snd); + /* + * An error from sbwait usually indicates that we've + * been interrupted by a signal. 
If we've sent anything + * then return bytes sent, otherwise return the error. + */ + if (error) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + goto retry_space; + } + error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); + splx(s); + if (error) { + sbunlock(&so->so_snd); + goto done; + } + } + sbunlock(&so->so_snd); + + /* + * Send trailers. Wimp out and use writev(2). + */ + if (uap->hdtr != NULL && hdtr.trailers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.trailers; + nuap.iovcnt = hdtr.trl_cnt; + error = writev(td, &nuap); + if (error) + goto done; + hdtr_size += td->td_retval[0]; + } + +done: + /* + * If there was no error we have to clear td->td_retval[0] + * because it may have been set by writev. + */ + if (error == 0) { + td->td_retval[0] = 0; + } + if (uap->sbytes != NULL) { + sbytes += hdtr_size; + copyout(&sbytes, uap->sbytes, sizeof(off_t)); + } + if (vp) + vrele(vp); + if (so) + fputsock(so); + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c new file mode 100644 index 0000000..b227d91 --- /dev/null +++ b/sys/kern/uipc_usrreq.c @@ -0,0 +1,1503 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/un.h> +#include <sys/unpcb.h> +#include <sys/vnode.h> + +#include <vm/uma.h> + +static uma_zone_t unp_zone; +static unp_gen_t unp_gencnt; +static u_int unp_count; + +static struct unp_head unp_shead, unp_dhead; + +/* + * Unix communications domain. + * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + * lock pushdown + */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach(struct socket *); +static void unp_detach(struct unpcb *); +static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *); +static int unp_connect(struct socket *,struct sockaddr *, struct thread *); +static void unp_disconnect(struct unpcb *); +static void unp_shutdown(struct unpcb *); +static void unp_drop(struct unpcb *, int); +static void unp_gc(void); +static void unp_scan(struct mbuf *, void (*)(struct file *)); +static void unp_mark(struct file *); +static void unp_discard(struct file *); +static void unp_freerights(struct file **, int); +static int unp_internalize(struct mbuf **, struct thread *); +static int unp_listen(struct unpcb *, struct thread *); + +static int +uipc_abort(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_drop(unp, ECONNABORTED); + unp_detach(unp); + sotryfree(so); + return 0; +} + +static int +uipc_accept(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
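+ * Otherwise hand back the unnamed local address (sun_noname).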
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + } else { + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + } + return 0; +} + +static int +uipc_attach(struct socket *so, int proto, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp != 0) + return EISCONN; + return unp_attach(so); +} + +static int +uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + return unp_bind(unp, nam, td); +} + +static int +uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + return unp_connect(so, nam, curthread); +} + +static int +uipc_connect2(struct socket *so1, struct socket *so2) +{ + struct unpcb *unp = sotounpcb(so1); + + if (unp == 0) + return EINVAL; + + return unp_connect2(so1, so2); +} + +/* control is EOPNOTSUPP */ + +static int +uipc_detach(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + unp_detach(unp); + return 0; +} + +static int +uipc_disconnect(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_disconnect(unp); + return 0; +} + +static int +uipc_listen(struct socket *so, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0 || unp->unp_vnode == 0) + return EINVAL; + return unp_listen(unp, td); +} + +static int +uipc_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_conn && unp->unp_conn->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + return 0; +} + +static int +uipc_rcvd(struct socket *so, int flags) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + u_long newhiwat; + + if (unp == 0) + return EINVAL; + switch (so->so_type) { + case SOCK_DGRAM: + panic("uipc_rcvd DGRAM?"); + /*NOTREACHED*/ + + case SOCK_STREAM: + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. 
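+ * Space freed on the receive side is credited back to the peer's
+ * send-side limits (sb_mbmax and sb_hiwat) so that a blocked writer
+ * can make progress again.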
+ */ + so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; + unp->unp_mbcnt = so->so_rcv.sb_mbcnt; + newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - + so->so_rcv.sb_cc; + (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, + newhiwat, RLIM_INFINITY); + unp->unp_cc = so->so_rcv.sb_cc; + sowwakeup(so2); + break; + + default: + panic("uipc_rcvd unknown socktype"); + } + return 0; +} + +/* pru_rcvoob is EOPNOTSUPP */ + +static int +uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + int error = 0; + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + u_long newhiwat; + + if (unp == 0) { + error = EINVAL; + goto release; + } + if (flags & PRUS_OOB) { + error = EOPNOTSUPP; + goto release; + } + + if (control && (error = unp_internalize(&control, td))) + goto release; + + switch (so->so_type) { + case SOCK_DGRAM: + { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, td); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = (struct sockaddr *)unp->unp_addr; + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, td); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc_send connected but no connection?"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. + */ + if (control) { + if (sbappendcontrol(&so2->so_rcv, m, control)) + control = 0; + } else + sbappend(&so2->so_rcv, m); + so->so_snd.sb_mbmax -= + so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; + newhiwat = so->so_snd.sb_hiwat - + (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc); + (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, + newhiwat, RLIM_INFINITY); + unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; + sorwakeup(so2); + m = 0; + break; + + default: + panic("uipc_send unknown socktype"); + } + + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
+ */ + if (flags & PRUS_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } + + if (control && error != 0) + unp_dispose(control); + +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return error; +} + +static int +uipc_sense(struct socket *so, struct stat *sb) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + sb->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + sb->st_blksize += so2->so_rcv.sb_cc; + } + sb->st_dev = NOUDEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + sb->st_ino = unp->unp_ino; + return (0); +} + +static int +uipc_shutdown(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + socantsendmore(so); + unp_shutdown(unp); + return 0; +} + +static int +uipc_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1); + else + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + return 0; +} + +struct pr_usrreqs uipc_usrreqs = { + uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect, + uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, + uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, + uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, + sosend, soreceive, sopoll +}; + +int +uipc_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct unpcb *unp = sotounpcb(so); + int error; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case LOCAL_PEERCRED: + if (unp->unp_flags & UNP_HAVEPC) + error = sooptcopyout(sopt, &unp->unp_peercred, + sizeof(unp->unp_peercred)); + else { + if (so->so_type == SOCK_STREAM) + error = ENOTCONN; + else + error = EINVAL; + } + break; + default: + error = EOPNOTSUPP; + break; + } + break; + case SOPT_SET: + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. 
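+ * The defaults below give stream sockets PIPSIZ (8k) of buffering in
+ * each direction, limit datagrams to 2k, and reserve 4k of receive
+ * space for them.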
+ */ +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_DECL(_net_local_stream); +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_DECL(_net_local_dgram); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_DECL(_net_local); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int +unp_attach(so) + struct socket *so; +{ + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + unp = uma_zalloc(unp_zone, M_WAITOK); + if (unp == NULL) + return (ENOBUFS); + bzero(unp, sizeof *unp); + unp->unp_gencnt = ++unp_gencnt; + unp_count++; + LIST_INIT(&unp->unp_refs); + unp->unp_socket = so; + FILEDESC_LOCK(curproc->p_fd); + unp->unp_rvnode = curthread->td_proc->p_fd->fd_rdir; + FILEDESC_UNLOCK(curproc->p_fd); + LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead + : &unp_shead, unp, unp_link); + so->so_pcb = unp; + return (0); +} + +static void +unp_detach(unp) + register struct unpcb *unp; +{ + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + --unp_count; + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (!LIST_EMPTY(&unp->unp_refs)) + unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). 
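+ * Flushing the receive buffer here, before calling unp_gc(), avoids
+ * that ordering problem.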
+ */ + sorflush(unp->unp_socket); + unp_gc(); + } + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + uma_zfree(unp_zone, unp); +} + +static int +unp_bind(unp, nam, td) + struct unpcb *unp; + struct sockaddr *nam; + struct thread *td; +{ + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error, namelen; + struct nameidata nd; + char *buf; + + if (unp->unp_vnode != NULL) + return (EINVAL); + namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); + if (namelen <= 0) + return EINVAL; + buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK); + strncpy(buf, soun->sun_path, namelen); + buf[namelen] = 0; /* null-terminate the string */ +restart: + NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, + buf, td); +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + error = namei(&nd); + if (error) { + free(buf, M_TEMP); + return (error); + } + vp = nd.ni_vp; + if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULL) { + vrele(vp); + free(buf, M_TEMP); + return (EADDRINUSE); + } + error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); + if (error) { + free(buf, M_TEMP); + return (error); + } + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (error) { + free(buf, M_TEMP); + return (error); + } + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + free(buf, M_TEMP); + return (0); +} + +static int +unp_connect(so, nam, td) + struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + register struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp, *unp2, *unp3; + int error, len; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); + if (len <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, len); + buf[len] = 0; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); + if (error) + goto bad; + so2 = vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp = sotounpcb(so); + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = (struct sockaddr_un *) + dup_sockaddr((struct sockaddr *) + unp2->unp_addr, 1); + + /* + * unp_peercred management: + * + * The connecter's (client's) credentials are copied + * from its process structure at the time of connect() + * (which is now). 
+ */ + cru2x(td->td_ucred, &unp3->unp_peercred); + unp3->unp_flags |= UNP_HAVEPC; + /* + * The receiver's (server's) credentials are copied + * from the unp_peercred member of socket on which the + * former called listen(); unp_listen() cached that + * process's credentials at that time so we can use + * them now. + */ + KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, + ("unp_connect: listener without cached peercred")); + memcpy(&unp->unp_peercred, &unp2->unp_peercred, + sizeof(unp->unp_peercred)); + unp->unp_flags |= UNP_HAVEPC; + + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +int +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +static void +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + LIST_REMOVE(unp, unp_reflink); + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +void +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +static int +unp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n; + struct unpcb *unp, **unp_list; + unp_gen_t gencnt; + struct xunpgen *xug; + struct unp_head *head; + struct xunpcb *xu; + + head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = unp_count; + req->oldidx = 2 * (sizeof *xug) + + (n + n/8) * sizeof(struct xunpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); + gencnt = unp_gencnt; + n = unp_count; + + xug->xug_len = sizeof *xug; + xug->xug_count = n; + xug->xug_gen = gencnt; + xug->xug_sogen = so_gencnt; + error = SYSCTL_OUT(req, xug, sizeof *xug); + if (error) { + free(xug, M_TEMP); + return error; + } + + unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); + + for (unp = LIST_FIRST(head), i = 0; unp && i < n; + unp = LIST_NEXT(unp, unp_link)) { + if (unp->unp_gencnt <= gencnt) { + if (cr_cansee(req->td->td_ucred, + unp->unp_socket->so_cred)) + continue; + unp_list[i++] = unp; + } + } + n = i; /* in case we lost some during malloc */ + + error = 0; + xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK); + for (i = 0; i < n; i++) { + unp = unp_list[i]; + if (unp->unp_gencnt <= gencnt) { + xu->xu_len = sizeof *xu; + xu->xu_unpp = unp; + /* + * XXX - need more locking here to protect against + * connect/disconnect races for SMP. 
+ */ + if (unp->unp_addr) + bcopy(unp->unp_addr, &xu->xu_addr, + unp->unp_addr->sun_len); + if (unp->unp_conn && unp->unp_conn->unp_addr) + bcopy(unp->unp_conn->unp_addr, + &xu->xu_caddr, + unp->unp_conn->unp_addr->sun_len); + bcopy(unp, &xu->xu_unp, sizeof *unp); + sotoxsocket(unp->unp_socket, &xu->xu_socket); + error = SYSCTL_OUT(req, xu, sizeof *xu); + } + } + free(xu, M_TEMP); + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + xug->xug_gen = unp_gencnt; + xug->xug_sogen = so_gencnt; + xug->xug_count = unp_count; + error = SYSCTL_OUT(req, xug, sizeof *xug); + } + free(unp_list, M_TEMP); + free(xug, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", + "List of active local datagram sockets"); +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", + "List of active local stream sockets"); + +static void +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +static void +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); +} + +#ifdef notdef +void +unp_drain() +{ + +} +#endif + +static void +unp_freerights(rp, fdcount) + struct file **rp; + int fdcount; +{ + int i; + struct file *fp; + + for (i = 0; i < fdcount; i++) { + fp = *rp; + /* + * zero the pointer before calling + * unp_discard since it may end up + * in unp_gc().. + */ + *rp++ = 0; + unp_discard(fp); + } +} + +int +unp_externalize(control, controlp) + struct mbuf *control, **controlp; +{ + struct thread *td = curthread; /* XXX */ + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + int i; + int *fdp; + struct file **rp; + struct file *fp; + void *data; + socklen_t clen = control->m_len, datalen; + int error, newfds; + int f; + u_int newlen; + + error = 0; + if (controlp != NULL) /* controlp == NULL => free control messages */ + *controlp = NULL; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_len > clen) { + error = EINVAL; + break; + } + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; + + if (cm->cmsg_level == SOL_SOCKET + && cm->cmsg_type == SCM_RIGHTS) { + newfds = datalen / sizeof(struct file *); + rp = data; + + /* If we're not outputting the discriptors free them. */ + if (error || controlp == NULL) { + unp_freerights(rp, newfds); + goto next; + } + FILEDESC_LOCK(td->td_proc->p_fd); + /* if the new FD's will not fit free them. */ + if (!fdavail(td, newfds)) { + FILEDESC_UNLOCK(td->td_proc->p_fd); + error = EMSGSIZE; + unp_freerights(rp, newfds); + goto next; + } + /* + * now change each pointer to an fd in the global + * table to an integer that is the index to the + * local fd table entry that we set up to point + * to the global one we are transferring. 
+ */ + newlen = newfds * sizeof(int); + *controlp = sbcreatecontrol(NULL, newlen, + SCM_RIGHTS, SOL_SOCKET); + if (*controlp == NULL) { + FILEDESC_UNLOCK(td->td_proc->p_fd); + error = E2BIG; + unp_freerights(rp, newfds); + goto next; + } + + fdp = (int *) + CMSG_DATA(mtod(*controlp, struct cmsghdr *)); + for (i = 0; i < newfds; i++) { + if (fdalloc(td, 0, &f)) + panic("unp_externalize fdalloc failed"); + fp = *rp++; + td->td_proc->p_fd->fd_ofiles[f] = fp; + FILE_LOCK(fp); + fp->f_msgcount--; + FILE_UNLOCK(fp); + unp_rights--; + *fdp++ = f; + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + } else { /* We can just copy anything else across */ + if (error || controlp == NULL) + goto next; + *controlp = sbcreatecontrol(NULL, datalen, + cm->cmsg_type, cm->cmsg_level); + if (*controlp == NULL) { + error = ENOBUFS; + goto next; + } + bcopy(data, + CMSG_DATA(mtod(*controlp, struct cmsghdr *)), + datalen); + } + + controlp = &(*controlp)->m_next; + +next: + if (CMSG_SPACE(datalen) < clen) { + clen -= CMSG_SPACE(datalen); + cm = (struct cmsghdr *) + ((caddr_t)cm + CMSG_SPACE(datalen)); + } else { + clen = 0; + cm = NULL; + } + } + + m_freem(control); + + return (error); +} + +void +unp_init(void) +{ + unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(unp_zone, nmbclusters); + if (unp_zone == 0) + panic("unp_init"); + LIST_INIT(&unp_dhead); + LIST_INIT(&unp_shead); +} + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int +unp_internalize(controlp, td) + struct mbuf **controlp; + struct thread *td; +{ + struct mbuf *control = *controlp; + struct proc *p = td->td_proc; + struct filedesc *fdescp = p->p_fd; + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + struct cmsgcred *cmcred; + struct file **rp; + struct file *fp; + struct timeval *tv; + int i, fd, *fdp; + void *data; + socklen_t clen = control->m_len, datalen; + int error, oldfds; + u_int newlen; + + error = 0; + *controlp = NULL; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET + || cm->cmsg_len > clen) { + error = EINVAL; + goto out; + } + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; + + switch (cm->cmsg_type) { + /* + * Fill in credential information. + */ + case SCM_CREDS: + *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), + SCM_CREDS, SOL_SOCKET); + if (*controlp == NULL) { + error = ENOBUFS; + goto out; + } + + cmcred = (struct cmsgcred *) + CMSG_DATA(mtod(*controlp, struct cmsghdr *)); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = td->td_ucred->cr_ruid; + cmcred->cmcred_gid = td->td_ucred->cr_rgid; + cmcred->cmcred_euid = td->td_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = + td->td_ucred->cr_groups[i]; + break; + + case SCM_RIGHTS: + oldfds = datalen / sizeof (int); + /* + * check that all the FDs passed in refer to legal files + * If not, reject the entire operation. + */ + fdp = data; + FILEDESC_LOCK(fdescp); + for (i = 0; i < oldfds; i++) { + fd = *fdp++; + if ((unsigned)fd >= fdescp->fd_nfiles || + fdescp->fd_ofiles[fd] == NULL) { + FILEDESC_UNLOCK(fdescp); + error = EBADF; + goto out; + } + } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. 
+ */
+ newlen = oldfds * sizeof(struct file *);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_UNLOCK(fdescp);
+ error = E2BIG;
+ goto out;
+ }
+
+ fdp = data;
+ rp = (struct file **)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ for (i = 0; i < oldfds; i++) {
+ fp = fdescp->fd_ofiles[*fdp++];
+ *rp++ = fp;
+ FILE_LOCK(fp);
+ fp->f_count++;
+ fp->f_msgcount++;
+ FILE_UNLOCK(fp);
+ unp_rights++;
+ }
+ FILEDESC_UNLOCK(fdescp);
+ break;
+
+ case SCM_TIMESTAMP:
+ *controlp = sbcreatecontrol(NULL, sizeof(*tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ tv = (struct timeval *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ microtime(tv);
+ break;
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ controlp = &(*controlp)->m_next;
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+out:
+ m_freem(control);
+
+ return (error);
+}
+
+static int unp_defer, unp_gcing;
+
+static void
+unp_gc()
+{
+ register struct file *fp, *nextfp;
+ register struct socket *so;
+ struct file **extra_ref, **fpp;
+ int nunref, i;
+
+ if (unp_gcing)
+ return;
+ unp_gcing = 1;
+ unp_defer = 0;
+ /*
+ * Before going through all this, set all FDs to
+ * be NOT deferred and NOT externally accessible.
+ */
+ sx_slock(&filelist_lock);
+ LIST_FOREACH(fp, &filehead, f_list)
+ fp->f_gcflag &= ~(FMARK|FDEFER);
+ do {
+ LIST_FOREACH(fp, &filehead, f_list) {
+ FILE_LOCK(fp);
+ /*
+ * If the file is not open, skip it.
+ */
+ if (fp->f_count == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If we already marked it as 'defer' in a
+ * previous pass, then try to process it this time
+ * and un-mark it.
+ */
+ if (fp->f_gcflag & FDEFER) {
+ fp->f_gcflag &= ~FDEFER;
+ unp_defer--;
+ } else {
+ /*
+ * If it's not deferred, then check if it's
+ * already marked; if so, skip it.
+ */
+ if (fp->f_gcflag & FMARK) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If all references are from messages
+ * in transit, then skip it. It's not
+ * externally accessible.
+ */
+ if (fp->f_count == fp->f_msgcount) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If it got this far then it must be
+ * externally accessible.
+ */
+ fp->f_gcflag |= FMARK;
+ }
+ /*
+ * Either it was deferred, or it is externally
+ * accessible and not already marked so.
+ * Now check if it is possibly one of OUR sockets.
+ */
+ if (fp->f_type != DTYPE_SOCKET ||
+ (so = (struct socket *)fp->f_data) == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ FILE_UNLOCK(fp);
+ if (so->so_proto->pr_domain != &localdomain ||
+ (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+ continue;
+#ifdef notdef
+ if (so->so_rcv.sb_flags & SB_LOCK) {
+ /*
+ * This is problematical; it's not clear
+ * we need to wait for the sockbuf to be
+ * unlocked (on a uniprocessor, at least),
+ * and it's also not clear what to do
+ * if sbwait returns an error due to receipt
+ * of a signal. If sbwait does return
+ * an error, we'll go into an infinite
+ * loop. Delete all of this for now.
+ */
+ (void) sbwait(&so->so_rcv);
+ goto restart;
+ }
+#endif
+ /*
+ * So, OK, it's one of our sockets and it IS externally
+ * accessible (or was deferred). Now we look
+ * to see if we hold any file descriptors in its
+ * message buffers. Follow those links and mark them
+ * as accessible too.
+ */
+ unp_scan(so->so_rcv.sb_mb, unp_mark);
+ }
+ } while (unp_defer);
+ sx_sunlock(&filelist_lock);
+ /*
+ * We grab an extra reference to each of the file table entries
+ * that are not otherwise accessible and then free the rights
+ * that are stored in messages on them.
+ *
+ * The bug in the original code is a little tricky, so I'll describe
+ * what's wrong with it here.
+ *
+ * It is incorrect to simply unp_discard each entry for f_msgcount
+ * times -- consider the case of sockets A and B that contain
+ * references to each other. On a last close of some other socket,
+ * we trigger a gc since the number of outstanding rights (unp_rights)
+ * is non-zero. If during the sweep phase the gc code unp_discards,
+ * we end up doing a (full) closef on the descriptor. A closef on A
+ * results in the following chain. Closef calls soo_close, which
+ * calls soclose. Soclose calls first (through the switch
+ * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
+ * returns because the previous instance had set unp_gcing, and
+ * we return all the way back to soclose, which marks the socket
+ * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
+ * to free up the rights that are queued in messages on socket A,
+ * i.e., the reference on B. The sorflush calls via the dom_dispose
+ * switch unp_dispose, which unp_scans with unp_discard. This second
+ * instance of unp_discard just calls closef on B.
+ *
+ * Well, a similar chain occurs on B, resulting in a sorflush on B,
+ * which results in another closef on A. Unfortunately, A is already
+ * being closed, and the descriptor has already been marked with
+ * SS_NOFDREF, and soclose panics at this point.
+ *
+ * Here, we first take an extra reference to each inaccessible
+ * descriptor. Then, we call sorflush ourselves, since we know
+ * it is a Unix domain socket anyhow. After we destroy all the
+ * rights carried in messages, we do a last closef to get rid
+ * of our extra reference. This is the last close, and the
+ * unp_detach etc. will shut down the socket.
+ * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK); + sx_slock(&filelist_lock); + for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; + fp = nextfp) { + nextfp = LIST_NEXT(fp, f_list); + FILE_LOCK(fp); + /* + * If it's not open, skip it + */ + if (fp->f_count == 0) { + FILE_UNLOCK(fp); + continue; + } + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ + if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + FILE_UNLOCK(fp); + } + sx_sunlock(&filelist_lock); + /* + * for each FD on our hit list, do the following two things + */ + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + FILE_LOCK(tfp); + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { + FILE_UNLOCK(tfp); + sorflush((struct socket *)(tfp->f_data)); + } else + FILE_UNLOCK(tfp); + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp, (struct thread *) NULL); + free(extra_ref, M_TEMP); + unp_gcing = 0; +} + +void +unp_dispose(m) + struct mbuf *m; +{ + + if (m) + unp_scan(m, unp_discard); +} + +static int +unp_listen(unp, td) + struct unpcb *unp; + struct thread *td; +{ + + cru2x(td->td_ucred, &unp->unp_peercred); + unp->unp_flags |= UNP_HAVEPCCACHED; + return (0); +} + +static void +unp_scan(m0, op) + register struct mbuf *m0; + void (*op)(struct file *); +{ + struct mbuf *m; + struct file **rp; + struct cmsghdr *cm; + void *data; + int i; + socklen_t clen, datalen; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) { + if (m->m_type != MT_CONTROL) + continue; + + cm = mtod(m, struct cmsghdr *); + clen = m->m_len; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_len > clen) + break; + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len + - (caddr_t)data; + + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_RIGHTS) { + qfds = datalen / sizeof (struct file *); + rp = data; + for (i = 0; i < qfds; i++) + (*op)(*rp++); + } + + if (CMSG_SPACE(datalen) < clen) { + clen -= CMSG_SPACE(datalen); + cm = (struct cmsghdr *) + ((caddr_t)cm + CMSG_SPACE(datalen)); + } else { + clen = 0; + cm = NULL; + } + } + } + m0 = m0->m_act; + } +} + +static void +unp_mark(fp) + struct file *fp; +{ + if (fp->f_gcflag & FMARK) + return; + unp_defer++; + fp->f_gcflag |= (FMARK|FDEFER); +} + +static void +unp_discard(fp) + struct file *fp; +{ + FILE_LOCK(fp); + fp->f_msgcount--; + unp_rights--; + FILE_UNLOCK(fp); + (void) closef(fp, (struct thread *)NULL); +} diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/vfs_acl.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. + */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. 
While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. 
+ */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. + */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. 
+ */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. + * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. 
+ */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. + */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. 
+ */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
+ * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c new file mode 100644 index 0000000..891f272 --- /dev/null +++ b/sys/kern/vfs_aio.c @@ -0,0 +1,2307 @@ +/* + * Copyright (c) 1997 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + * + * $FreeBSD$ + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/unistd.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/protosw.h> +#include <sys/socketvar.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/sx.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <sys/event.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <sys/aio.h> + +#include <machine/limits.h> + +#include "opt_vfs_aio.h" + +/* + * Counter for allocating reference ids to new jobs. Wrapped to 1 on + * overflow. 
+ */ +static long jobrefid; + +#define JOBST_NULL 0x0 +#define JOBST_JOBQGLOBAL 0x2 +#define JOBST_JOBRUNNING 0x3 +#define JOBST_JOBFINISHED 0x4 +#define JOBST_JOBQBUF 0x5 +#define JOBST_JOBBFINISHED 0x6 + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef MAX_AIO_PROCS +#define MAX_AIO_PROCS 32 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef TARGET_AIO_PROCS +#define TARGET_AIO_PROCS 4 +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +#ifndef AIOD_TIMEOUT_DEFAULT +#define AIOD_TIMEOUT_DEFAULT (10 * hz) +#endif + +#ifndef AIOD_LIFETIME_DEFAULT +#define AIOD_LIFETIME_DEFAULT (30 * hz) +#endif + +SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); + +static int max_aio_procs = MAX_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, + CTLFLAG_RW, &max_aio_procs, 0, + "Maximum number of kernel threads to use for handling async IO "); + +static int num_aio_procs = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, + CTLFLAG_RD, &num_aio_procs, 0, + "Number of presently active kernel threads for async IO"); + +/* + * The code will adjust the actual number of AIO processes towards this + * number when it gets a chance. + */ +static int target_aio_procs = TARGET_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, + 0, "Preferred number of ready kernel threads for async IO"); + +static int max_queue_count = MAX_AIO_QUEUE; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, + "Maximum number of aio requests to queue, globally"); + +static int num_queue_count = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, + "Number of queued aio requests"); + +static int num_buf_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, + "Number of aio requests presently handled by the buf subsystem"); + +/* Number of async I/O thread in the process of being started */ +/* XXX This should be local to _aio_aqueue() */ +static int num_aio_resv_start = 0; + +static int aiod_timeout; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, + "Timeout value for synchronous aio operations"); + +static int aiod_lifetime; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, + "Maximum lifetime for idle aiod"); + +static int unloadable = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, + "Allow unload of aio (not recommended)"); + + +static int max_aio_per_proc = MAX_AIO_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, + 0, "Maximum active aio requests per process (stored in the process)"); + +static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, + &max_aio_queue_per_proc, 0, + "Maximum queued aio requests per process (stored in the process)"); + +static int max_buf_aio = MAX_BUF_AIO; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, + "Maximum buf aio requests per process (stored in the process)"); + +struct aiocblist { + TAILQ_ENTRY(aiocblist) list; /* List of jobs */ + TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */ + int jobflags; + int jobstate; + int inputcharge; + int outputcharge; + struct callout_handle timeouthandle; + struct buf *bp; 
/* Buffer pointer */ + struct proc *userproc; /* User process */ /* Not td! */ + struct file *fd_file; /* Pointer to file structure */ + struct aio_liojob *lio; /* Optional lio job */ + struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */ + struct klist klist; /* list of knotes */ + struct aiocb uaiocb; /* Kernel I/O control block */ +}; + +/* jobflags */ +#define AIOCBLIST_RUNDOWN 0x4 +#define AIOCBLIST_ASYNCFREE 0x8 +#define AIOCBLIST_DONE 0x10 + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ +#define AIOP_SCHED 0x2 /* proc explicitly scheduled */ + +struct aiothreadlist { + int aiothreadflags; /* AIO proc flags */ + TAILQ_ENTRY(aiothreadlist) list; /* List of processes */ + struct thread *aiothread; /* The AIO thread */ +}; + +/* + * data-structure for lio signal management + */ +struct aio_liojob { + int lioj_flags; + int lioj_buffer_count; + int lioj_buffer_finished_count; + int lioj_queue_count; + int lioj_queue_finished_count; + struct sigevent lioj_signal; /* signal on all I/O done */ + TAILQ_ENTRY(aio_liojob) lioj_list; + struct kaioinfo *lioj_ki; +}; +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ + +/* + * per process aio data structure + */ +struct kaioinfo { + int kaio_flags; /* per process kaio flags */ + int kaio_maxactive_count; /* maximum number of AIOs */ + int kaio_active_count; /* number of currently used AIOs */ + int kaio_qallowed_count; /* maxiumu size of AIO queue */ + int kaio_queue_count; /* size of AIO queue */ + int kaio_ballowed_count; /* maximum number of buffers */ + int kaio_queue_finished_count; /* number of daemon jobs finished */ + int kaio_buffer_count; /* number of physio buffers */ + int kaio_buffer_finished_count; /* count of I/O done */ + struct proc *kaio_p; /* process that uses this kaio block */ + TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */ + TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */ + TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */ + TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */ + TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */ + TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ +}; + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ + +static TAILQ_HEAD(,aiothreadlist) aio_activeproc; /* Active daemons */ +static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* Idle daemons */ +static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ +static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ + +static void aio_init_aioinfo(struct proc *p); +static void aio_onceonly(void); +static int aio_free_entry(struct aiocblist *aiocbe); +static void aio_process(struct aiocblist *aiocbe); +static int aio_newproc(void); +static int aio_aqueue(struct thread *td, struct aiocb *job, int type); +static void aio_physwakeup(struct buf *bp); +static void aio_proc_rundown(struct proc *p); +static int aio_fphysio(struct aiocblist *aiocbe); +static int aio_qphysio(struct proc *p, struct aiocblist *iocb); +static void aio_daemon(void *uproc); +static void aio_swake_cb(struct socket *, struct sockbuf *); +static int aio_unload(void); +static void process_signal(void *aioj); +static int filt_aioattach(struct knote *kn); +static void filt_aiodetach(struct knote *kn); +static int filt_aio(struct knote *kn, long hint); + +/* + * Zones 
for: + * kaio Per process async io info + * aiop async io thread data + * aiocb async io jobs + * aiol list io job pointer - internal to aio_suspend XXX + * aiolio list io jobs + */ +static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; + +/* kqueue filters for aio */ +static struct filterops aio_filtops = + { 0, filt_aioattach, filt_aiodetach, filt_aio }; + +/* + * Main operations function for use as a kernel module. + */ +static int +aio_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + aio_onceonly(); + break; + case MOD_UNLOAD: + error = aio_unload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t aio_mod = { + "aio", + &aio_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(aio_return); +SYSCALL_MODULE_HELPER(aio_suspend); +SYSCALL_MODULE_HELPER(aio_cancel); +SYSCALL_MODULE_HELPER(aio_error); +SYSCALL_MODULE_HELPER(aio_read); +SYSCALL_MODULE_HELPER(aio_write); +SYSCALL_MODULE_HELPER(aio_waitcomplete); +SYSCALL_MODULE_HELPER(lio_listio); + +DECLARE_MODULE(aio, aio_mod, + SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(aio, 1); + +/* + * Startup initialization + */ +static void +aio_onceonly(void) +{ + + /* XXX: should probably just use so->callback */ + aio_swake = &aio_swake_cb; + at_exit(aio_proc_rundown); + at_exec(aio_proc_rundown); + kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); + TAILQ_INIT(&aio_freeproc); + TAILQ_INIT(&aio_activeproc); + TAILQ_INIT(&aio_jobs); + TAILQ_INIT(&aio_bufjobs); + kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiod_timeout = AIOD_TIMEOUT_DEFAULT; + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; +} + +/* + * Callback for unload of AIO when used as a module. + */ +static int +aio_unload(void) +{ + + /* + * XXX: no unloads by default, it's too dangerous. + * perhaps we could do it if locked out callers and then + * did an aio_proc_rundown() on each process. + */ + if (!unloadable) + return (EOPNOTSUPP); + + aio_swake = NULL; + rm_at_exit(aio_proc_rundown); + rm_at_exec(aio_proc_rundown); + kqueue_del_filteropts(EVFILT_AIO); + return (0); +} + +/* + * Init the per-process aioinfo structure. The aioinfo limits are set + * per-process for user limit (resource) management. 
+ */ +static void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + if (p->p_aioinfo == NULL) { + ki = uma_zalloc(kaio_zone, M_WAITOK); + p->p_aioinfo = ki; + ki->kaio_flags = 0; + ki->kaio_maxactive_count = max_aio_per_proc; + ki->kaio_active_count = 0; + ki->kaio_qallowed_count = max_aio_queue_per_proc; + ki->kaio_queue_count = 0; + ki->kaio_ballowed_count = max_buf_aio; + ki->kaio_buffer_count = 0; + ki->kaio_buffer_finished_count = 0; + ki->kaio_p = p; + TAILQ_INIT(&ki->kaio_jobdone); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_bufdone); + TAILQ_INIT(&ki->kaio_bufqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + TAILQ_INIT(&ki->kaio_sockqueue); + } + + while (num_aio_procs < target_aio_procs) + aio_newproc(); +} + +/* + * Free a job entry. Wait for completion if it is currently active, but don't + * delay forever. If we delay, we return a flag that says that we have to + * restart the queue scan. + */ +static int +aio_free_entry(struct aiocblist *aiocbe) +{ + struct kaioinfo *ki; + struct aio_liojob *lj; + struct proc *p; + int error; + int s; + + if (aiocbe->jobstate == JOBST_NULL) + panic("aio_free_entry: freeing already free job"); + + p = aiocbe->userproc; + ki = p->p_aioinfo; + lj = aiocbe->lio; + if (ki == NULL) + panic("aio_free_entry: missing p->p_aioinfo"); + + while (aiocbe->jobstate == JOBST_JOBRUNNING) { + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) + return 0; + aiocbe->jobflags |= AIOCBLIST_RUNDOWN; + tsleep(aiocbe, PRIBIO, "jobwai", 0); + } + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + + if (aiocbe->bp == NULL) { + if (ki->kaio_queue_count <= 0) + panic("aio_free_entry: process queue size <= 0"); + if (num_queue_count <= 0) + panic("aio_free_entry: system wide queue size <= 0"); + + if (lj) { + lj->lioj_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_queue_finished_count--; + } + ki->kaio_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_queue_finished_count--; + num_queue_count--; + } else { + if (lj) { + lj->lioj_buffer_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_buffer_finished_count--; + } + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_buffer_finished_count--; + ki->kaio_buffer_count--; + num_buf_aio--; + } + + /* aiocbe is going away, we need to destroy any knotes */ + /* XXXKSE Note the thread here is used to eventually find the + * owning process again, but it is also used to do a fo_close + * and that requires the thread. (but does it require the + * OWNING thread? (or maybe the running thread?) + * There is a semantic problem here... 
+ */ + knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */ + + if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) + && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + + if (aiocbe->jobstate == JOBST_JOBQBUF) { + if ((error = aio_fphysio(aiocbe)) != 0) + return error; + if (aiocbe->jobstate != JOBST_JOBBFINISHED) + panic("aio_free_entry: invalid physio finish-up state"); + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) { + s = splnet(); + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); + splx(s); + } else if (aiocbe->jobstate == JOBST_JOBFINISHED) + TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); + else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + if (aiocbe->bp) { + vunmapbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); + aiocbe->bp = NULL; + } + } + if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + uma_zfree(aiolio_zone, lj); + } + aiocbe->jobstate = JOBST_NULL; + untimeout(process_signal, aiocbe, aiocbe->timeouthandle); + fdrop(aiocbe->fd_file, curthread); + uma_zfree(aiocb_zone, aiocbe); + return 0; +} + +/* + * Rundown the jobs for a given process. + */ +static void +aio_proc_rundown(struct proc *p) +{ + int s; + struct kaioinfo *ki; + struct aio_liojob *lj, *ljn; + struct aiocblist *aiocbe, *aiocbn; + struct file *fp; + struct socket *so; + + ki = p->p_aioinfo; + if (ki == NULL) + return; + + ki->kaio_flags |= LIOJ_SIGNAL_POSTED; + while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > + ki->kaio_buffer_finished_count)) { + ki->kaio_flags |= KAIO_RUNDOWN; + if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) + break; + } + + /* + * Move any aio ops that are waiting on socket I/O to the normal job + * queues so they are cleaned up with any others. + */ + s = splnet(); + for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = + aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + fp = aiocbe->fd_file; + if (fp != NULL) { + so = (struct socket *)fp->f_data; + TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); + if (TAILQ_EMPTY(&so->so_aiojobq)) { + so->so_snd.sb_flags &= ~SB_AIO; + so->so_rcv.sb_flags &= ~SB_AIO; + } + } + TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); + TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); + TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); + } + splx(s); + +restart1: + for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart1; + } + +restart2: + for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = + aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart2; + } + +/* + * Note the use of lots of splbio here, trying to avoid splbio for long chains + * of I/O. Probably unnecessary. 
+ */ +restart3: + s = splbio(); + while (TAILQ_FIRST(&ki->kaio_bufqueue)) { + ki->kaio_flags |= KAIO_WAKEUP; + tsleep(p, PRIBIO, "aioprn", 0); + splx(s); + goto restart3; + } + splx(s); + +restart4: + s = splbio(); + for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) { + splx(s); + goto restart4; + } + } + splx(s); + + /* + * If we've slept, jobs might have moved from one queue to another. + * Retry rundown if we didn't manage to empty the queues. + */ + if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL || + TAILQ_FIRST(&ki->kaio_jobqueue) != NULL || + TAILQ_FIRST(&ki->kaio_bufqueue) != NULL || + TAILQ_FIRST(&ki->kaio_bufdone) != NULL) + goto restart1; + + for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { + ljn = TAILQ_NEXT(lj, lioj_list); + if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == + 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + uma_zfree(aiolio_zone, lj); + } else { +#ifdef DIAGNOSTIC + printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " + "QF:%d\n", lj->lioj_buffer_count, + lj->lioj_buffer_finished_count, + lj->lioj_queue_count, + lj->lioj_queue_finished_count); +#endif + } + } + + uma_zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon). + */ +static struct aiocblist * +aio_selectjob(struct aiothreadlist *aiop) +{ + int s; + struct aiocblist *aiocbe; + struct kaioinfo *ki; + struct proc *userp; + + s = splnet(); + for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = + TAILQ_NEXT(aiocbe, list)) { + userp = aiocbe->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < ki->kaio_maxactive_count) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + splx(s); + return aiocbe; + } + } + splx(s); + + return NULL; +} + +/* + * The AIO processing activity. This is the code that does the I/O request for + * the non-physio version of the operations. The normal vn operations are used, + * and this code should work in all instances for every type of file, including + * pipes, sockets, fifos, and regular files. + */ +static void +aio_process(struct aiocblist *aiocbe) +{ + struct thread *td; + struct proc *mycp; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + int cnt; + int error; + int oublock_st, oublock_end; + int inblock_st, inblock_end; + + td = curthread; + mycp = td->td_proc; + cb = &aiocbe->uaiocb; + fp = aiocbe->fd_file; + + aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + + inblock_st = mycp->p_stats->p_ru.ru_inblock; + oublock_st = mycp->p_stats->p_ru.ru_oublock; + /* + * _aio_aqueue() acquires a reference to the file that is + * released in aio_free_entry(). 
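+ *
+ * Editor's note (not part of the original commit): FOF_OFFSET below asks
+ * fo_read()/fo_write() to use auio.uio_offset (the caller-supplied
+ * aio_offset) rather than the descriptor's shared file offset, so several
+ * outstanding AIO requests on one descriptor do not advance each other's
+ * position.  That is the editor's reading of the fo_*() contract, offered
+ * only as orientation.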
+ */ + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } else { + auio.uio_rw = UIO_WRITE; + error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } + inblock_end = mycp->p_stats->p_ru.ru_inblock; + oublock_end = mycp->p_stats->p_ru.ru_oublock; + + aiocbe->inputcharge = inblock_end - inblock_st; + aiocbe->outputcharge = oublock_end - oublock_st; + + if ((error) && (auio.uio_resid != cnt)) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { + PROC_LOCK(aiocbe->userproc); + psignal(aiocbe->userproc, SIGPIPE); + PROC_UNLOCK(aiocbe->userproc); + } + } + + cnt -= auio.uio_resid; + cb->_aiocb_private.error = error; + cb->_aiocb_private.status = cnt; +} + +/* + * The AIO daemon, most of the actual work is done in aio_process, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *uproc) +{ + int s; + struct aio_liojob *lj; + struct aiocb *cb; + struct aiocblist *aiocbe; + struct aiothreadlist *aiop; + struct kaioinfo *ki; + struct proc *curcp, *mycp, *userp; + struct vmspace *myvm, *tmpvm; + struct thread *td = curthread; + struct pgrp *newpgrp; + struct session *newsess; + + mtx_lock(&Giant); + /* + * Local copies of curproc (cp) and vmspace (myvm) + */ + mycp = td->td_proc; + myvm = mycp->p_vmspace; + + if (mycp->p_textvp) { + vrele(mycp->p_textvp); + mycp->p_textvp = NULL; + } + + /* + * Allocate and ready the aio control info. There is one aiop structure + * per daemon. + */ + aiop = uma_zalloc(aiop_zone, M_WAITOK); + aiop->aiothread = td; + aiop->aiothreadflags |= AIOP_FREE; + + s = splnet(); + + /* + * Place thread (lightweight process) onto the AIO free thread list. + */ + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + + splx(s); + + /* + * Get rid of our current filedescriptors. AIOD's don't need any + * filedescriptors, except as temporarily inherited from the client. + */ + fdfree(td); + mycp->p_fd = NULL; + + mtx_unlock(&Giant); + /* The daemon resides in its own pgrp. */ + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, + M_WAITOK | M_ZERO); + MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, + M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + enterpgrp(mycp, mycp->p_pid, newpgrp, newsess); + sx_xunlock(&proctree_lock); + mtx_lock(&Giant); + + /* Mark special process type. */ + mycp->p_flag |= P_SYSTEM; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * and creating too many daemons.) + */ + wakeup(mycp); + + for (;;) { + /* + * curcp is the current daemon process context. + * userp is the current user process context. + */ + curcp = mycp; + + /* + * Take daemon off of free queue + */ + if (aiop->aiothreadflags & AIOP_FREE) { + s = splnet(); + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + splx(s); + } + aiop->aiothreadflags &= ~AIOP_SCHED; + + /* + * Check for jobs. + */ + while ((aiocbe = aio_selectjob(aiop)) != NULL) { + cb = &aiocbe->uaiocb; + userp = aiocbe->userproc; + + aiocbe->jobstate = JOBST_JOBRUNNING; + + /* + * Connect to process address space for user program. + */ + if (userp != curcp) { + /* + * Save the current address space that we are + * connected to. 
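+ *
+ * Editor's note (not part of the original commit): the code below swaps
+ * address spaces in four steps: remember the vmspace currently being
+ * borrowed (tmpvm), point p_vmspace at the client's vmspace and take a
+ * reference on it, pmap_activate() to load the new mappings, and finally
+ * drop the reference on the old vmspace unless it was the daemon's own
+ * (myvm).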
+ */ + tmpvm = mycp->p_vmspace; + + /* + * Point to the new user address space, and + * refer to it. + */ + mycp->p_vmspace = userp->p_vmspace; + mycp->p_vmspace->vm_refcnt++; + + /* Activate the new mapping. */ + pmap_activate(FIRST_THREAD_IN_PROC(mycp)); + + /* + * If the old address space wasn't the daemons + * own address space, then we need to remove the + * daemon's reference from the other process + * that it was acting on behalf of. + */ + if (tmpvm != myvm) { + vmspace_free(tmpvm); + } + curcp = userp; + } + + ki = userp->p_aioinfo; + lj = aiocbe->lio; + + /* Account for currently active jobs. */ + ki->kaio_active_count++; + + /* Do the I/O function. */ + aio_process(aiocbe); + + /* Decrement the active job count. */ + ki->kaio_active_count--; + + /* + * Increment the completion count for wakeup/signal + * comparisons. + */ + aiocbe->jobflags |= AIOCBLIST_DONE; + ki->kaio_queue_finished_count++; + if (lj) + lj->lioj_queue_finished_count++; + if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags + & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(userp); + } + + s = splbio(); + if (lj && (lj->lioj_flags & + (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { + if ((lj->lioj_queue_finished_count == + lj->lioj_queue_count) && + (lj->lioj_buffer_finished_count == + lj->lioj_buffer_count)) { + PROC_LOCK(userp); + psignal(userp, + lj->lioj_signal.sigev_signo); + PROC_UNLOCK(userp); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + splx(s); + + aiocbe->jobstate = JOBST_JOBFINISHED; + + /* + * If the I/O request should be automatically rundown, + * do the needed cleanup. Otherwise, place the queue + * entry for the just finished I/O request into the done + * queue for the associated client. + */ + s = splnet(); + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + uma_zfree(aiocb_zone, aiocbe); + } else { + TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, + plist); + } + splx(s); + KNOTE(&aiocbe->klist, 0); + + if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { + wakeup(aiocbe); + aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + PROC_LOCK(userp); + psignal(userp, cb->aio_sigevent.sigev_signo); + PROC_UNLOCK(userp); + } + } + + /* + * Disconnect from user address space. + */ + if (curcp != mycp) { + /* Get the user address space to disconnect from. */ + tmpvm = mycp->p_vmspace; + + /* Get original address space for daemon. */ + mycp->p_vmspace = myvm; + + /* Activate the daemon's address space. */ + pmap_activate(FIRST_THREAD_IN_PROC(mycp)); +#ifdef DIAGNOSTIC + if (tmpvm == myvm) { + printf("AIOD: vmspace problem -- %d\n", + mycp->p_pid); + } +#endif + /* Remove our vmspace reference. */ + vmspace_free(tmpvm); + + curcp = mycp; + } + + /* + * If we are the first to be put onto the free queue, wakeup + * anyone waiting for a daemon. + */ + s = splnet(); + TAILQ_REMOVE(&aio_activeproc, aiop, list); + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aiothreadflags |= AIOP_FREE; + splx(s); + + /* + * If daemon is inactive for a long time, allow it to exit, + * thereby freeing resources. 
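+ *
+ * Editor's note (not part of the original commit): "a long time" here is
+ * the aiod_lifetime tsleep() timeout below; an idle daemon only exits if
+ * the global job queue is empty and we are still above target_aio_procs,
+ * so a small pool of daemons stays resident for the next burst of requests.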
+ */ + if ((aiop->aiothreadflags & AIOP_SCHED) == 0 && + tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) { + s = splnet(); + if (TAILQ_EMPTY(&aio_jobs)) { + if ((aiop->aiothreadflags & AIOP_FREE) && + (num_aio_procs > target_aio_procs)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + splx(s); + uma_zfree(aiop_zone, aiop); + num_aio_procs--; +#ifdef DIAGNOSTIC + if (mycp->p_vmspace->vm_refcnt <= 1) { + printf("AIOD: bad vm refcnt for" + " exiting daemon: %d\n", + mycp->p_vmspace->vm_refcnt); + } +#endif + kthread_exit(0); + } + } + splx(s); + } + } +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The + * AIO daemon modifies its environment itself. + */ +static int +aio_newproc() +{ + int error; + struct proc *p; + + error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d", + num_aio_procs); + if (error) + return error; + + /* + * Wait until daemon is started, but continue on just in case to + * handle error conditions. + */ + error = tsleep(p, PZERO, "aiosta", aiod_timeout); + + num_aio_procs++; + + return error; +} + +/* + * Try the high-performance, low-overhead physio method for eligible + * VCHR devices. This method doesn't use an aio helper thread, and + * thus has very low overhead. + * + * Assumes that the caller, _aio_aqueue(), has incremented the file + * structure's reference count, preventing its deallocation for the + * duration of this call. + */ +static int +aio_qphysio(struct proc *p, struct aiocblist *aiocbe) +{ + int error; + struct aiocb *cb; + struct file *fp; + struct buf *bp; + struct vnode *vp; + struct kaioinfo *ki; + struct aio_liojob *lj; + int s; + int notify; + + cb = &aiocbe->uaiocb; + fp = aiocbe->fd_file; + + if (fp->f_type != DTYPE_VNODE) + return (-1); + + vp = (struct vnode *)fp->f_data; + + /* + * If its not a disk, we don't want to return a positive error. + * It causes the aio code to not fall through to try the thread + * way when you're talking to a regular file. + */ + if (!vn_isdisk(vp, &error)) { + if (error == ENOTBLK) + return (-1); + else + return (error); + } + + if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys) + return (-1); + + if (cb->aio_nbytes > + MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) + return (-1); + + ki = p->p_aioinfo; + if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) + return (-1); + + ki->kaio_buffer_count++; + + lj = aiocbe->lio; + if (lj) + lj->lioj_buffer_count++; + + /* Create and build a buffer header for a transfer. */ + bp = (struct buf *)getpbuf(NULL); + BUF_KERNPROC(bp); + + /* + * Get a copy of the kva from the physical buffer. + */ + bp->b_caller1 = p; + bp->b_dev = vp->v_rdev; + error = bp->b_error = 0; + + bp->b_bcount = cb->aio_nbytes; + bp->b_bufsize = cb->aio_nbytes; + bp->b_flags = B_PHYS; + bp->b_iodone = aio_physwakeup; + bp->b_saveaddr = bp->b_data; + bp->b_data = (void *)(uintptr_t)cb->aio_buf; + bp->b_blkno = btodb(cb->aio_offset); + + if (cb->aio_lio_opcode == LIO_WRITE) { + bp->b_iocmd = BIO_WRITE; + if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) { + error = EFAULT; + goto doerror; + } + } else { + bp->b_iocmd = BIO_READ; + if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) { + error = EFAULT; + goto doerror; + } + } + + /* Bring buffer into kernel space. 
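+ * Editor's note (not part of the original commit): vmapbuf() wires the
+ * user pages backing cb->aio_buf (already probed with useracc() above) and
+ * maps them into the pbuf's kernel window so the driver can perform the
+ * transfer directly; vunmapbuf() in aio_fphysio()/aio_free_entry() undoes
+ * the mapping when the request is reaped.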
*/ + vmapbuf(bp); + + s = splbio(); + aiocbe->bp = bp; + bp->b_spc = (void *)aiocbe; + TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); + aiocbe->jobstate = JOBST_JOBQBUF; + cb->_aiocb_private.status = cb->aio_nbytes; + num_buf_aio++; + bp->b_error = 0; + + splx(s); + + /* Perform transfer. */ + DEV_STRATEGY(bp, 0); + + notify = 0; + s = splbio(); + + /* + * If we had an error invoking the request, or an error in processing + * the request before we have returned, we process it as an error in + * transfer. Note that such an I/O error is not indicated immediately, + * but is returned using the aio_error mechanism. In this case, + * aio_suspend will return immediately. + */ + if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { + struct aiocb *job = aiocbe->uuaiocb; + + aiocbe->uaiocb._aiocb_private.status = 0; + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + suword(&job->_aiocb_private.error, bp->b_error); + + ki->kaio_buffer_finished_count++; + + if (aiocbe->jobstate != JOBST_JOBBFINISHED) { + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->jobflags |= AIOCBLIST_DONE; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + notify = 1; + } + } + splx(s); + if (notify) + KNOTE(&aiocbe->klist, 0); + return 0; + +doerror: + ki->kaio_buffer_count--; + if (lj) + lj->lioj_buffer_count--; + aiocbe->bp = NULL; + relpbuf(bp, NULL); + return error; +} + +/* + * This waits/tests physio completion. + */ +static int +aio_fphysio(struct aiocblist *iocb) +{ + int s; + struct buf *bp; + int error; + + bp = iocb->bp; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } else + break; + } + } + splx(s); + + /* Release mapping into kernel space. */ + vunmapbuf(bp); + iocb->bp = 0; + + error = 0; + + /* Check for an error. */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + relpbuf(bp, NULL); + return (error); +} + +/* + * Wake up aio requests that may be serviceable now. + */ +static void +aio_swake_cb(struct socket *so, struct sockbuf *sb) +{ + struct aiocblist *cb,*cbn; + struct proc *p; + struct kaioinfo *ki = NULL; + int opcode, wakecount = 0; + struct aiothreadlist *aiop; + + if (sb == &so->so_snd) { + opcode = LIO_WRITE; + so->so_snd.sb_flags &= ~SB_AIO; + } else { + opcode = LIO_READ; + so->so_rcv.sb_flags &= ~SB_AIO; + } + + for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { + cbn = TAILQ_NEXT(cb, list); + if (opcode == cb->uaiocb.aio_lio_opcode) { + p = cb->userproc; + ki = p->p_aioinfo; + TAILQ_REMOVE(&so->so_aiojobq, cb, list); + TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); + TAILQ_INSERT_TAIL(&aio_jobs, cb, list); + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); + wakecount++; + if (cb->jobstate != JOBST_JOBQGLOBAL) + panic("invalid queue value"); + } + } + + while (wakecount--) { + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + wakeup(aiop->aiothread); + } + } +} + +/* + * Queue a new AIO request. Choosing either the threaded or direct physio VCHR + * technique is done in this code. 
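+ *
+ * Editor's overview (not part of the original commit): after copying in
+ * and validating the user's aiocb and optionally registering a kqueue
+ * event, the request is routed one of three ways: a socket that is not
+ * yet readable/writable is parked on so_aiojobq and kaio_sockqueue until
+ * aio_swake_cb() moves it back; a request on a raw VCHR disk is attempted
+ * via aio_qphysio(); everything else goes on the per-process and global
+ * job queues to be picked up by an aiod, created with aio_newproc() if
+ * none is free.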
+ */ +static int +_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp; + struct file *fp; + unsigned int fd; + struct socket *so; + int s; + int error; + int opcode; + struct aiocblist *aiocbe; + struct aiothreadlist *aiop; + struct kaioinfo *ki; + struct kevent kev; + struct kqueue *kq; + struct file *kq_fp; + + aiocbe = uma_zalloc(aiocb_zone, M_WAITOK); + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + callout_handle_init(&aiocbe->timeouthandle); + SLIST_INIT(&aiocbe->klist); + + suword(&job->_aiocb_private.status, -1); + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.kernelinfo, -1); + + error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); + if (error) { + suword(&job->_aiocb_private.error, error); + uma_zfree(aiocb_zone, aiocbe); + return error; + } + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && + !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { + uma_zfree(aiocb_zone, aiocbe); + return EINVAL; + } + + /* Save userspace address of the job info. */ + aiocbe->uuaiocb = job; + + /* Get the opcode. */ + if (type != LIO_NOP) + aiocbe->uaiocb.aio_lio_opcode = type; + opcode = aiocbe->uaiocb.aio_lio_opcode; + + /* Get the fd info for process. */ + fdp = p->p_fd; + + /* + * Range check file descriptor. + */ + fd = aiocbe->uaiocb.aio_fildes; + if (fd >= fdp->fd_nfiles) { + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, EBADF); + return EBADF; + } + + fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == + 0))) { + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, EBADF); + return EBADF; + } + fhold(fp); + + if (aiocbe->uaiocb.aio_offset == -1LL) { + error = EINVAL; + goto aqueue_fail; + } + error = suword(&job->_aiocb_private.kernelinfo, jobrefid); + if (error) { + error = EINVAL; + goto aqueue_fail; + } + aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; + if (jobrefid == LONG_MAX) + jobrefid = 1; + else + jobrefid++; + + if (opcode == LIO_NOP) { + fdrop(fp, td); + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) { + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.kernelinfo, 0); + } + return 0; + } + if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { + if (type == 0) + suword(&job->_aiocb_private.status, 0); + error = EINVAL; + goto aqueue_fail; + } + + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { + kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; + kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr; + } + else { + /* + * This method for requesting kevent-based notification won't + * work on the alpha, since we're passing in a pointer + * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- + * based method instead. 
+ */ + struct kevent *kevp; + + kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode; + if (kevp == NULL) + goto no_kqueue; + + error = copyin(kevp, &kev, sizeof(kev)); + if (error) + goto aqueue_fail; + } + if ((u_int)kev.ident >= fdp->fd_nfiles || + (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL || + (kq_fp->f_type != DTYPE_KQUEUE)) { + error = EBADF; + goto aqueue_fail; + } + kq = (struct kqueue *)kq_fp->f_data; + kev.ident = (uintptr_t)aiocbe; + kev.filter = EVFILT_AIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; + error = kqueue_register(kq, &kev, td); +aqueue_fail: + if (error) { + fdrop(fp, td); + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, error); + goto done; + } +no_kqueue: + + suword(&job->_aiocb_private.error, EINPROGRESS); + aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; + aiocbe->userproc = p; + aiocbe->jobflags = 0; + aiocbe->lio = lj; + ki = p->p_aioinfo; + + if (fp->f_type == DTYPE_SOCKET) { + /* + * Alternate queueing for socket ops: Reach down into the + * descriptor to get the socket data. Then check to see if the + * socket is ready to be read or written (based on the requested + * operation). + * + * If it is not ready for io, then queue the aiocbe on the + * socket, and set the flags so we get a call when sbnotify() + * happens. + */ + so = (struct socket *)fp->f_data; + s = splnet(); + if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == + LIO_WRITE) && (!sowriteable(so)))) { + TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); + if (opcode == LIO_READ) + so->so_rcv.sb_flags |= SB_AIO; + else + so->so_snd.sb_flags |= SB_AIO; + aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ + ki->kaio_queue_count++; + num_queue_count++; + splx(s); + error = 0; + goto done; + } + splx(s); + } + + if ((error = aio_qphysio(p, aiocbe)) == 0) + goto done; + if (error > 0) { + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = error; + suword(&job->_aiocb_private.error, error); + goto done; + } + + /* No buffer for daemon I/O. */ + aiocbe->bp = NULL; + + ki->kaio_queue_count++; + if (lj) + lj->lioj_queue_count++; + s = splnet(); + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); + splx(s); + aiocbe->jobstate = JOBST_JOBQGLOBAL; + + num_queue_count++; + error = 0; + + /* + * If we don't have a free AIO process, and we are below our quota, then + * start one. Otherwise, depend on the subsequent I/O completions to + * pick-up this job. If we don't sucessfully create the new process + * (thread) due to resource issues, we return an error for now (EAGAIN), + * which is likely not the correct thing to do. + */ + s = splnet(); +retryproc: + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + wakeup(aiop->aiothread); + } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && + ((ki->kaio_active_count + num_aio_resv_start) < + ki->kaio_maxactive_count)) { + num_aio_resv_start++; + if ((error = aio_newproc()) == 0) { + num_aio_resv_start--; + goto retryproc; + } + num_aio_resv_start--; + } + splx(s); +done: + return error; +} + +/* + * This routine queues an AIO request, checking for quotas. 
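+ *
+ * Editor's illustration (not part of the original commit): the userland
+ * view of this path, assuming a descriptor fd and a buffer buf, is
+ * roughly
+ *
+ *	struct aiocb cb;
+ *
+ *	bzero(&cb, sizeof(cb));
+ *	cb.aio_fildes = fd;
+ *	cb.aio_buf = buf;
+ *	cb.aio_nbytes = sizeof(buf);
+ *	cb.aio_offset = 0;
+ *	if (aio_read(&cb) == -1)		(enters here with LIO_READ)
+ *		err(1, "aio_read");
+ *	while (aio_error(&cb) == EINPROGRESS)
+ *		usleep(1000);
+ *	n = aio_return(&cb);			(reaps status, frees the job)
+ *
+ * The quota checks below (max_queue_count system-wide and
+ * kaio_qallowed_count per process) are what turn an over-committed
+ * aio_read()/aio_write() into EAGAIN.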
+ */ +static int +aio_aqueue(struct thread *td, struct aiocb *job, int type) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + if (num_queue_count >= max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if (ki->kaio_queue_count >= ki->kaio_qallowed_count) + return EAGAIN; + + return _aio_aqueue(td, job, NULL, type); +} + +/* + * Support the aio_return system call, as a side-effect, kernel resources are + * released. + */ +int +aio_return(struct thread *td, struct aio_return_args *uap) +{ + struct proc *p = td->td_proc; + int s; + long jobref; + struct aiocblist *cb, *ncb; + struct aiocb *ujob; + struct kaioinfo *ki; + + ujob = uap->aiocbp; + jobref = fuword(&ujob->_aiocb_private.kernelinfo); + if (jobref == -1 || jobref == 0) + return EINVAL; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + p->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + goto done; + } + } + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { + ncb = TAILQ_NEXT(cb, plist); + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + break; + } + } + splx(s); + done: + if (cb != NULL) { + if (ujob == cb->uuaiocb) { + td->td_retval[0] = + cb->uaiocb._aiocb_private.status; + } else + td->td_retval[0] = EFAULT; + aio_free_entry(cb); + return (0); + } + return (EINVAL); +} + +/* + * Allow a process to wakeup when any of the I/O requests are completed. + */ +int +aio_suspend(struct thread *td, struct aio_suspend_args *uap) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct timespec ts; + struct aiocb *const *cbptr, *cbp; + struct kaioinfo *ki; + struct aiocblist *cb; + int i; + int njoblist; + int error, s, timo; + long *ijoblist; + struct aiocb **ujoblist; + + if (uap->nent > AIO_LISTIO_MAX) + return EINVAL; + + timo = 0; + if (uap->timeout) { + /* Get timespec struct. 
*/ + if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) + return error; + + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + njoblist = 0; + ijoblist = uma_zalloc(aiol_zone, M_WAITOK); + ujoblist = uma_zalloc(aiol_zone, M_WAITOK); + cbptr = uap->aiocbp; + + for (i = 0; i < uap->nent; i++) { + cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); + if (cbp == 0) + continue; + ujoblist[njoblist] = cbp; + ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); + njoblist++; + } + + if (njoblist == 0) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return 0; + } + + error = 0; + for (;;) { + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + for (i = 0; i < njoblist; i++) { + if (((intptr_t) + cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return error; + } + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = + TAILQ_NEXT(cb, plist)) { + for (i = 0; i < njoblist; i++) { + if (((intptr_t) + cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + splx(s); + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return error; + } + } + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); + splx(s); + + if (error == ERESTART || error == EINTR) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return EINTR; + } else if (error == EWOULDBLOCK) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return EAGAIN; + } + } + +/* NOTREACHED */ + return EINVAL; +} + +/* + * aio_cancel cancels any non-physio aio operations not currently in + * progress. + */ +int +aio_cancel(struct thread *td, struct aio_cancel_args *uap) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + struct aiocblist *cbe, *cbn; + struct file *fp; + struct filedesc *fdp; + struct socket *so; + struct proc *po; + int s,error; + int cancelled=0; + int notcancelled=0; + struct vnode *vp; + + fdp = p->p_fd; + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + + if (vn_isdisk(vp,&error)) { + td->td_retval[0] = AIO_NOTCANCELED; + return 0; + } + } else if (fp->f_type == DTYPE_SOCKET) { + so = (struct socket *)fp->f_data; + + s = splnet(); + + for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { + cbn = TAILQ_NEXT(cbe, list); + if ((uap->aiocbp == NULL) || + (uap->aiocbp == cbe->uuaiocb) ) { + po = cbe->userproc; + ki = po->p_aioinfo; + TAILQ_REMOVE(&so->so_aiojobq, cbe, list); + TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); + if (ki->kaio_flags & KAIO_WAKEUP) { + wakeup(po); + } + cbe->jobstate = JOBST_JOBFINISHED; + cbe->uaiocb._aiocb_private.status=-1; + cbe->uaiocb._aiocb_private.error=ECANCELED; + cancelled++; +/* XXX cancelled, knote? 
*/ + if (cbe->uaiocb.aio_sigevent.sigev_notify == + SIGEV_SIGNAL) { + PROC_LOCK(cbe->userproc); + psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); + PROC_UNLOCK(cbe->userproc); + } + if (uap->aiocbp) + break; + } + } + splx(s); + + if ((cancelled) && (uap->aiocbp)) { + td->td_retval[0] = AIO_CANCELED; + return 0; + } + } + ki=p->p_aioinfo; + s = splnet(); + + for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { + cbn = TAILQ_NEXT(cbe, plist); + + if ((uap->fd == cbe->uaiocb.aio_fildes) && + ((uap->aiocbp == NULL ) || + (uap->aiocbp == cbe->uuaiocb))) { + + if (cbe->jobstate == JOBST_JOBQGLOBAL) { + TAILQ_REMOVE(&aio_jobs, cbe, list); + TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, + plist); + cancelled++; + ki->kaio_queue_finished_count++; + cbe->jobstate = JOBST_JOBFINISHED; + cbe->uaiocb._aiocb_private.status = -1; + cbe->uaiocb._aiocb_private.error = ECANCELED; +/* XXX cancelled, knote? */ + if (cbe->uaiocb.aio_sigevent.sigev_notify == + SIGEV_SIGNAL) { + PROC_LOCK(cbe->userproc); + psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); + PROC_UNLOCK(cbe->userproc); + } + } else { + notcancelled++; + } + } + } + splx(s); + + if (notcancelled) { + td->td_retval[0] = AIO_NOTCANCELED; + return 0; + } + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return 0; + } + td->td_retval[0] = AIO_ALLDONE; + + return 0; +} + +/* + * aio_error is implemented in the kernel level for compatibility purposes only. + * For a user mode async implementation, it would be best to do it in a userland + * subroutine. + */ +int +aio_error(struct thread *td, struct aio_error_args *uap) +{ + struct proc *p = td->td_proc; + int s; + struct aiocblist *cb; + struct kaioinfo *ki; + long jobref; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + + jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); + if ((jobref == -1) || (jobref == 0)) + return EINVAL; + + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = cb->uaiocb._aiocb_private.error; + return 0; + } + } + + s = splnet(); + + for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = cb->uaiocb._aiocb_private.error; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + +#if (0) + /* + * Hack for lio. 
+ */ + status = fuword(&uap->aiocbp->_aiocb_private.status); + if (status == -1) + return fuword(&uap->aiocbp->_aiocb_private.error); +#endif + return EINVAL; +} + +/* syscall - asynchronous read from a file (REALTIME) */ +int +aio_read(struct thread *td, struct aio_read_args *uap) +{ + + return aio_aqueue(td, uap->aiocbp, LIO_READ); +} + +/* syscall - asynchronous write to a file (REALTIME) */ +int +aio_write(struct thread *td, struct aio_write_args *uap) +{ + + return aio_aqueue(td, uap->aiocbp, LIO_WRITE); +} + +/* syscall - XXX undocumented */ +int +lio_listio(struct thread *td, struct lio_listio_args *uap) +{ + struct proc *p = td->td_proc; + int nent, nentqueued; + struct aiocb *iocb, * const *cbptr; + struct aiocblist *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + int error, runningcode; + int nerror; + int i; + int s; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return EINVAL; + + nent = uap->nent; + if (nent > AIO_LISTIO_MAX) + return EINVAL; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + if ((nent + num_queue_count) > max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) + return EAGAIN; + + lj = uma_zalloc(aiolio_zone, M_WAITOK); + if (!lj) + return EAGAIN; + + lj->lioj_flags = 0; + lj->lioj_buffer_count = 0; + lj->lioj_buffer_finished_count = 0; + lj->lioj_queue_count = 0; + lj->lioj_queue_finished_count = 0; + lj->lioj_ki = ki; + + /* + * Setup signal. + */ + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &lj->lioj_signal, + sizeof(lj->lioj_signal)); + if (error) { + uma_zfree(aiolio_zone, lj); + return error; + } + if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + lj->lioj_flags |= LIOJ_SIGNAL; + lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; + } else + lj->lioj_flags &= ~LIOJ_SIGNAL; + + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + /* + * Get pointers to the list of I/O requests. + */ + nerror = 0; + nentqueued = 0; + cbptr = uap->acb_list; + for (i = 0; i < uap->nent; i++) { + iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); + if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { + error = _aio_aqueue(td, iocb, lj, 0); + if (error == 0) + nentqueued++; + else + nerror++; + } + } + + /* + * If we haven't queued any, then just return error. + */ + if (nentqueued == 0) + return 0; + + /* + * Calculate the appropriate error return. + */ + runningcode = 0; + if (nerror) + runningcode = EIO; + + if (uap->mode == LIO_WAIT) { + int command, found, jobref; + + for (;;) { + found = 0; + for (i = 0; i < uap->nent; i++) { + /* + * Fetch address of the control buf pointer in + * user space. + */ + iocb = (struct aiocb *) + (intptr_t)fuword(&cbptr[i]); + if (((intptr_t)iocb == -1) || ((intptr_t)iocb + == 0)) + continue; + + /* + * Fetch the associated command from user space. 
+ */ + command = fuword(&iocb->aio_lio_opcode); + if (command == LIO_NOP) { + found++; + continue; + } + + jobref = fuword(&iocb->_aiocb_private.kernelinfo); + + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + if (cb->uaiocb.aio_lio_opcode + == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock + += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode + == LIO_READ) { + p->p_stats->p_ru.ru_inblock + += cb->inputcharge; + cb->inputcharge = 0; + } + found++; + break; + } + } + + s = splbio(); + TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + found++; + break; + } + } + splx(s); + } + + /* + * If all I/Os have been disposed of, then we can + * return. + */ + if (found == nentqueued) + return runningcode; + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); + + if (error == EINTR) + return EINTR; + else if (error == EWOULDBLOCK) + return EAGAIN; + } + } + + return runningcode; +} + +/* + * This is a weird hack so that we can post a signal. It is safe to do so from + * a timeout routine, but *not* from an interrupt routine. + */ +static void +process_signal(void *aioj) +{ + struct aiocblist *aiocbe = aioj; + struct aio_liojob *lj = aiocbe->lio; + struct aiocb *cb = &aiocbe->uaiocb; + + if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && + (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { + PROC_LOCK(lj->lioj_ki->kaio_p); + psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); + PROC_UNLOCK(lj->lioj_ki->kaio_p); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + PROC_LOCK(aiocbe->userproc); + psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); + PROC_UNLOCK(aiocbe->userproc); + } +} + +/* + * Interrupt handler for physio, performs the necessary process wakeups, and + * signals. + */ +static void +aio_physwakeup(struct buf *bp) +{ + struct aiocblist *aiocbe; + struct proc *p; + struct kaioinfo *ki; + struct aio_liojob *lj; + + wakeup(bp); + + aiocbe = (struct aiocblist *)bp->b_spc; + if (aiocbe) { + p = bp->b_caller1; + + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; + aiocbe->uaiocb._aiocb_private.error = 0; + aiocbe->jobflags |= AIOCBLIST_DONE; + + if (bp->b_ioflags & BIO_ERROR) + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_finished_count++; + + /* + * wakeup/signal if all of the interrupt jobs are done. + */ + if (lj->lioj_buffer_finished_count == + lj->lioj_buffer_count) { + /* + * Post a signal if it is called for. + */ + if ((lj->lioj_flags & + (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + aiocbe->timeouthandle = + timeout(process_signal, + aiocbe, 0); + } + } + } + + ki = p->p_aioinfo; + if (ki) { + ki->kaio_buffer_finished_count++; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + + KNOTE(&aiocbe->klist, 0); + /* Do the wakeup. 
*/ + if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + } + + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) + aiocbe->timeouthandle = + timeout(process_signal, aiocbe, 0); + } +} + +/* syscall - wait for the next completion of an aio request */ +int +aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct timespec ts; + struct kaioinfo *ki; + struct aiocblist *cb = NULL; + int error, s, timo; + + suword(uap->aiocbp, (int)NULL); + + timo = 0; + if (uap->timeout) { + /* Get timespec struct. */ + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + return error; + + if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + for (;;) { + if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { + suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); + td->td_retval[0] = cb->uaiocb._aiocb_private.status; + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + p->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + aio_free_entry(cb); + return cb->uaiocb._aiocb_private.error; + } + + s = splbio(); + if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { + splx(s); + suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); + td->td_retval[0] = cb->uaiocb._aiocb_private.status; + aio_free_entry(cb); + return cb->uaiocb._aiocb_private.error; + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); + splx(s); + + if (error == ERESTART) + return EINTR; + else if (error < 0) + return error; + else if (error == EINTR) + return EINTR; + else if (error == EWOULDBLOCK) + return EAGAIN; + } +} + +/* kqueue attach function */ +static int +filt_aioattach(struct knote *kn) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + /* + * The aiocbe pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. + */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_flags &= ~EV_FLAG1; + + SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); + + return (0); +} + +/* kqueue detach function */ +static void +filt_aiodetach(struct knote *kn) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_aio(struct knote *kn, long hint) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + kn->kn_data = aiocbe->uaiocb._aiocb_private.error; + if (aiocbe->jobstate != JOBST_JOBFINISHED && + aiocbe->jobstate != JOBST_JOBBFINISHED) + return (0); + kn->kn_flags |= EV_EOF; + return (1); +} diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c new file mode 100644 index 0000000..30dc753 --- /dev/null +++ b/sys/kern/vfs_bio.c @@ -0,0 +1,3395 @@ +/* + * Copyright (c) 1994,1997 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * + * $FreeBSD$ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/eventhandler.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> + +static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +struct buf_ops buf_ops_bio = { + "buf_ops_bio", + bwrite +}; + +/* + * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has + * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 
+ */ +struct buf *buf; /* buffer header pool */ +struct mtx buftimelock; /* Interlock on setting prio and timo */ + +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, + int pageno, vm_page_t m); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); +static void vfs_backgroundwritedone(struct buf *bp); +static int flushbufqueues(void); +static void buf_daemon(void); + +int vmiodirenable = TRUE; +SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, + "Use the VM system for directory writes"); +int runningbufspace; +SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, + "Amount of presently outstanding async buffer io"); +static int bufspace; +SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, + "KVA memory used for bufs"); +static int maxbufspace; +SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, + "Maximum allowed value of bufspace (including buf_daemon)"); +static int bufmallocspace; +SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, + "Amount of malloced memory for buffers"); +static int maxbufmallocspace; +SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, + "Maximum amount of malloced memory for buffers"); +static int lobufspace; +SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, + "Minimum amount of buffers we want to have"); +static int hibufspace; +SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, + "Maximum allowed value of bufspace (excluding buf_daemon)"); +static int bufreusecnt; +SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, + "Number of times we have reused a buffer"); +static int buffreekvacnt; +SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, + "Number of times we have freed the KVA space from some buffer"); +static int bufdefragcnt; +SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, + "Number of times we have had to repeat buffer allocation to defragment"); +static int lorunningspace; +SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, + "Minimum preferred space used for in-progress I/O"); +static int hirunningspace; +SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, + "Maximum amount of space to use for in-progress I/O"); +static int numdirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, + "Number of buffers that are dirty (has unwritten changes) at the moment"); +static int lodirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, + "How many buffers we want to have free before bufdaemon can sleep"); +static int hidirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, + "When the number of dirty buffers is considered severe"); +static int numfreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, + "Number of free buffers"); +static int lofreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, + "XXX Unused"); +static int hifreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, + "XXX Complicatedly unused"); +static int 
getnewbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, + "Number of calls to getnewbuf"); +static int getnewbufrestarts; +SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, + "Number of times getnewbuf has had to restart a buffer aquisition"); +static int dobkgrdwrite = 1; +SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, + "Do background writes (honoring the BX_BKGRDWRITE flag)?"); + +/* + * Wakeup point for bufdaemon, as well as indicator of whether it is already + * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it + * is idling. + */ +static int bd_request; + +/* + * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. + */ +vm_page_t bogus_page; + +/* + * Offset for bogus_page. + * XXX bogus_offset should be local to bufinit + */ +static vm_offset_t bogus_offset; + +/* + * Synchronization (sleep/wakeup) variable for active buffer space requests. + * Set when wait starts, cleared prior to wakeup(). + * Used in runningbufwakeup() and waitrunningbufspace(). + */ +static int runningbufreq; + +/* + * Synchronization (sleep/wakeup) variable for buffer requests. + * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done + * by and/or. + * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), + * getnewbuf(), and getblk(). + */ +static int needsbuffer; + +/* + * Mask for index into the buffer hash table, which needs to be power of 2 in + * size. Set in kern_vfs_bio_buffer_alloc. + */ +static int bufhashmask; + +/* + * Hash table for all buffers, with a linked list hanging from each table + * entry. Set in kern_vfs_bio_buffer_alloc, initialized in buf_init. + */ +static LIST_HEAD(bufhashhdr, buf) *bufhashtbl; + +/* + * Somewhere to store buffers when they are not in another list, to always + * have them in a list (and thus being able to use the same set of operations + * on them.) + */ +static struct bufhashhdr invalhash; + +/* + * Definitions for the buffer free lists. + */ +#define BUFFER_QUEUES 6 /* number of free buffer queues */ + +#define QUEUE_NONE 0 /* on no queue */ +#define QUEUE_LOCKED 1 /* locked buffers */ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ + +/* Queues for free buffers with various properties */ +static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; +/* + * Single global constant for BUF_WMESG, to avoid getting multiple references. + * buf_wmesg is referred from macros. + */ +const char *buf_wmesg = BUF_WMESG; + +#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ +#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ +#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ +#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ + +/* + * Buffer hash table code. Note that the logical block scans linearly, which + * gives us some L1 cache locality. 
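+ *
+ * Editor's note (not part of the original commit): the index computed in
+ * bufhash() below is (((uintptr_t)vnp >> 7) + bn) & bufhashmask.  Shifting
+ * the vnode pointer right by 7 keeps its low, largely identical alignment
+ * bits from dominating the index, and adding the logical block number
+ * makes consecutive blocks of the same vnode land in consecutive chains,
+ * which is the L1 locality mentioned above.  bufhashmask is a power of
+ * two minus one, so the AND is a cheap modulo.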
+ */ + +static __inline +struct bufhashhdr * +bufhash(struct vnode *vnp, daddr_t bn) +{ + return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); +} + +/* + * numdirtywakeup: + * + * If someone is blocked due to there being too many dirty buffers, + * and numdirtybuffers is now reasonable, wake them up. + */ + +static __inline void +numdirtywakeup(int level) +{ + if (numdirtybuffers <= level) { + if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { + needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; + wakeup(&needsbuffer); + } + } +} + +/* + * bufspacewakeup: + * + * Called when buffer space is potentially available for recovery. + * getnewbuf() will block on this flag when it is unable to free + * sufficient buffer space. Buffer space becomes recoverable when + * bp's get placed back in the queues. + */ + +static __inline void +bufspacewakeup(void) +{ + /* + * If someone is waiting for BUF space, wake them up. Even + * though we haven't freed the kva space yet, the waiting + * process will be able to now. + */ + if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { + needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; + wakeup(&needsbuffer); + } +} + +/* + * runningbufwakeup() - in-progress I/O accounting. + * + */ +static __inline void +runningbufwakeup(struct buf *bp) +{ + if (bp->b_runningbufspace) { + runningbufspace -= bp->b_runningbufspace; + bp->b_runningbufspace = 0; + if (runningbufreq && runningbufspace <= lorunningspace) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + } +} + +/* + * bufcountwakeup: + * + * Called when a buffer has been added to one of the free queues to + * account for the buffer and to wakeup anyone waiting for free buffers. + * This typically occurs when large amounts of metadata are being handled + * by the buffer cache ( else buffer space runs out first, usually ). + */ + +static __inline void +bufcountwakeup(void) +{ + ++numfreebuffers; + if (needsbuffer) { + needsbuffer &= ~VFS_BIO_NEED_ANY; + if (numfreebuffers >= hifreebuffers) + needsbuffer &= ~VFS_BIO_NEED_FREE; + wakeup(&needsbuffer); + } +} + +/* + * waitrunningbufspace() + * + * runningbufspace is a measure of the amount of I/O currently + * running. This routine is used in async-write situations to + * prevent creating huge backups of pending writes to a device. + * Only asynchronous writes are governed by this function. + * + * Reads will adjust runningbufspace, but will not block based on it. + * The read load has a side effect of reducing the allowed write load. + * + * This does NOT turn an async write into a sync write. It waits + * for earlier writes to complete and generally returns before the + * caller's write has reached the device. + */ +static __inline void +waitrunningbufspace(void) +{ + /* + * XXX race against wakeup interrupt, currently + * protected by Giant. FIXME! + */ + while (runningbufspace > hirunningspace) { + ++runningbufreq; + tsleep(&runningbufreq, PVM, "wdrain", 0); + } +} + + +/* + * vfs_buf_test_cache: + * + * Called when a buffer is extended. This function clears the B_CACHE + * bit if the newly extended portion of the buffer does not contain + * valid data. 
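+ *
+ * Editor's example (not part of the original commit): if a 4K buffer that
+ * currently has B_CACHE set is grown to 8K and the page backing the new
+ * second half has no valid bits for that range, clearing B_CACHE here
+ * forces the caller to read or zero-fill the extension instead of
+ * trusting whatever stale data the larger mapping now exposes.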
+ */ +static __inline__ +void +vfs_buf_test_cache(struct buf *bp, + vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, + vm_page_t m) +{ + GIANT_REQUIRED; + + if (bp->b_flags & B_CACHE) { + int base = (foff + off) & PAGE_MASK; + if (vm_page_is_valid(m, base, size) == 0) + bp->b_flags &= ~B_CACHE; + } +} + +/* Wake up the buffer deamon if necessary */ +static __inline__ +void +bd_wakeup(int dirtybuflevel) +{ + if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { + bd_request = 1; + wakeup(&bd_request); + } +} + +/* + * bd_speedup - speedup the buffer cache flushing code + */ + +static __inline__ +void +bd_speedup(void) +{ + bd_wakeup(1); +} + +/* + * Calculating buffer cache scaling values and reserve space for buffer + * headers. This is called during low level kernel initialization and + * may be called more then once. We CANNOT write to the memory area + * being reserved at this time. + */ +caddr_t +kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est) +{ + /* + * physmem_est is in pages. Convert it to kilobytes (assumes + * PAGE_SIZE is >= 1K) + */ + physmem_est = physmem_est * (PAGE_SIZE / 1024); + + /* + * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. + * For the first 64MB of ram nominally allocate sufficient buffers to + * cover 1/4 of our ram. Beyond the first 64MB allocate additional + * buffers to cover 1/20 of our ram over 64MB. When auto-sizing + * the buffer cache we limit the eventual kva reservation to + * maxbcache bytes. + * + * factor represents the 1/4 x ram conversion. + */ + if (nbuf == 0) { + int factor = 4 * BKVASIZE / 1024; + + nbuf = 50; + if (physmem_est > 4096) + nbuf += min((physmem_est - 4096) / factor, + 65536 / factor); + if (physmem_est > 65536) + nbuf += (physmem_est - 65536) * 2 / (factor * 5); + + if (maxbcache && nbuf > maxbcache / BKVASIZE) + nbuf = maxbcache / BKVASIZE; + } + +#if 0 + /* + * Do not allow the buffer_map to be more then 1/2 the size of the + * kernel_map. + */ + if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / + (BKVASIZE * 2)) { + nbuf = (kernel_map->max_offset - kernel_map->min_offset) / + (BKVASIZE * 2); + printf("Warning: nbufs capped at %d\n", nbuf); + } +#endif + + /* + * swbufs are used as temporary holders for I/O, such as paging I/O. + * We have no less then 16 and no more then 256. + */ + nswbuf = max(min(nbuf/4, 256), 16); + + /* + * Reserve space for the buffer cache buffers + */ + swbuf = (void *)v; + v = (caddr_t)(swbuf + nswbuf); + buf = (void *)v; + v = (caddr_t)(buf + nbuf); + + /* + * Calculate the hash table size and reserve space + */ + for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) + ; + bufhashtbl = (void *)v; + v = (caddr_t)(bufhashtbl + bufhashmask); + --bufhashmask; + + return(v); +} + +/* Initialize the buffer subsystem. Called before use of any buffers. 
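+ *
+ * Editor's worked example for kern_vfs_bio_buffer_alloc() above (not part
+ * of the original commit, and assuming BKVASIZE is 16K purely for
+ * illustration, so factor = 64): with 128MB of RAM, physmem_est is
+ * 131072KB, giving nbuf = 50 + min((131072 - 4096) / 64, 65536 / 64)
+ * = 50 + 1024 = 1074, plus (131072 - 65536) * 2 / (64 * 5) = 409 for the
+ * memory above 64MB, i.e. roughly 1483 buffer headers before the
+ * maxbcache clamp is applied.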
*/ +void +bufinit(void) +{ + struct buf *bp; + int i; + + GIANT_REQUIRED; + + LIST_INIT(&invalhash); + mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF); + + for (i = 0; i <= bufhashmask; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; + bp->b_xflags = 0; + LIST_INIT(&bp->b_dep); + BUF_LOCKINIT(bp); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + } + + /* + * maxbufspace is the absolute maximum amount of buffer space we are + * allowed to reserve in KVM and in real terms. The absolute maximum + * is nominally used by buf_daemon. hibufspace is the nominal maximum + * used by most other processes. The differential is required to + * ensure that buf_daemon is able to run when other processes might + * be blocked waiting for buffer space. + * + * maxbufspace is based on BKVASIZE. Allocating buffers larger then + * this may result in KVM fragmentation which is not handled optimally + * by the system. + */ + maxbufspace = nbuf * BKVASIZE; + hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); + lobufspace = hibufspace - MAXBSIZE; + + lorunningspace = 512 * 1024; + hirunningspace = 1024 * 1024; + +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = hibufspace / 20; + +/* + * Reduce the chance of a deadlock occuring by limiting the number + * of delayed-write dirty buffers we allow to stack up. + */ + hidirtybuffers = nbuf / 4 + 20; + numdirtybuffers = 0; +/* + * To support extreme low-memory systems, make sure hidirtybuffers cannot + * eat up all available buffer space. This occurs when our minimum cannot + * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming + * BKVASIZE'd (8K) buffers. + */ + while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { + hidirtybuffers >>= 1; + } + lodirtybuffers = hidirtybuffers / 2; + +/* + * Try to keep the number of free buffers in the specified range, + * and give special processes (e.g. like buf_daemon) access to an + * emergency reserve. + */ + lofreebuffers = nbuf / 18 + 5; + hifreebuffers = 2 * lofreebuffers; + numfreebuffers = nbuf; + +/* + * Maximum number of async ops initiated per buf_daemon loop. This is + * somewhat of a hack at the moment, we really need to limit ourselves + * based on the number of bytes of I/O in-transit that were initiated + * from buf_daemon. + */ + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + cnt.v_wire_count++; +} + +/* + * bfreekva() - free the kva allocation for a buffer. + * + * Must be called at splbio() or higher as this is the only locking for + * buffer_map. + * + * Since this call frees up buffer space, we call bufspacewakeup(). 
+ */ +static void +bfreekva(struct buf * bp) +{ + GIANT_REQUIRED; + + if (bp->b_kvasize) { + ++buffreekvacnt; + bufspace -= bp->b_kvasize; + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize + ); + bp->b_kvasize = 0; + bufspacewakeup(); + } +} + +/* + * bremfree: + * + * Remove the buffer from the appropriate free list. + */ +void +bremfree(struct buf * bp) +{ + int s = splbio(); + int old_qindex = bp->b_qindex; + + GIANT_REQUIRED; + + if (bp->b_qindex != QUEUE_NONE) { + KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { + if (BUF_REFCNT(bp) <= 1) + panic("bremfree: removing a buffer not on a queue"); + } + + /* + * Fixup numfreebuffers count. If the buffer is invalid or not + * delayed-write, and it was on the EMPTY, LRU, or AGE queues, + * the buffer was free and we must decrement numfreebuffers. + */ + if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { + switch(old_qindex) { + case QUEUE_DIRTY: + case QUEUE_CLEAN: + case QUEUE_EMPTY: + case QUEUE_EMPTYKVA: + --numfreebuffers; + break; + default: + break; + } + } + splx(s); +} + + +/* + * Get a buffer with the specified data. Look in the cache first. We + * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE + * is set, the buffer is valid and we do not have to do anything ( see + * getblk() ). This is really just a special case of breadn(). + */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) +{ + + return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior + * to initiating I/O . If B_CACHE is set, the buffer is valid + * and we do not have to do anything. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_inblock++; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_rcred == NOCRED && cred != NOCRED) + bp->b_rcred = crhold(cred); + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + ++readwait; + } + + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + VOP_STRATEGY(vp, rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = bufwait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. 
This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. + */ + +int +bwrite(struct buf * bp) +{ + int oldflags, s; + struct buf *newbp; + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + oldflags = bp->b_flags; + + if (BUF_REFCNT(bp) == 0) + panic("bwrite: buffer is not busy???"); + s = splbio(); + /* + * If a background write is already in progress, delay + * writing this block if it is asynchronous. Otherwise + * wait for the background write to complete. + */ + if (bp->b_xflags & BX_BKGRDINPROG) { + if (bp->b_flags & B_ASYNC) { + splx(s); + bdwrite(bp); + return (0); + } + bp->b_xflags |= BX_BKGRDWAIT; + tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0); + if (bp->b_xflags & BX_BKGRDINPROG) + panic("bwrite: still writing"); + } + + /* Mark the buffer clean */ + bundirty(bp); + + /* + * If this buffer is marked for background writing and we + * do not have to wait for it, make a copy and write the + * copy so as to leave this buffer ready for further use. + * + * This optimization eats a lot of memory. If we have a page + * or buffer shortfall we can't do it. + */ + if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && + (bp->b_flags & B_ASYNC) && + !vm_page_count_severe() && + !buf_dirty_count_severe()) { + if (bp->b_iodone != NULL) { + printf("bp->b_iodone = %p\n", bp->b_iodone); + panic("bwrite: need chained iodone"); + } + + /* get a new block */ + newbp = geteblk(bp->b_bufsize); + + /* set it to be identical to the old block */ + memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); + bgetvp(bp->b_vp, newbp); + newbp->b_lblkno = bp->b_lblkno; + newbp->b_blkno = bp->b_blkno; + newbp->b_offset = bp->b_offset; + newbp->b_iodone = vfs_backgroundwritedone; + newbp->b_flags |= B_ASYNC; + newbp->b_flags &= ~B_INVAL; + + /* move over the dependencies */ + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_movedeps(bp, newbp); + + /* + * Initiate write on the copy, release the original to + * the B_LOCKED queue so that it cannot go away until + * the background write completes. If not locked it could go + * away and then be reconstituted while it was being written. + * If the reconstituted buffer were written, we could end up + * with two background copies being written at the same time. + */ + bp->b_xflags |= BX_BKGRDINPROG; + bp->b_flags |= B_LOCKED; + bqrelse(bp); + bp = newbp; + } + + bp->b_flags &= ~B_DONE; + bp->b_ioflags &= ~BIO_ERROR; + bp->b_flags |= B_WRITEINPROG | B_CACHE; + bp->b_iocmd = BIO_WRITE; + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + + /* + * Normal bwrites pipeline writes + */ + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; + + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_oublock++; + splx(s); + if (oldflags & B_ASYNC) + BUF_KERNPROC(bp); + BUF_STRATEGY(bp); + + if ((oldflags & B_ASYNC) == 0) { + int rtval = bufwait(bp); + brelse(bp); + return (rtval); + } else if ((oldflags & B_NOWDRAIN) == 0) { + /* + * don't allow the async write to saturate the I/O + * system. Deadlocks can occur only if a device strategy + * routine (like in MD) turns around and issues another + * high-level write, in which case B_NOWDRAIN is expected + * to be set. Otherwise we will not deadlock here because + * we are blocking waiting for I/O that is already in-progress + * to complete. + */ + waitrunningbufspace(); + } + + return (0); +} + +/* + * Complete a background write started from bwrite. 
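For context, here is a hedged sketch of how a caller typically drives bread()/bwrite()/bdwrite() as defined above. It assumes kernel context; the helper name, its error handling, and the choice of bwrite() over the BUF_WRITE() wrapper are illustrative, not code from this change.

/*
 * Illustrative fragment (kernel context assumed, hypothetical helper):
 * the usual read-modify-write cycle against the buffer cache.
 */
static int
example_update_block(struct vnode *vp, daddr_t blkno, int size, int sync)
{
        struct buf *bp;
        int error;

        error = bread(vp, blkno, size, NOCRED, &bp);
        if (error) {
                brelse(bp);             /* conventional: release the failed buffer */
                return (error);
        }
        /* ... modify the data through bp->b_data ... */
        if (sync)
                return (bwrite(bp));    /* waits for the I/O and releases bp */
        bdwrite(bp);                    /* marks dirty and releases; written later */
        return (0);
}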
+ */ +static void +vfs_backgroundwritedone(bp) + struct buf *bp; +{ + struct buf *origbp; + + /* + * Find the original buffer that we are writing. + */ + if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL) + panic("backgroundwritedone: lost buffer"); + /* + * Process dependencies then return any unfinished ones. + */ + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_complete(bp); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_movedeps(bp, origbp); + /* + * Clear the BX_BKGRDINPROG flag in the original buffer + * and awaken it if it is waiting for the write to complete. + * If BX_BKGRDINPROG is not set in the original buffer it must + * have been released and re-instantiated - which is not legal. + */ + KASSERT((origbp->b_xflags & BX_BKGRDINPROG), + ("backgroundwritedone: lost buffer2")); + origbp->b_xflags &= ~BX_BKGRDINPROG; + if (origbp->b_xflags & BX_BKGRDWAIT) { + origbp->b_xflags &= ~BX_BKGRDWAIT; + wakeup(&origbp->b_xflags); + } + /* + * Clear the B_LOCKED flag and remove it from the locked + * queue if it currently resides there. + */ + origbp->b_flags &= ~B_LOCKED; + if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { + bremfree(origbp); + bqrelse(origbp); + } + /* + * This buffer is marked B_NOCACHE, so when it is released + * by biodone, it will be tossed. We mark it with BIO_READ + * to avoid biodone doing a second vwakeup. + */ + bp->b_flags |= B_NOCACHE; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~(B_CACHE | B_DONE); + bp->b_iodone = 0; + bufdone(bp); +} + +/* + * Delayed write. (Buffer is marked dirty). Do not bother writing + * anything if the buffer is marked invalid. + * + * Note that since the buffer must be completely valid, we can safely + * set B_CACHE. In fact, we have to set B_CACHE here rather then in + * biodone() in order to prevent getblk from writing the buffer + * out synchronously. + */ +void +bdwrite(struct buf * bp) +{ + GIANT_REQUIRED; + + if (BUF_REFCNT(bp) == 0) + panic("bdwrite: buffer is not busy"); + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + bdirty(bp); + + /* + * Set B_CACHE, indicating that the buffer is fully valid. This is + * true even of NFS now. + */ + bp->b_flags |= B_CACHE; + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); + + /* + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. + */ + vfs_clean_pages(bp); + bqrelse(bp); + + /* + * Wakeup the buffer flushing daemon if we have a lot of dirty + * buffers (midpoint between our recovery point and our stall + * point). + */ + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); + + /* + * note: we cannot initiate I/O from a bdwrite even if we wanted to, + * due to the softdep code. 
+ */ +} + +/* + * bdirty: + * + * Turn buffer into delayed write request. We must clear BIO_READ and + * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to + * itself to properly update it in the dirty/clean lists. We mark it + * B_DONE to ensure that any asynchronization of the buffer properly + * clears B_DONE ( else a panic will occur later ). + * + * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which + * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() + * should only be called if the buffer is known-good. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. + */ +void +bdirty(bp) + struct buf *bp; +{ + KASSERT(bp->b_qindex == QUEUE_NONE, + ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); + bp->b_flags &= ~(B_RELBUF); + bp->b_iocmd = BIO_WRITE; + + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); + } +} + +/* + * bundirty: + * + * Clear B_DELWRI for buffer. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. + */ + +void +bundirty(bp) + struct buf *bp; +{ + KASSERT(bp->b_qindex == QUEUE_NONE, + ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); + + if (bp->b_flags & B_DELWRI) { + bp->b_flags &= ~B_DELWRI; + reassignbuf(bp, bp->b_vp); + --numdirtybuffers; + numdirtywakeup(lodirtybuffers); + } + /* + * Since it is now being written, we can clear its deferred write flag. + */ + bp->b_flags &= ~B_DEFERRED; +} + +/* + * bawrite: + * + * Asynchronous write. Start output on a buffer, but do not wait for + * it to complete. The buffer is released when the output completes. + * + * bwrite() ( or the VOP routine anyway ) is responsible for handling + * B_INVAL buffers. Not us. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) BUF_WRITE(bp); +} + +/* + * bwillwrite: + * + * Called prior to the locking of any vnodes when we are expecting to + * write. We do not want to starve the buffer cache with too many + * dirty buffers so we block here. By blocking prior to the locking + * of any vnodes we attempt to avoid the situation where a locked vnode + * prevents the various system daemons from flushing related buffers. + */ + +void +bwillwrite(void) +{ + if (numdirtybuffers >= hidirtybuffers) { + int s; + + mtx_lock(&Giant); + s = splbio(); + while (numdirtybuffers >= hidirtybuffers) { + bd_wakeup(1); + needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; + tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); + } + splx(s); + mtx_unlock(&Giant); + } +} + +/* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + return(numdirtybuffers >= hidirtybuffers); +} + +/* + * brelse: + * + * Release a busy buffer and, if requested, free its resources. The + * buffer will be stashed in the appropriate bufqueue[] allowing it + * to be accessed later as a cache entity or reused for other purposes. 
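A minimal sketch of the calling convention described in the bwillwrite() comment above; the placement before any vnode locks is the point, and the surrounding write path is only hinted at in comments.

/*
 * Illustrative fragment (kernel context assumed): throttle before any
 * vnode locks are taken, as prescribed above.
 */
        bwillwrite();           /* may sleep until the buf daemon catches up */
        /* ... now lock the vnode and bread()/bdwrite() the data blocks ... */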
+ */ +void +brelse(struct buf * bp) +{ + int s; + + GIANT_REQUIRED; + + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + s = splbio(); + + if (bp->b_flags & B_LOCKED) + bp->b_ioflags &= ~BIO_ERROR; + + if (bp->b_iocmd == BIO_WRITE && + (bp->b_ioflags & BIO_ERROR) && + !(bp->b_flags & B_INVAL)) { + /* + * Failed write, redirty. Must clear BIO_ERROR to prevent + * pages from being scrapped. If B_INVAL is set then + * this case is not run and the next case is run to + * destroy the buffer. B_INVAL can occur if the buffer + * is outside the range supported by the underlying device. + */ + bp->b_ioflags &= ~BIO_ERROR; + bdirty(bp); + } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || + (bp->b_ioflags & BIO_ERROR) || + bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) { + /* + * Either a failed I/O or we were asked to free or not + * cache the buffer. + */ + bp->b_flags |= B_INVAL; + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); + if (bp->b_flags & B_DELWRI) { + --numdirtybuffers; + numdirtywakeup(lodirtybuffers); + } + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if ((bp->b_flags & B_VMIO) == 0) { + if (bp->b_bufsize) + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_release(), even + * if B_DELWRI is set. + * + * If B_DELWRI is not set we may have to set B_RELBUF if we are low + * on pages to return pages to the VM page queues. + */ + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG)) + bp->b_flags |= B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, not even NFS buffers now. Two flags effect this. If + * B_INVAL, the struct buf is invalidated but the VM object is kept + * around ( i.e. so it is trivial to reconstitute the buffer later ). + * + * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be + * invalidated. BIO_ERROR cannot be set for a failed write unless the + * buffer is also B_INVAL because it hits the re-dirtying code above. + * + * Normally we can do this whether a buffer is B_DELWRI or not. If + * the buffer is an NFS buffer, it is tracking piecemeal writes or + * the commit state and we cannot afford to lose the buffer. If the + * buffer has a background write in progress, we need to keep it + * around to prevent it from being reconstituted and starting a second + * background write. + */ + if ((bp->b_flags & B_VMIO) + && !(bp->b_vp->v_tag == VT_NFS && + !vn_isdisk(bp->b_vp, NULL) && + (bp->b_flags & B_DELWRI)) + ) { + + int i, j, resid; + vm_page_t m; + off_t foff; + vm_pindex_t poff; + vm_object_t obj; + struct vnode *vp; + + vp = bp->b_vp; + + /* + * Get the base offset and length of the buffer. Note that + * in the VMIO case if the buffer block size is not + * page-aligned then b_data pointer may not be page-aligned. + * But our b_pages[] array *IS* page aligned. + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). 
+ * + * See man buf(9) for more information + */ + resid = bp->b_bufsize; + foff = bp->b_offset; + + for (i = 0; i < bp->b_npages; i++) { + int had_bogus = 0; + + m = bp->b_pages[i]; + vm_page_flag_clear(m, PG_ZERO); + + /* + * If we hit a bogus page, fixup *all* the bogus pages + * now. + */ + if (m == bogus_page) { + VOP_GETVOBJECT(vp, &obj); + poff = OFF_TO_IDX(bp->b_offset); + had_bogus = 1; + + for (j = i; j < bp->b_npages; j++) { + vm_page_t mtmp; + mtmp = bp->b_pages[j]; + if (mtmp == bogus_page) { + mtmp = vm_page_lookup(obj, poff + j); + if (!mtmp) { + panic("brelse: page missing\n"); + } + bp->b_pages[j] = mtmp; + } + } + + if ((bp->b_flags & B_INVAL) == 0) { + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + m = bp->b_pages[i]; + } + if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { + int poffset = foff & PAGE_MASK; + int presid = resid > (PAGE_SIZE - poffset) ? + (PAGE_SIZE - poffset) : resid; + + KASSERT(presid >= 0, ("brelse: extra page")); + vm_page_set_invalid(m, poffset, presid); + if (had_bogus) + printf("avoided corruption bug in bogus_page/brelse code\n"); + } + resid -= PAGE_SIZE - (foff & PAGE_MASK); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } else if (bp->b_flags & B_VMIO) { + + if (bp->b_flags & (B_INVAL | B_RELBUF)) { + vfs_vmio_release(bp); + } + + } + + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); + if (BUF_REFCNT(bp) > 1) { + /* do not release to free list */ + BUF_UNLOCK(bp); + splx(s); + return; + } + + /* enqueue */ + + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_flags |= B_INVAL; + bp->b_xflags &= ~BX_BKGRDWRITE; + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_kvasize) { + bp->b_qindex = QUEUE_EMPTYKVA; + } else { + bp->b_qindex = QUEUE_EMPTY; + } + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + /* buffers with junk contents */ + } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || + (bp->b_ioflags & BIO_ERROR)) { + bp->b_flags |= B_INVAL; + bp->b_xflags &= ~BX_BKGRDWRITE; + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 2"); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + + /* remaining buffers */ + } else { + if (bp->b_flags & B_DELWRI) + bp->b_qindex = QUEUE_DIRTY; + else + bp->b_qindex = QUEUE_CLEAN; + if (bp->b_flags & B_AGE) + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + else + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + } + + /* + * If B_INVAL, clear B_DELWRI. We've already placed the buffer + * on the correct queue. + */ + if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) + bundirty(bp); + + /* + * Fixup numfreebuffers count. The bp is on an appropriate queue + * unless locked. We then bump numfreebuffers if it is not B_DELWRI. + * We've already handled the B_INVAL case ( B_DELWRI will be clear + * if B_INVAL is set ). 
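To summarize the queue selection brelse() just performed, here is a small standalone model (illustrative names and values, not kernel code) mirroring the decision order: empty buffers first, then invalid or failed buffers, then locked buffers, then dirty versus clean.

/* Illustrative only: a userland model of brelse()'s queue choice above. */
#include <stdio.h>

enum ex_queue { EX_EMPTY, EX_EMPTYKVA, EX_CLEAN, EX_LOCKED, EX_DIRTY };

struct ex_buf {
        int     bufsize, kvasize;
        int     invalid;        /* B_INVAL | B_NOCACHE | B_RELBUF or I/O error */
        int     locked;         /* B_LOCKED */
        int     delwri;         /* B_DELWRI */
};

static enum ex_queue
ex_brelse_queue(const struct ex_buf *bp)
{
        if (bp->bufsize == 0)
                return (bp->kvasize ? EX_EMPTYKVA : EX_EMPTY);
        if (bp->invalid)
                return (EX_CLEAN);      /* junk contents, reusable immediately */
        if (bp->locked)
                return (EX_LOCKED);
        return (bp->delwri ? EX_DIRTY : EX_CLEAN);
}

int
main(void)
{
        struct ex_buf dirty = { 8192, 16384, 0, 0, 1 };
        struct ex_buf bad = { 8192, 16384, 1, 0, 1 };

        printf("delayed write -> %d, failed/invalid -> %d\n",
            ex_brelse_queue(&dirty), ex_brelse_queue(&bad));
        return (0);
}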
+ */ + + if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) + bufcountwakeup(); + + /* + * Something we can maybe free or reuse + */ + if (bp->b_bufsize || bp->b_kvasize) + bufspacewakeup(); + + /* unlock */ + BUF_UNLOCK(bp); + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | + B_DIRECT | B_NOWDRAIN); + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("brelse: not dirty"); + splx(s); +} + +/* + * Release a buffer back to the appropriate queue but do not try to free + * it. The buffer is expected to be used again soon. + * + * bqrelse() is used by bdwrite() to requeue a delayed write, and used by + * biodone() to requeue an async I/O on completion. It is also used when + * known good buffers need to be requeued but we think we may need the data + * again soon. + * + * XXX we should be able to leave the B_RELBUF hint set on completion. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + if (BUF_REFCNT(bp) > 1) { + /* do not release to free list */ + BUF_UNLOCK(bp); + splx(s); + return; + } + if (bp->b_flags & B_LOCKED) { + bp->b_ioflags &= ~BIO_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_DELWRI) { + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + } else if (vm_page_count_severe()) { + /* + * We are too low on memory, we have to try to free the + * buffer (most importantly: the wired pages making up its + * backing store) *now*. + */ + splx(s); + brelse(bp); + return; + } else { + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + } + + if ((bp->b_flags & B_LOCKED) == 0 && + ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { + bufcountwakeup(); + } + + /* + * Something we can maybe free or reuse. + */ + if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) + bufspacewakeup(); + + /* unlock */ + BUF_UNLOCK(bp); + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); + splx(s); +} + +/* Give pages used by the bp back to the VM system (where possible) */ +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + GIANT_REQUIRED; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + /* + * In order to keep page LRU ordering consistent, put + * everything on the inactive queue. + */ + vm_page_unwire(m, 0); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + vm_page_flag_clear(m, PG_ZERO); + /* + * Might as well free the page if we can and it has + * no valid data. 
We also free the page if the + * buffer was used for direct I/O + */ + if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && + m->hold_count == 0) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } else if (bp->b_flags & B_DIRECT) { + vm_page_try_to_free(m); + } else if (vm_page_count_severe()) { + vm_page_try_to_cache(m); + } + } + } + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + + if (bp->b_bufsize) { + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_npages = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +gbincore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + struct bufhashhdr *bh; + + bh = bufhash(vp, blkno); + + /* Search hash chain */ + LIST_FOREACH(bp, bh, b_hash) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + } + return (bp); +} + +/* + * vfs_bio_awrite: + * + * Implement clustered async writes for clearing out B_DELWRI buffers. + * This is much better then the old way of writing only one buffer at + * a time. Note that we may not be presented with the buffers in the + * correct order, so we search for the cluster in both directions. + */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + int j; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + int size; + int maxcl; + + s = splbio(); + /* + * right now we support clustered writing only to regular files. If + * we find a clusterable block we could be in the middle of a cluster + * rather then at the beginning. + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + BUF_REFCNT(bpa) == 0 && + ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != + bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + for (j = 1; i + j <= maxcl && j <= lblkno; j++) { + if ((bpa = gbincore(vp, lblkno - j)) && + BUF_REFCNT(bpa) == 0 && + ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != + bp->b_blkno - ((j * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + --j; + ncl = i + j; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); + splx(s); + return nwritten; + } + } + + BUF_LOCK(bp, LK_EXCLUSIVE); + bremfree(bp); + bp->b_flags |= B_ASYNC; + + splx(s); + /* + * default (old) behavior, writing out only one block + * + * XXX returns b_bufsize instead of b_bcount for nwritten? + */ + nwritten = bp->b_bufsize; + (void) BUF_WRITE(bp); + + return nwritten; +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * Important: B_INVAL is not set. If the caller wishes to throw the + * buffer away, the caller must set B_INVAL prior to calling brelse(). 
+ * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the buf daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. + */ + +static struct buf * +getnewbuf(int slpflag, int slptimeo, int size, int maxsize) +{ + struct buf *bp; + struct buf *nbp; + int defrag = 0; + int nqindex; + static int flushingbufs; + + GIANT_REQUIRED; + + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + + ++getnewbufcalls; + --getnewbufrestarts; +restart: + ++getnewbufrestarts; + + /* + * Setup for scan. If we do not have enough free buffers, + * we setup a degenerate case that immediately fails. Note + * that if we are specially marked process, we are allowed to + * dip into our reserves. + * + * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * + * We start with EMPTYKVA. If the list is empty we backup to EMPTY. + * However, there are a number of cases (defragging, reusing, ...) + * where we cannot backup. + */ + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + + if (nbp == NULL) { + /* + * If no EMPTYKVA buffers and we are either + * defragging or reusing, locate a CLEAN buffer + * to free or reuse. If bufspace useage is low + * skip this step so we can allocate a new buffer. + */ + if (defrag || bufspace >= lobufspace) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + + /* + * If we could not find or were not allowed to reuse a + * CLEAN buffer, check to see if it is ok to use an EMPTY + * buffer. We can only use an EMPTY buffer if allocating + * its KVA would not otherwise run us out of buffer space. + */ + if (nbp == NULL && defrag == 0 && + bufspace + maxsize < hibufspace) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } + } + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + + while ((bp = nbp) != NULL) { + int qindex = nqindex; + + /* + * Calculate next bp ( we can only use it if we do not block + * or do other fancy things ). + */ + if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { + switch(qindex) { + case QUEUE_EMPTY: + nqindex = QUEUE_EMPTYKVA; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) + break; + /* fall through */ + case QUEUE_EMPTYKVA: + nqindex = QUEUE_CLEAN; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) + break; + /* fall through */ + case QUEUE_CLEAN: + /* + * nbp is NULL. + */ + break; + } + } + + /* + * Sanity Checks + */ + KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); + + /* + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. + */ + + KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); + + /* + * If we are defragging then we need a buffer with + * b_kvasize != 0. XXX this situation should no longer + * occur, if defrag is non-zero the buffer's b_kvasize + * should also be non-zero at this point. XXX + */ + if (defrag && bp->b_kvasize == 0) { + printf("Warning: defrag empty buffer %p\n", bp); + continue; + } + + /* + * Start freeing the bp. 
This is somewhat involved. nbp + * remains valid only for QUEUE_EMPTY[KVA] bp's. + */ + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) + panic("getnewbuf: locked buf"); + bremfree(bp); + + if (qindex == QUEUE_CLEAN) { + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + if (bp->b_vp) + brelvp(bp); + } + + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + * + * Get the rest of the buffer freed up. b_kva* is still + * valid after this operation. + */ + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 3"); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + + if (bp->b_bufsize) + allocbuf(bp, 0); + + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_magic = B_MAGIC_BIO; + bp->b_op = &buf_ops_bio; + + LIST_INIT(&bp->b_dep); + + /* + * If we are defragging then free the buffer. + */ + if (defrag) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + defrag = 0; + goto restart; + } + + /* + * If we are overcomitted then recover the buffer and its + * KVM space. This occurs in rare situations when multiple + * processes are blocked in getnewbuf() or allocbuf(). + */ + if (bufspace >= hibufspace) + flushingbufs = 1; + if (flushingbufs && bp->b_kvasize != 0) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; + } + if (bufspace < lobufspace) + flushingbufs = 0; + break; + } + + /* + * If we exhausted our list, sleep as appropriate. We may have to + * wakeup various daemons and write out some dirty buffers. + * + * Generally we are sleeping due to insufficient buffer space. + */ + + if (bp == NULL) { + int flags; + char *waitmsg; + + if (defrag) { + flags = VFS_BIO_NEED_BUFSPACE; + waitmsg = "nbufkv"; + } else if (bufspace >= hibufspace) { + waitmsg = "nbufbs"; + flags = VFS_BIO_NEED_BUFSPACE; + } else { + waitmsg = "newbuf"; + flags = VFS_BIO_NEED_ANY; + } + + bd_speedup(); /* heeeelp */ + + needsbuffer |= flags; + while (needsbuffer & flags) { + if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, + waitmsg, slptimeo)) + return (NULL); + } + } else { + /* + * We finally have a valid bp. We aren't quite out of the + * woods, we still have to reserve kva space. In order + * to keep fragmentation sane we only allocate kva in + * BKVASIZE chunks. + */ + maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; + + if (maxsize != bp->b_kvasize) { + vm_offset_t addr = 0; + + bfreekva(bp); + + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + /* + * Uh oh. Buffer map is to fragmented. We + * must defragment the map. + */ + ++bufdefragcnt; + defrag = 1; + bp->b_flags |= B_INVAL; + brelse(bp); + goto restart; + } + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + bufspace += bp->b_kvasize; + ++bufreusecnt; + } + } + bp->b_data = bp->b_kvabase; + } + return(bp); +} + +/* + * buf_daemon: + * + * buffer flushing daemon. 
Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. + */ + +static struct proc *bufdaemonproc; + +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) + +static void +buf_daemon() +{ + int s; + + mtx_lock(&Giant); + + /* + * This process needs to be suspended prior to shutdown sync. + */ + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, + SHUTDOWN_PRI_LAST); + + /* + * This process is allowed to take the buffer cache to the limit + */ + s = splbio(); + + for (;;) { + kthread_suspend_check(bufdaemonproc); + + bd_request = 0; + + /* + * Do the flush. Limit the amount of in-transit I/O we + * allow to build up, otherwise we would completely saturate + * the I/O system. Wakeup any waiting processes before we + * normally would so they can run in parallel with our drain. + */ + while (numdirtybuffers > lodirtybuffers) { + if (flushbufqueues() == 0) + break; + waitrunningbufspace(); + numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); + } + + /* + * Only clear bd_request if we have reached our low water + * mark. The buf_daemon normally waits 1 second and + * then incrementally flushes any dirty buffers that have + * built up, within reason. + * + * If we were unable to hit our low water mark and couldn't + * find any flushable buffers, we sleep half a second. + * Otherwise we loop immediately. + */ + if (numdirtybuffers <= lodirtybuffers) { + /* + * We reached our low water mark, reset the + * request and sleep until we are needed again. + * The sleep is just so the suspend code works. + */ + bd_request = 0; + tsleep(&bd_request, PVM, "psleep", hz); + } else { + /* + * We couldn't find any flushable dirty buffers but + * still have too many dirty buffers, we + * have to sleep and try again. (rare) + */ + tsleep(&bd_request, PVM, "qsleep", hz / 2); + } + } +} + +/* + * flushbufqueues: + * + * Try to flush a buffer in the dirty queue. We must be careful to + * free up B_INVAL buffers instead of write them, which NFS is + * particularly sensitive to. + */ + +static int +flushbufqueues(void) +{ + struct buf *bp; + int r = 0; + + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + + while (bp) { + KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); + if ((bp->b_flags & B_DELWRI) != 0 && + (bp->b_xflags & BX_BKGRDINPROG) == 0) { + if (bp->b_flags & B_INVAL) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) + panic("flushbufqueues: locked buf"); + bremfree(bp); + brelse(bp); + ++r; + break; + } + if (LIST_FIRST(&bp->b_dep) != NULL && + (bp->b_flags & B_DEFERRED) == 0 && + buf_countdeps(bp, 0)) { + TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], + bp, b_freelist); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], + bp, b_freelist); + bp->b_flags |= B_DEFERRED; + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + continue; + } + vfs_bio_awrite(bp); + ++r; + break; + } + bp = TAILQ_NEXT(bp, b_freelist); + } + return (r); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
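A short kernel-context sketch of how inmem() is meant to be used, mirroring the read-ahead test in breadn() earlier; the surrounding code is only sketched in comments.

/*
 * Illustrative fragment (kernel context assumed): the read-ahead test
 * used by breadn() above.
 */
        if (!inmem(vp, rablkno)) {
                /*
                 * Not held in a buffer and not resident in the backing VM
                 * object: getblk() the block and start an async BIO_READ.
                 */
        }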
+ */ + +int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + GIANT_REQUIRED; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0) + return 0; + + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + goto notinmem; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + goto notinmem; + } + return 1; + +notinmem: + return (0); +} + +/* + * vfs_setdirty: + * + * Sets the dirty range for a buffer based on the status of the dirty + * bits in the pages comprising the buffer. + * + * The range is limited to the size of the buffer. + * + * This routine is primarily used by NFS, but is generalized for the + * B_VMIO case. + */ +static void +vfs_setdirty(struct buf *bp) +{ + int i; + vm_object_t object; + + GIANT_REQUIRED; + /* + * Degenerate case - empty buffer + */ + + if (bp->b_bufsize == 0) + return; + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. + */ + + if ((bp->b_flags & B_VMIO) == 0) + return; + + object = bp->b_pages[0]->object; + + if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p writeable but not mightbedirty\n", object); + if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p mightbedirty but not writeable\n", object); + + if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { + vm_offset_t boffset; + vm_offset_t eoffset; + + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) { + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + vm_page_test_dirty(bp->b_pages[i]); + } + + /* + * Calculate the encompassing dirty range, boffset and eoffset, + * (eoffset - boffset) bytes. + */ + + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) + break; + } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + /* + * Fit it to the buffer. + */ + + if (eoffset > bp->b_bcount) + eoffset = bp->b_bcount; + + /* + * If we have a good dirty range, merge with the existing + * dirty range. + */ + + if (boffset < eoffset) { + if (bp->b_dirtyoff > boffset) + bp->b_dirtyoff = boffset; + if (bp->b_dirtyend < eoffset) + bp->b_dirtyend = eoffset; + } + } +} + +/* + * getblk: + * + * Get a block given a specified block and offset into a file/device. + * The buffers B_DONE bit will be cleared on return, making it almost + * ready for an I/O initiation. B_INVAL may or may not be set on + * return. The caller should clear B_INVAL prior to initiating a + * READ. + * + * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for + * an existing buffer. + * + * For a VMIO buffer, B_CACHE is modified according to the backing VM. 
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set + * and then cleared based on the backing VM. If the previous buffer is + * non-0-sized but invalid, B_CACHE will be cleared. + * + * If getblk() must create a new buffer, the new buffer is returned with + * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which + * case it is returned with B_INVAL clear and B_CACHE set based on the + * backing VM. + * + * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whos + * B_CACHE bit is clear. + * + * What this means, basically, is that the caller should use B_CACHE to + * determine whether the buffer is fully valid or not and should clear + * B_INVAL prior to issuing a read. If the caller intends to validate + * the buffer by loading its data area with something, the caller needs + * to clear B_INVAL. If the caller does this without issuing an I/O, + * the caller should set B_CACHE ( as an optimization ), else the caller + * should issue the I/O and biodone() will set B_CACHE if the I/O was + * a write attempt or if it was a successfull read. If the caller + * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR + * prior to issuing the READ. biodone() will *not* clear B_INVAL. + */ +struct buf * +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) +{ + struct buf *bp; + int s; + struct bufhashhdr *bh; + + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + + s = splbio(); +loop: + /* + * Block if we are low on buffers. Certain processes are allowed + * to completely exhaust the buffer cache. + * + * If this check ever becomes a bottleneck it may be better to + * move it into the else, when gbincore() fails. At the moment + * it isn't a problem. + * + * XXX remove if 0 sections (clean this up after its proven) + */ + if (numfreebuffers == 0) { + if (curthread == PCPU_GET(idlethread)) + return NULL; + needsbuffer |= VFS_BIO_NEED_ANY; + } + + if ((bp = gbincore(vp, blkno))) { + /* + * Buffer is in-core. If the buffer is not busy, it must + * be on a queue. + */ + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, + "getblk", slpflag, slptimeo) == ENOLCK) + goto loop; + splx(s); + return (struct buf *) NULL; + } + + /* + * The buffer is locked. B_CACHE is cleared if the buffer is + * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set + * and for a VMIO buffer B_CACHE is adjusted according to the + * backing VM cache. + */ + if (bp->b_flags & B_INVAL) + bp->b_flags &= ~B_CACHE; + else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) + bp->b_flags |= B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies for non-VMIO case. + */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize)) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + } else { + if ((bp->b_flags & B_VMIO) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + } + } + goto loop; + } + } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting set or + * cleared. If the size has not changed, B_CACHE remains + * unchanged from its previous state. 
+ */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * A buffer with B_DELWRI set and B_CACHE clear must + * be committed before we can return the buffer in + * order to prevent the caller from issuing a read + * ( due to B_CACHE not being set ) and overwriting + * it. + * + * Most callers, including NFS and FFS, need this to + * operate properly either because they assume they + * can issue a read if B_CACHE is not set, or because + * ( for example ) an uncached B_DELWRI might loop due + * to softupdates re-dirtying the buffer. In the latter + * case, B_CACHE is set after the first write completes, + * preventing further loops. + * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE + * above while extending the buffer, we cannot allow the + * buffer to remain with B_CACHE set after the write + * completes or it will represent a corrupt state. To + * deal with this we set B_NOCACHE to scrap the buffer + * after the write. + * + * We might be able to do something fancy, like setting + * B_CACHE in bwrite() except if B_DELWRI is already set, + * so the below call doesn't set B_CACHE, but that gets real + * confusing. This is much easier. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + goto loop; + } + + splx(s); + bp->b_flags &= ~B_DONE; + } else { + /* + * Buffer is not in-core, create new buffer. The buffer + * returned by getnewbuf() is locked. Note that the returned + * buffer is also considered valid (not marked B_INVAL). + */ + int bsize, maxsize, vmio; + off_t offset; + + if (vn_isdisk(vp, NULL)) + bsize = DEV_BSIZE; + else if (vp->v_mountedhere) + bsize = vp->v_mountedhere->mnt_stat.f_iosize; + else if (vp->v_mount) + bsize = vp->v_mount->mnt_stat.f_iosize; + else + bsize = size; + + offset = blkno * bsize; + vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF); + maxsize = vmio ? size + (offset & PAGE_MASK) : size; + maxsize = imax(maxsize, bsize); + + if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * This can be a problem whether the vnode is locked or not. + * If the buffer is created out from under us, we have to + * throw away the one we just created. There is now window + * race because we are safely running at splbio() from the + * point of the duplicate buffer creation through to here, + * and we've locked the buffer. + */ + if (gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bp->b_offset = offset; + + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = bufhash(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + /* + * set B_VMIO bit. allocbuf() the buffer bigger. Since the + * buffer size starts out as 0, B_CACHE will be set by + * allocbuf() for the VMIO case prior to it testing the + * backing store for validity. 
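A standalone worked example (assumed 4K pages, hypothetical block numbers) of the offset/maxsize computation getblk() performs above when it has to create a new VMIO buffer.

/* Illustrative only: the KVA size-up getblk() does for a new VMIO buffer. */
#include <stdio.h>

#define EX_PAGE_MASK    (4096 - 1)      /* assumed 4K pages */

int
main(void)
{
        long bsize = 512;               /* DEV_BSIZE, e.g. a disk vnode */
        long blkno = 7, size = 2048;
        long offset = blkno * bsize;                    /* 3584 */
        long maxsize = size + (offset & EX_PAGE_MASK);  /* VMIO: 2048 + 3584 */

        if (maxsize < bsize)
                maxsize = bsize;
        /*
         * 2048 bytes of data starting 3584 bytes into a page need 5632
         * bytes of KVA from the page base, so the mapping covers both
         * pages; getnewbuf() then rounds this up to a BKVASIZE multiple.
         */
        printf("offset=%ld maxsize=%ld\n", offset, maxsize);
        return (0);
}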
+ */ + + if (vmio) { + bp->b_flags |= B_VMIO; +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + + allocbuf(bp, size); + + splx(s); + bp->b_flags &= ~B_DONE; + } + return (bp); +} + +/* + * Get an empty, disassociated buffer of given size. The buffer is initially + * set to B_INVAL. + */ +struct buf * +geteblk(int size) +{ + struct buf *bp; + int s; + int maxsize; + + maxsize = (size + BKVAMASK) & ~BKVAMASK; + + s = splbio(); + while ((bp = getnewbuf(0, 0, size, maxsize)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + return (bp); +} + + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. + * + * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with + * B_CACHE for the non-VMIO case. + */ + +int +allocbuf(struct buf *bp, int size) +{ + int newbsize, mbsize; + int i; + + GIANT_REQUIRED; + + if (BUF_REFCNT(bp) == 0) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel. Don't + * mess with B_CACHE. + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_data = bp->b_kvabase; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer + * grows. + */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufmallocspace += mbsize; + return 1; + } + origbuf = NULL; + origbufsize = 0; + /* + * If the buffer is growing on its other-than-first allocation, + * then we revert to the page-allocation scheme. 
+ */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (size == 0) ? 0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); + /* + * Set B_CACHE initially if buffer is 0 length or will become + * 0-length. + */ + if (size == 0 || bp->b_bufsize == 0) + bp->b_flags |= B_CACHE; + + if (newbsize < bp->b_bufsize) { + /* + * DEV_BSIZE aligned new buffer size is less then the + * DEV_BSIZE aligned existing buffer size. Figure out + * if we have to remove any pages. + */ + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of + * vnode_pager_setsize + */ + m = bp->b_pages[i]; + KASSERT(m != bogus_page, + ("allocbuf: bogus page found")); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; + + bp->b_pages[i] = NULL; + vm_page_unwire(m, 0); + } + pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (size > bp->b_bcount) { + /* + * We are growing the buffer, possibly in a + * byte-granular fashion. + */ + struct vnode *vp; + vm_object_t obj; + vm_offset_t toff; + vm_offset_t tinc; + + /* + * Step 1, bring in the VM pages from the object, + * allocating them if necessary. We must clear + * B_CACHE if these pages are not valid for the + * range covered by the buffer. + */ + + vp = bp->b_vp; + VOP_GETVOBJECT(vp, &obj); + + while (bp->b_npages < desiredpages) { + vm_page_t m; + vm_pindex_t pi; + + pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; + if ((m = vm_page_lookup(obj, pi)) == NULL) { + /* + * note: must allocate system pages + * since blocking here could intefere + * with paging I/O, no matter which + * process we are. + */ + m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM); + if (m == NULL) { + VM_WAIT; + vm_pageout_deficit += desiredpages - bp->b_npages; + } else { + vm_page_wire(m); + vm_page_wakeup(m); + bp->b_flags &= ~B_CACHE; + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; + } + continue; + } + + /* + * We found a page. If we have to sleep on it, + * retry because it might have gotten freed out + * from under us. + * + * We can only test PG_BUSY here. Blocking on + * m->busy might lead to a deadlock: + * + * vm_fault->getpages->cluster_read->allocbuf + * + */ + + if (vm_page_sleep_busy(m, FALSE, "pgtblk")) + continue; + + /* + * We have a good page. Should we wakeup the + * page daemon? + */ + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; + } + + /* + * Step 2. We've loaded the pages into the buffer, + * we have to figure out if we can still have B_CACHE + * set. 
Note that B_CACHE is set according to the + * byte-granular range ( bcount and size ), new the + * aligned range ( newbsize ). + * + * The VM test is against m->valid, which is DEV_BSIZE + * aligned. Needless to say, the validity of the data + * needs to also be DEV_BSIZE aligned. Note that this + * fails with NFS if the server or some other client + * extends the file's EOF. If our buffer is resized, + * B_CACHE may remain set! XXX + */ + + toff = bp->b_bcount; + tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); + + while ((bp->b_flags & B_CACHE) && toff < size) { + vm_pindex_t pi; + + if (tinc > (size - toff)) + tinc = size - toff; + + pi = ((bp->b_offset & PAGE_MASK) + toff) >> + PAGE_SHIFT; + + vfs_buf_test_cache( + bp, + bp->b_offset, + toff, + tinc, + bp->b_pages[pi] + ); + toff += tinc; + tinc = PAGE_SIZE; + } + + /* + * Step 3, fixup the KVM pmap. Remember that + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + + bp->b_data = (caddr_t) + trunc_page((vm_offset_t)bp->b_data); + pmap_qenter( + (vm_offset_t)bp->b_data, + bp->b_pages, + bp->b_npages + ); + + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); + } + } + if (newbsize < bp->b_bufsize) + bufspacewakeup(); + bp->b_bufsize = newbsize; /* actual buffer allocation */ + bp->b_bcount = size; /* requested buffer size */ + return 1; +} + +/* + * bufwait: + * + * Wait for buffer I/O completion, returning error status. The buffer + * is left locked and B_DONE on return. B_EINTR is converted into a EINTR + * error and cleared. + */ +int +bufwait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + if (bp->b_iocmd == BIO_READ) + tsleep(bp, PRIBIO, "biord", 0); + else + tsleep(bp, PRIBIO, "biowr", 0); + } + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_ioflags & BIO_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + + /* + * Call back function from struct bio back up to struct buf. + * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY(). + */ +void +bufdonebio(struct bio *bp) +{ + bufdone(bp->bio_caller2); +} + +/* + * bufdone: + * + * Finish I/O on a buffer, optionally calling a completion function. + * This is usually called from an interrupt so process blocking is + * not allowed. + * + * biodone is also responsible for setting B_CACHE in a B_VMIO bp. + * In a non-VMIO bp, B_CACHE will be set on the next getblk() + * assuming B_INVAL is clear. + * + * For the VMIO case, we set B_CACHE if the op was a read and no + * read error occured, or if the op was a write. B_CACHE is never + * set if the buffer is invalid or otherwise uncacheable. + * + * biodone does not mess with B_INVAL, allowing the I/O routine or the + * initiator to leave B_INVAL set to brelse the buffer out of existance + * in the biodone routine. 
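A standalone model (assumed 4K pages, hypothetical sizes) of the page-bounded toff/tinc walk that allocbuf() uses above when re-testing B_CACHE after growing a buffer: each chunk stops at a page boundary so vfs_buf_test_cache() only ever examines one page at a time.

/* Illustrative only: the page-bounded walk allocbuf() performs above. */
#include <stdio.h>

#define EX_PAGE_SIZE    4096
#define EX_PAGE_SHIFT   12
#define EX_PAGE_MASK    (EX_PAGE_SIZE - 1)

int
main(void)
{
        long b_offset = 3584, b_bcount = 2048, size = 10240;    /* hypothetical */
        long toff = b_bcount;
        long tinc = EX_PAGE_SIZE - ((b_offset + toff) & EX_PAGE_MASK);

        while (toff < size) {
                long pi;

                if (tinc > size - toff)
                        tinc = size - toff;
                pi = ((b_offset & EX_PAGE_MASK) + toff) >> EX_PAGE_SHIFT;
                printf("test page %ld for bytes [%ld, %ld)\n",
                    pi, toff, toff + tinc);
                toff += tinc;
                tinc = EX_PAGE_SIZE;
        }
        return (0);
}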
+ */ +void +bufdone(struct buf *bp) +{ + int s, error; + void (*biodone)(struct buf *); + + GIANT_REQUIRED; + + s = splbio(); + + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); + KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); + + bp->b_flags |= B_DONE; + runningbufwakeup(bp); + + if (bp->b_iocmd == BIO_DELETE) { + brelse(bp); + splx(s); + return; + } + + if (bp->b_iocmd == BIO_WRITE) { + vwakeup(bp); + } + + /* call optional completion function if requested */ + if (bp->b_iodone != NULL) { + biodone = bp->b_iodone; + bp->b_iodone = NULL; + (*biodone) (bp); + splx(s); + return; + } + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_complete(bp); + + if (bp->b_flags & B_VMIO) { + int i; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + error = VOP_GETVOBJECT(vp, &obj); + +#if defined(VFS_BIO_DEBUG) + if (vp->v_usecount == 0) { + panic("biodone: zero vnode ref count"); + } + + if (error) { + panic("biodone: missing VM object"); + } + + if ((vp->v_flag & VOBJBUF) == 0) { + panic("biodone: vnode is not setup for merged cache"); + } +#endif + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("biodone: no buffer offset")); + + if (error) { + panic("biodone: no object"); + } +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + + /* + * Set B_CACHE if the op was a normal read and no error + * occured. B_CACHE is set for writes in the b*write() + * routines. + */ + iosize = bp->b_bcount - bp->b_resid; + if (bp->b_iocmd == BIO_READ && + !(bp->b_flags & (B_INVAL|B_NOCACHE)) && + !(bp->b_ioflags & BIO_ERROR)) { + bp->b_flags |= B_CACHE; + } + + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + int resid; + + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (m == NULL) + panic("biodone: page disappeared!"); + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf( +"biodone: foff(%lu)/m->pindex(%d) mismatch\n", + (unsigned long)foff, m->pindex); + } +#endif + + /* + * In the write case, the valid and clean bits are + * already changed correctly ( see bdwrite() ), so we + * only need to do this here in the read case. + */ + if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { + vfs_page_set_valid(bp, foff, i, m); + } + vm_page_flag_clear(m, PG_ZERO); + + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! 
+ */ + if (m->busy == 0) { + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); + if (!vn_isdisk(vp, NULL)) + printf(" iosize: %ld, lblkno: %jd, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (intmax_t) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %jd, flags: 0x%lx, npages: %d\n", + (intmax_t) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); + panic("biodone: page busy < 0\n"); + } + vm_page_io_finish(m); + vm_object_pip_subtract(obj, 1); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + iosize -= resid; + } + if (obj) + vm_object_pip_wakeupn(obj, 0); + } + + /* + * For asynchronous completions, release the buffer now. The brelse + * will do a wakeup there if necessary - so no need to do a wakeup + * here in the async case. The sync case always needs to do a wakeup. + */ + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) + brelse(bp); + else + bqrelse(bp); + } else { + wakeup(bp); + } + splx(s); +} + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. + */ +void +vfs_unbusy_pages(struct buf * bp) +{ + int i; + + GIANT_REQUIRED; + + runningbufwakeup(bp); + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj; + + VOP_GETVOBJECT(vp, &obj); + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + vm_object_pip_subtract(obj, 1); + vm_page_flag_clear(m, PG_ZERO); + vm_page_io_finish(m); + } + vm_object_pip_wakeupn(obj, 0); + } +} + +/* + * vfs_page_set_valid: + * + * Set the valid bits in a page based on the supplied offset. The + * range is restricted to the buffer's size. + * + * This routine is typically called after a read completes. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) +{ + vm_ooffset_t soff, eoff; + + GIANT_REQUIRED; + /* + * Start and end offsets in buffer. eoff - soff may not cross a + * page boundry or cross the end of the buffer. The end of the + * buffer, in this case, is our file EOF, not the allocation size + * of the buffer. + */ + soff = off; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. + */ + if (eoff > soff) { + vm_page_set_validclean( + m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff) + ); + } +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. + * + * Since I/O has not been initiated yet, certain buffer flags + * such as BIO_ERROR or B_INVAL may be in an inconsistant state + * and should be ignored. 
+ */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i, bogus; + + GIANT_REQUIRED; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj; + vm_ooffset_t foff; + + VOP_GETVOBJECT(vp, &obj); + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + vfs_setdirty(bp); + +retry: + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + if (vm_page_sleep_busy(m, FALSE, "vbpage")) + goto retry; + } + + bogus = 0; + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_ZERO); + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + } + + /* + * When readying a buffer for a read ( i.e + * clear_modify == 0 ), it is important to do + * bogus_page replacement for valid pages in + * partially instantiated buffers. Partially + * instantiated buffers can, in turn, occur when + * reconstituting a buffer from its VM backing store + * base. We only have to do this if B_CACHE is + * clear ( which causes the I/O to occur in the + * first place ). The replacement prevents the read + * I/O from overwriting potentially dirty VM-backed + * pages. XXX bogus page replacement is, uh, bogus. + * It may not work properly with small-block devices. + * We need to find a better way. + */ + + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) + vfs_page_set_valid(bp, foff, i, m); + else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus++; + } + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + if (bogus) + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. + * + * Note that while we only really need to clean through to b_bcount, we + * just go ahead and clean through to b_bufsize. + */ +static void +vfs_clean_pages(struct buf * bp) +{ + int i; + + GIANT_REQUIRED; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages: no buffer offset")); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + vm_ooffset_t eoff = noff; + + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + vfs_page_set_valid(bp, foff, i, m); + /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ + foff = noff; + } + } +} + +/* + * vfs_bio_set_validclean: + * + * Set the range within the buffer to valid and clean. The range is + * relative to the beginning of the buffer, b_offset. Note that b_offset + * itself may be offset from the beginning of the first page. + * + */ + +void +vfs_bio_set_validclean(struct buf *bp, int base, int size) +{ + if (bp->b_flags & B_VMIO) { + int i; + int n; + + /* + * Fixup base to be relative to beginning of first page. + * Set initial n to be the maximum number of bytes in the + * first page that can be validated. 
+ */ + + base += (bp->b_offset & PAGE_MASK); + n = PAGE_SIZE - (base & PAGE_MASK); + + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + vm_page_t m = bp->b_pages[i]; + + if (n > size) + n = size; + + vm_page_set_validclean(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + } +} + +/* + * vfs_bio_clrbuf: + * + * clear a buffer. This routine essentially fakes an I/O, so we need + * to clear BIO_ERROR and B_INVAL. + * + * Note that while we only theoretically need to clear through b_bcount, + * we go ahead and clear through b_bufsize. + */ + +void +vfs_bio_clrbuf(struct buf *bp) +{ + int i, mask = 0; + caddr_t sa, ea; + + GIANT_REQUIRED; + + if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + if ((bp->b_pages[0]->valid & mask) == mask) { + bp->b_resid = 0; + return; + } + if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && + ((bp->b_pages[0]->valid & mask) == 0)) { + bzero(bp->b_data, bp->b_bufsize); + bp->b_pages[0]->valid |= mask; + bp->b_resid = 0; + return; + } + } + ea = sa = bp->b_data; + for(i=0;i<bp->b_npages;i++,sa=ea) { + int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; + ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); + ea = (caddr_t)(vm_offset_t)ulmin( + (u_long)(vm_offset_t)ea, + (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(sa, ea - sa); + } + } else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && + (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(sa, DEV_BSIZE); + } + } + bp->b_pages[i]->valid |= mask; + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_free_pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. + */ +static void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + GIANT_REQUIRED; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { +tryagain: + /* + * note: must allocate system pages since blocking here + * could intefere with paging I/O, no matter which + * process we are. 
+ */ + p = vm_page_alloc(kernel_object, + ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_SYSTEM); + if (!p) { + vm_pageout_deficit += (to - from) >> PAGE_SHIFT; + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + p->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(p, PG_ZERO); + pmap_qenter(pg, &p, 1); + bp->b_pages[index] = p; + vm_page_wakeup(p); + } + bp->b_npages = index; +} + +/* Return pages associated with this buf to the vm system */ +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index, newnpages; + + GIANT_REQUIRED; + + from = round_page(from); + to = round_page(to); + newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf( + "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", + (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_qremove(pg, 1); + vm_page_busy(p); + vm_page_unwire(p, 0); + vm_page_free(p); + } + } + bp->b_npages = newnpages; +} + + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +/* DDB command to show buffer data */ +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; + + if (!have_addr) { + db_printf("usage: show buffer <addr>\n"); + return; + } + + db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); + db_printf( + "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" + "b_dev = (%d,%d), b_data = %p, b_blkno = %jd, b_pblkno = %jd\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + major(bp->b_dev), minor(bp->b_dev), bp->b_data, + (intmax_t)bp->b_blkno, (intmax_t)bp->b_pblkno); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, + (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } +} +#endif /* DDB */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 0000000..be79fc2 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/filedesc.h> +#include <sys/fnv_hash.h> + +/* + * This structure describes the elements in the cache of recent + * names looked up by namei. + */ + +struct namecache { + LIST_ENTRY(namecache) nc_hash; /* hash chain */ + LIST_ENTRY(namecache) nc_src; /* source vnode list */ + TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ + struct vnode *nc_dvp; /* vnode of parent of name */ + struct vnode *nc_vp; /* vnode the name refers to */ + u_char nc_flag; /* flag bits */ + u_char nc_nlen; /* length of name */ + char nc_name[0]; /* segment name */ +}; + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name cacheing. 
+ */ +#define NCHHASH(hash) \ + (&nchashtbl[(hash) & nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ +static u_long nchash; /* size of hash table */ +SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); +static u_long ncnegfactor = 16; /* ratio of negative entries */ +SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); +static u_long numneg; /* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); +static u_long numcache; /* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); +static u_long numcachehv; /* number of cache entries with vnodes held */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); +#if 0 +static u_long numcachepl; /* number of cache purge for leaf entries */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); +#endif +struct nchstats nchstats; /* cache effectiveness statistics */ + +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); + +/* Export size information to userland */ +SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); +SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); + +/* + * The new name cache statistics + */ +SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); +#define STATNODE(mode, name, var) \ + SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); +STATNODE(CTLFLAG_RD, numneg, &numneg); +STATNODE(CTLFLAG_RD, numcache, &numcache); +static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); +static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); +static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); +static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); +static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); +static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); +static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); +static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); +static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); +static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); + +SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats, + sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); + + + +static void cache_zap(struct namecache *ncp); + +static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 1 + +/* + * Grab an atomic snapshot of the name cache hash chain lengths + */ +SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); + +static int +sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count; + + n_nchash = nchash + 1; /* nchash is max index, not count */ + if (!req->oldptr) + return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + error = SYSCTL_OUT(req, &count, sizeof(count)); + if (error) + return (error); + } + return (0); +} +SYSCTL_PROC(_debug_hashstat, 
OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD, + 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); + +static int +sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count, maxlength, used, pct; + + if (!req->oldptr) + return SYSCTL_OUT(req, 0, 4 * sizeof(int)); + + n_nchash = nchash + 1; /* nchash is max index, not count */ + used = 0; + maxlength = 0; + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + if (count) + used++; + if (maxlength < count) + maxlength = count; + } + n_nchash = nchash + 1; + pct = (used * 100 * 100) / n_nchash; + error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); + if (error) + return (error); + error = SYSCTL_OUT(req, &used, sizeof(used)); + if (error) + return (error); + error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); + if (error) + return (error); + error = SYSCTL_OUT(req, &pct, sizeof(pct)); + if (error) + return (error); + return (0); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD, + 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths"); + +/* + * Delete an entry from its hash list and move it to the front + * of the LRU list for immediate reuse. + */ +static void +cache_zap(ncp) + struct namecache *ncp; +{ + LIST_REMOVE(ncp, nc_hash); + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { + vdrop(ncp->nc_dvp); + numcachehv--; + } + if (ncp->nc_vp) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + numneg--; + } + numcache--; + free(ncp, M_VFSCACHE); +} + +/* + * cache_leaf_test() + * + * Test whether this (directory) vnode's namei cache entry contains + * subdirectories or not. Used to determine whether the directory is + * a leaf in the namei cache or not. Note: the directory may still + * contain files in the namei cache. + * + * Returns 0 if the directory is a leaf, -1 if it isn't. + */ +int +cache_leaf_test(struct vnode *vp) +{ + struct namecache *ncpc; + + for (ncpc = LIST_FIRST(&vp->v_cache_src); + ncpc != NULL; + ncpc = LIST_NEXT(ncpc, nc_src) + ) { + if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) + return(-1); + } + return(0); +} + +/* + * Lookup an entry in the cache + * + * Lookup is called with dvp pointing to the directory to search, + * cnp pointing to the name of the entry being sought. If the lookup + * succeeds, the vnode is returned in *vpp, and a status of -1 is + * returned. If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. If the lookup + * fails, a status of zero is returned. 
+ */ + +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + struct namecache *ncp; + u_int32_t hash; + + if (!doingcache) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + numcalls++; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + *vpp = dvp; + dothits++; + return (-1); + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + dotdothits++; + if (dvp->v_dd->v_id != dvp->v_ddid || + (cnp->cn_flags & MAKEENTRY) == 0) { + dvp->v_ddid = 0; + return (0); + } + *vpp = dvp->v_dd; + return (-1); + } + } + + hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); + hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash); + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + numchecks++; + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == 0) { + if ((cnp->cn_flags & MAKEENTRY) == 0) { + nummisszap++; + } else { + nummiss++; + } + nchstats.ncs_miss++; + return (0); + } + + /* We don't want to have an entry, so dump it */ + if ((cnp->cn_flags & MAKEENTRY) == 0) { + numposzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (ncp->nc_vp) { + numposhits++; + nchstats.ncs_goodhits++; + *vpp = ncp->nc_vp; + return (-1); + } + + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + numnegzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + numneghits++; + /* + * We found a "negative" match, ENOENT notifies client of this match. + * The nc_vpid field records whether this is a whiteout. + */ + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + nchstats.ncs_neghits++; + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + return (ENOENT); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + struct namecache *ncp; + struct nchashhead *ncpp; + u_int32_t hash; + int len; + + if (!doingcache) + return; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + return; + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + if (vp) { + dvp->v_dd = vp; + dvp->v_ddid = vp->v_id; + } else { + dvp->v_dd = dvp; + dvp->v_ddid = 0; + } + return; + } + } + + ncp = (struct namecache *) + malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + if (!vp) { + numneg++; + ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; + } else if (vp->v_type == VDIR) { + vp->v_dd = dvp; + vp->v_ddid = dvp->v_id; + } + + /* + * Fill in cache info, if vp is NULL this is a "negative" cache entry. + * For negative entries, we have to record whether it is a whiteout. + * the whiteout flag is stored in the nc_vpid field which is + * otherwise unused. 
+ */ + ncp->nc_vp = vp; + ncp->nc_dvp = dvp; + len = ncp->nc_nlen = cnp->cn_namelen; + hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); + bcopy(cnp->cn_nameptr, ncp->nc_name, len); + hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash); + ncpp = NCHHASH(hash); + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + if (LIST_EMPTY(&dvp->v_cache_src)) { + vhold(dvp); + numcachehv++; + } + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + if (vp) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + } + if (numneg * ncnegfactor > numcache) { + ncp = TAILQ_FIRST(&ncneg); + cache_zap(ncp); + } +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +static void +nchinit(void *dummy __unused) +{ + + TAILQ_INIT(&ncneg); + nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL) + + +/* + * Invalidate all entries to a particular vnode. + * + * Remove all entries in the namecache relating to this vnode and + * change the v_id. We take the v_id from a global counter, since + * it becomes a handy sequence number in crash-dumps that way. + * No valid vnode will ever have (v_id == 0). + * + * XXX: Only time and the size of v_id prevents this from failing: + * XXX: In theory we should hunt down all (struct vnode*, v_id) + * XXX: soft references and nuke them, at least on the global + * XXX: v_id wraparound. The period of resistance can be extended + * XXX: by incrementing each vnodes v_id individually instead of + * XXX: using the global v_id. + */ + +void +cache_purge(vp) + struct vnode *vp; +{ + static u_long nextid; + + while (!LIST_EMPTY(&vp->v_cache_src)) + cache_zap(LIST_FIRST(&vp->v_cache_src)); + while (!TAILQ_EMPTY(&vp->v_cache_dst)) + cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); + + do + nextid++; + while (nextid == vp->v_id || !nextid); + vp->v_id = nextid; + vp->v_dd = vp; + vp->v_ddid = 0; +} + +/* + * Flush all entries referencing a particular filesystem. + * + * Since we need to check it anyway, we will flush all the invalid + * entries at the same time. + */ +void +cache_purgevfs(mp) + struct mount *mp; +{ + struct nchashhead *ncpp; + struct namecache *ncp, *nnp; + + /* Scan hash tables for applicable entries */ + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) { + nnp = LIST_NEXT(ncp, nc_hash); + if (ncp->nc_dvp->v_mount == mp) { + cache_zap(ncp); + } + } + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. 
+ */ + +int +vfs_cache_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp, *vp; + int lockparent; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + struct thread *td = cnp->cn_thread; + u_long vpid; /* capability number of vnode */ + + *vpp = NULL; + dvp = ap->a_dvp; + lockparent = flags & LOCKPARENT; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = VOP_ACCESS(dvp, VEXEC, cred, td); + + if (error) + return (error); + + error = cache_lookup(dvp, vpp, cnp); + +#ifdef LOOKUP_SHARED + if (!error) { + /* We do this because the rest of the system now expects to get + * a shared lock, which is later upgraded if LOCKSHARED is not + * set. We have so many cases here because of bugs that yield + * inconsistant lock states. This all badly needs to be fixed + */ + error = VOP_CACHEDLOOKUP(dvp, vpp, cnp); + if (!error) { + int flock; + + flock = VOP_ISLOCKED(*vpp, td); + if (flock != LK_EXCLUSIVE) { + if (flock == 0) { + if ((flags & ISLASTCN) && + (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_SHARED, td); + else + VOP_LOCK(*vpp, LK_EXCLUSIVE, td); + } + } else if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_DOWNGRADE, td); + } + return (error); + } +#else + if (!error) + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); +#endif + + if (error == ENOENT) + return (error); + + vp = *vpp; + vpid = vp->v_id; + cnp->cn_flags &= ~PDIRUNLOCK; + if (dvp == vp) { /* lookup on "." */ + VREF(vp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; +#ifdef LOOKUP_SHARED + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + error = vget(vp, LK_SHARED, td); + else + error = vget(vp, LK_EXCLUSIVE, td); +#else + error = vget(vp, LK_EXCLUSIVE, td); +#endif + + if (!error && lockparent && (flags & ISLASTCN)) { + if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0) + cnp->cn_flags &= ~PDIRUNLOCK; + } + } else { +#ifdef LOOKUP_SHARED + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + error = vget(vp, LK_SHARED, td); + else + error = vget(vp, LK_EXCLUSIVE, td); +#else + error = vget(vp, LK_EXCLUSIVE, td); +#endif + if (!lockparent || error || !(flags & ISLASTCN)) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. 
+ */ + if (!error) { + if (vpid == vp->v_id) + return (0); + vput(vp); + if (lockparent && dvp != vp && (flags & ISLASTCN)) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + } + if (cnp->cn_flags & PDIRUNLOCK) { + error = vn_lock(dvp, LK_EXCLUSIVE, td); + if (error) + return (error); + cnp->cn_flags &= ~PDIRUNLOCK; + } +#ifdef LOOKUP_SHARED + error = VOP_CACHEDLOOKUP(dvp, vpp, cnp); + + if (!error) { + int flock = 0; + + flock = VOP_ISLOCKED(*vpp, td); + if (flock != LK_EXCLUSIVE) { + if (flock == 0) { + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_SHARED, td); + else + VOP_LOCK(*vpp, LK_EXCLUSIVE, td); + } + } else if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_DOWNGRADE, td); + } + + return (error); +#else + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); +#endif +} + + +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif + +/* + * XXX All of these sysctls would probably be more productive dead. + */ +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, + "Disable the getcwd syscall"); + +/* Various statistics for the getcwd syscall */ +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); + +/* Implementation of the getcwd syscall */ +int +__getcwd(td, uap) + struct thread *td; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = td->td_proc->p_fd; + slash_prefixed = 0; + FILEDESC_LOCK(fdp); + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + if (vp->v_mount == NULL) { /* forced unmount */ + FILEDESC_UNLOCK(fdp); + free(buf, M_TEMP); + return (EBADF); + } + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + FILEDESC_UNLOCK(fdp); + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + FILEDESC_UNLOCK(fdp); + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + FILEDESC_UNLOCK(fdp); + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + FILEDESC_UNLOCK(fdp); + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} + +/* + * Thus begins the fullpath magic. 
+ */ + +#undef STATNODE +#define STATNODE(name) \ + static u_int name; \ + SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "") + +static int disablefullpath; +SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, + "Disable the vn_fullpath function"); + +STATNODE(numfullpathcalls); +STATNODE(numfullpathfail1); +STATNODE(numfullpathfail2); +STATNODE(numfullpathfail3); +STATNODE(numfullpathfail4); +STATNODE(numfullpathfound); + +/* + * Retrieve the full filesystem path that correspond to a vnode from the name + * cache (if available) + */ +int +vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) +{ + char *bp, *buf; + int i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numfullpathcalls++; + if (disablefullpath) + return (ENODEV); + if (vn == NULL) + return (EINVAL); + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + bp = buf + MAXPATHLEN - 1; + *bp = '\0'; + fdp = td->td_proc->p_fd; + slash_prefixed = 0; + FILEDESC_LOCK(fdp); + for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + if (vp->v_mount == NULL) { /* forced unmount */ + FILEDESC_UNLOCK(fdp); + free(buf, M_TEMP); + return (EBADF); + } + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp != vn && vp->v_dd->v_id != vp->v_ddid) { + FILEDESC_UNLOCK(fdp); + numfullpathfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + FILEDESC_UNLOCK(fdp); + numfullpathfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (vp != vn && ncp->nc_dvp != vp->v_dd) { + FILEDESC_UNLOCK(fdp); + numfullpathfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = ncp->nc_dvp; + } + if (!slash_prefixed) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + FILEDESC_UNLOCK(fdp); + numfullpathfound++; + *retbuf = bp; + *freebuf = buf; + return (0); +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 0000000..4c11952 --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,1008 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $FreeBSD$ + */ + +#include "opt_debug_cluster.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <sys/sysctl.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, + "Debug VFS clustering code"); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); + +static struct cluster_save * + cluster_collectbufs(struct vnode *vp, struct buf *last_bp); +static struct buf * + cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp); + +static int write_behind = 1; +SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, + "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); + +/* Page expended to mark partially backed buffers */ +extern vm_page_t bogus_page; + +/* + * Number of physical bufs (pbufs) this subsystem is allowed. + * Manipulated by vm_pager.c + */ +extern int cluster_pbuf_freecnt; + +/* + * Maximum number of blocks for read-ahead. + */ +#define MAXRA 32 + +/* + * Read data to a buf, including read-ahead if we find this to be beneficial. + * cluster_read replaces bread. + */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + long totread; + int seqcount; + struct buf **bpp; +{ + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; + + error = 0; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_mount->mnt_iosize_max / size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; + + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. 
+ */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; + /* + * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. + */ + s = splbio(); + for (i = 1; i < maxra; i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know + * to check again. + */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + } + splx(s); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + } else { + off_t firstread = bp->b_offset; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += (bp->b_bufsize / size); + } else { +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_RAM; + bp->b_iocmd = BIO_READ; + lblkno += 1; + } + } + + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. 
+ */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_ASYNC | B_RAM; + rbp->b_iocmd = BIO_READ; + rbp->b_blkno = blkno; + } + } + } + + /* + * handle the synchronous read + */ + if (bp) { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%ld,%ld,%d) ", + (long)bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(bp, 0); + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) + BUF_KERNPROC(bp); + error = VOP_STRATEGY(vp, bp); + curproc->p_stats->p_ru.ru_inblock++; + } + + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { + rbp->b_flags &= ~B_ASYNC; + brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~B_ASYNC; + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+"); + else + printf("A"); + printf("(%lld,%ld,%lld,%d) ", + (intmax_t)rbp->b_lblkno, rbp->b_bcount, + (intmax_t)(rbp->b_lblkno - origblkno), + seqcount); + } +#endif + + if ((rbp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(rbp, 0); + } + rbp->b_flags &= ~B_INVAL; + rbp->b_ioflags &= ~BIO_ERROR; + if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) + BUF_KERNPROC(rbp); + (void) VOP_STRATEGY(vp, rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (bufwait(reqbp)); + else + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. + */ +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lbn; + daddr_t blkno; + long size; + int run; + struct buf *fbp; +{ + struct buf *bp, *tbp; + daddr_t bn; + int i, inc, j; + + GIANT_REQUIRED; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != filesize %ld\n", + size, vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_iocmd = BIO_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_RAM; + tbp->b_iocmd = BIO_READ; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == 0) + return tbp; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. 
+ */ + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + bp->b_iocmd = BIO_READ; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_mount->mnt_iosize_max) { + break; + } + + /* + * Shortcut some checks and try to avoid buffers that + * would block in the lock. The same checks have to + * be made again after we officially get the buffer. + */ + if ((tbp = incore(vp, lbn + i)) != NULL) { + if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) + break; + BUF_UNLOCK(tbp); + + for (j = 0; j < tbp->b_npages; j++) { + if (tbp->b_pages[j]->valid) + break; + } + + if (j != tbp->b_npages) + break; + + if (tbp->b_bcount != size) + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0); + + /* + * Stop scanning if the buffer is fully valid + * (marked B_CACHE), or locked (may be doing a + * background write), or if the buffer is not + * VMIO backed. The clustering code can only deal + * with VMIO-backed buffers. + */ + if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + /* + * The buffer must be completely invalid in order to + * take part in the cluster. If it is partially valid + * then we stop. + */ + for (j = 0;j < tbp->b_npages; j++) { + if (tbp->b_pages[j]->valid) + break; + } + if (j != tbp->b_npages) { + bqrelse(tbp); + break; + } + + /* + * Set a read-ahead mark as appropriate + */ + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + + /* + * Set the buffer up for an async read (XXX should + * we do this only if we do not wind up brelse()ing?). + * Set the block number if it isn't set, otherwise + * if it is make sure it matches the block number we + * expect. + */ + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_READ; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } + } + /* + * XXX fbp from caller may not be B_ASYNC, but we are going + * to biodone() it in cluster_callback() anyway + */ + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + /* + * XXX shouldn't this be += size for both, like in + * cluster_wbuild()? + * + * Don't inherit tbp->b_bufsize as it may be larger due to + * a non-page-aligned size. Instead just aggregate using + * 'size'. + */ + if (tbp->b_bcount != size) + printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); + if (tbp->b_bufsize != size) + printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); + bp->b_bcount += size; + bp->b_bufsize += size; + } + + /* + * Fully valid pages in the cluster are already good and do not need + * to be re-read from disk. 
Replace the page with bogus_page + */ + for (j = 0; j < bp->b_npages; j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) { + bp->b_pages[j] = bogus_page; + } + } + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. + */ +void +cluster_callback(bp) + struct buf *bp; +{ + struct buf *nbp, *tbp; + int error = 0; + + GIANT_REQUIRED; + + /* + * Must propogate errors to all the components. + */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_ioflags |= BIO_ERROR; + tbp->b_error = error; + } else { + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + tbp->b_flags &= ~B_INVAL; + tbp->b_ioflags &= ~BIO_ERROR; + /* + * XXX the bdwrite()/bqrelse() issued during + * cluster building clears B_RELBUF (see bqrelse() + * comment). If direct I/O was specified, we have + * to restore it here to allow the buffer and VM + * to be freed. + */ + if (tbp->b_flags & B_DIRECT) + tbp->b_flags |= B_RELBUF; + } + bufdone(tbp); + } + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * cluster_wbuild_wb: + * + * Implement modified write build for cluster. + * + * write_behind = 0 write behind disabled + * write_behind = 1 write behind normal (default) + * write_behind = 2 write behind backed-off + */ + +static __inline int +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) +{ + int r = 0; + + switch(write_behind) { + case 2: + if (start_lbn < len) + break; + start_lbn -= len; + /* fall through */ + case 1: + r = cluster_wbuild(vp, size, start_lbn, len); + /* fall through */ + default: + /* fall through */ + break; + } + return(r); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(bp, filesize, seqcount) + struct buf *bp; + u_quad_t filesize; + int seqcount; +{ + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + vp = bp->b_vp; + if (vp->v_type == VREG) { + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. 
+ * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + * + * Change to algorithm: only push previous cluster if + * it was sequential from the point of view of the + * seqcount heuristic, otherwise leave the buffer + * intact so we can potentially optimize the I/O + * later on in the buf_daemon or update daemon + * flush. + */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async && seqcount > 0) { + cluster_wbuild_wb(vp, lblocksize, + vp->v_cstart, cursize); + } + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster + * if *really* writing sequentially + * in the logical file (seqcount > 1), + * otherwise delay it in the hopes that + * the low level disk driver can + * optimize the write ordering. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + if (seqcount > 1) { + cluster_wbuild_wb(vp, + lblocksize, vp->v_cstart, + cursize); + } + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. + */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out if seqcount tells us we + * are operating sequentially, otherwise let the buf or + * update daemon handle it. + */ + bdwrite(bp); + if (seqcount > 1) + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); + } else { + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + } + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(vp, size, start_lbn, len) + struct vnode *vp; + long size; + daddr_t start_lbn; + int len; +{ + struct buf *bp, *tbp; + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + + GIANT_REQUIRED; + + while (len > 0) { + s = splbio(); + /* + * If the buffer is not delayed-write (i.e. 
dirty), or it + * is delayed-write but either locked or inval, it cannot + * partake in the clustered write. + */ + if (((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || + BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { + ++start_lbn; + --len; + splx(s); + continue; + } + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + splx(s); + + /* + * Extra memory in the buffer, punt on this buffer. + * XXX we could handle this in most cases, but we would + * have to push the extra memory down to after our max + * possible cluster size and then potentially pull it back + * up if the cluster was terminated prematurely--too much + * hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != + (B_CLUSTEROK | B_VMIO)) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_magic = tbp->b_magic; + bp->b_op = tbp->b_op; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) + bp->b_wcred = crhold(tbp->b_wcred); + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags |= B_CLUSTER | + (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + s = splbio(); + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } + + /* + * If it IS in core, but has different + * characteristics, or is locked (which + * means it could be undergoing a background + * I/O or be in a weird state), then don't + * cluster with it. + */ + if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | + B_INVAL | B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || + (tbp->b_flags & B_LOCKED) || + tbp->b_wcred != bp->b_wcred || + BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { + splx(s); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { + BUF_UNLOCK(tbp); + splx(s); + break; + } + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. + */ + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + splx(s); + } /* end of code for non-first buffers only */ + /* check for latent dependencies to be handled */ + if ((LIST_FIRST(&tbp->b_dep)) != NULL) + buf_start(tbp); + /* + * If the IO is via the VM then we do some + * special VM hackery (yuck). 
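/*
 * Editor's sketch (illustrative, not from the original source): the scan
 * above admits buffer number i into the cluster only when it is
 * physically contiguous with the cluster built so far and the combined
 * page count still fits within the mount's maximum I/O size.  The same
 * admission test over plain integers (dbsize is the buffer size in
 * DEV_BSIZE blocks, as computed by btodb(size) in cluster_wbuild()):
 */
#include <stdbool.h>

static bool
fits_in_cluster(long cluster_blkno, int i, long dbsize, long buf_blkno,
    int cluster_pages, int buf_pages, int max_iosize, int page_size)
{
	if (buf_blkno != cluster_blkno + dbsize * i)
		return (false);		/* not physically contiguous */
	if (cluster_pages + buf_pages > max_iosize / page_size)
		return (false);		/* would exceed mnt_iosize_max */
	return (true);
}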
Since the buffer's + * block size may not be page-aligned it is possible + * for a page to be shared between two buffers. We + * have to get rid of the duplication when building + * the cluster. + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + if (i != 0) { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (m->flags & PG_BUSY) { + bqrelse(tbp); + goto finishcluster; + } + } + } + + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + s = splbio(); + bundirty(tbp); + tbp->b_flags &= ~B_DONE; + tbp->b_ioflags &= ~BIO_ERROR; + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_WRITE; + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(vp, last_bp) + struct vnode *vp; + struct buf *last_bp; +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, len; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c new file mode 100644 index 0000000..20d9b90 --- /dev/null +++ b/sys/kern/vfs_conf.c @@ -0,0 +1,396 @@ +/*- + * Copyright (c) 1999 Michael Smith + * All rights reserved. + * Copyright (c) 1999 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Locate and mount the root filesystem. + * + * The root filesystem is detailed in the kernel environment variable + * vfs.root.mountfrom, which is expected to be in the general format + * + * <vfsname>:[<path>] + * vfsname := the name of a VFS known to the kernel and capable + * of being mounted as root + * path := disk device name or other data used by the filesystem + * to locate its physical store + * + */ + +#include "opt_rootdevname.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/reboot.h> +#include <sys/diskslice.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/proc.h> + +#include "opt_ddb.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#include <paths.h> + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); + +#define ROOTNAME "root_device" + +/* + * The vnode of the system's root (/ in the filesystem, without chroot + * active.) + */ +struct vnode *rootvnode; + +/* + * The root specifiers we will try if RB_CDROM is specified. + */ +static char *cdrom_rootdevnames[] = { + "cd9660:cd0a", + "cd9660:acd0a", + "cd9660:wcd0a", + NULL +}; + +static int vfs_mountroot_try(char *mountfrom); +static int vfs_mountroot_ask(void); +static void gets(char *cp); + +/* legacy find-root code */ +char *rootdevnames[2] = {NULL, NULL}; +static int setrootbyname(char *name); +dev_t rootdev = NODEV; + +/* + * Find and mount the root filesystem + */ +void +vfs_mountroot(void *foo __unused) +{ + char *cp; + int i, error; + + /* + * The root filesystem information is compiled in, and we are + * booted with instructions to use it. + */ +#ifdef ROOTDEVNAME + if ((boothowto & RB_DFLTROOT) && + !vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + /* + * We are booted with instructions to prompt for the root filesystem, + * or to use the compiled-in default when it doesn't exist. + */ + if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) { + if (!vfs_mountroot_ask()) + return; + } + + /* + * We've been given the generic "use CDROM as root" flag. This is + * necessary because one media may be used in many different + * devices, so we need to search for them. + */ + if (boothowto & RB_CDROM) { + for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { + if (!vfs_mountroot_try(cdrom_rootdevnames[i])) + return; + } + } + + /* + * Try to use the value read by the loader from /etc/fstab, or + * supplied via some other means. This is the preferred + * mechanism. + */ + if ((cp = getenv("vfs.root.mountfrom")) != NULL) { + error = vfs_mountroot_try(cp); + freeenv(cp); + if (!error) + return; + } + + /* + * Try values that may have been computed by the machine-dependant + * legacy code. 
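/*
 * Editor's sketch (illustrative only): vfs_mountroot_try(), defined
 * below, splits a root specifier of the form "<vfsname>:<path>"
 * (e.g. "ufs:da0s1a") with a scanf pattern it builds at run time.  The
 * standalone program below performs the same parse outside the kernel;
 * the 15/127 field widths are stand-ins for MFSNAMELEN/MNAMELEN-sized
 * buffers and are assumptions, not values taken from this file.
 */
#include <stdio.h>

int
main(void)
{
	char patt[32], vfsname[16], path[128];

	/* widths are one less than the buffers to leave room for the NUL */
	snprintf(patt, sizeof(patt), "%%%d[a-z0-9]:%%%ds", 15, 127);
	vfsname[0] = path[0] = '\0';
	if (sscanf("ufs:da0s1a", patt, vfsname, path) >= 1)
		printf("vfs=%s path=%s\n", vfsname, path);
	return (0);
}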
+ */ + if (!vfs_mountroot_try(rootdevnames[0])) + return; + if (!vfs_mountroot_try(rootdevnames[1])) + return; + + /* + * If we have a compiled-in default, and haven't already tried it, try + * it now. + */ +#ifdef ROOTDEVNAME + if (!(boothowto & RB_DFLTROOT)) + if (!vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + + /* + * Everything so far has failed, prompt on the console if we haven't + * already tried that. + */ + if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask()) + return; + panic("Root mount failed, startup aborted."); +} + +/* + * Mount (mountfrom) as the root filesystem. + */ +static int +vfs_mountroot_try(char *mountfrom) +{ + struct mount *mp; + char *vfsname, *path; + int error; + char patt[32]; + int s; + + vfsname = NULL; + path = NULL; + mp = NULL; + error = EINVAL; + + if (mountfrom == NULL) + return(error); /* don't complain */ + + s = splcam(); /* Overkill, but annoying without it */ + printf("Mounting root from %s\n", mountfrom); + splx(s); + + /* parse vfs name and path */ + vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); + path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); + vfsname[0] = path[0] = 0; + sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); + if (sscanf(mountfrom, patt, vfsname, path) < 1) + goto done; + + /* allocate a root mount */ + error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME, + &mp); + if (error != 0) { + printf("Can't allocate root mount for filesystem '%s': %d\n", + vfsname, error); + goto done; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* do our best to set rootdev */ + if ((path[0] != 0) && setrootbyname(path)) + printf("setrootbyname failed\n"); + + /* If the root device is a type "memory disk", mount RW */ + if (rootdev != NODEV && devsw(rootdev) && + (devsw(rootdev)->d_flags & D_MEMDISK)) + mp->mnt_flag &= ~MNT_RDONLY; + + /* + * Set the mount path to be something useful, because the + * filesystem code isn't responsible now for initialising + * f_mntonname unless they want to override the default + * (which is `path'.) + */ + strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN); + + error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread); + +done: + if (vfsname != NULL) + free(vfsname, M_MOUNT); + if (path != NULL) + free(path, M_MOUNT); + if (error != 0) { + if (mp != NULL) { + vfs_unbusy(mp, curthread); + free(mp, M_MOUNT); + } + printf("Root mount failed: %d\n", error); + } else { + + /* register with list of mounted filesystems */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + + /* sanity check system clock against root filesystem timestamp */ + inittodr(mp->mnt_time); + vfs_unbusy(mp, curthread); + } + return(error); +} + +/* + * Spin prompting on the console for a suitable root filesystem + */ +static int +vfs_mountroot_ask(void) +{ + char name[128]; + int i; + dev_t dev; + + for(;;) { + printf("\nManual root filesystem specification:\n"); + printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n"); +#if defined(__i386__) || defined(__ia64__) + printf(" eg. ufs:da0s1a\n"); +#else + printf(" eg. ufs:da0a\n"); +#endif + printf(" ? 
List valid disk boot devices\n"); + printf(" <empty line> Abort manual input\n"); + printf("\nmountroot> "); + gets(name); + if (name[0] == 0) + return(1); + if (name[0] == '?') { + printf("Possibly valid devices for 'ufs' root:\n"); + for (i = 0; i < NUMCDEVSW; i++) { + dev = makedev(i, 0); + if (devsw(dev) != NULL) + printf(" \"%s\"", devsw(dev)->d_name); + } + printf("\n"); + continue; + } + if (!vfs_mountroot_try(name)) + return(0); + } +} + +/* + * Local helper function for vfs_mountroot_ask. + */ +static void +gets(char *cp) +{ + char *lp; + int c; + + lp = cp; + for (;;) { + printf("%c", c = cngetc() & 0177); + switch (c) { + case -1: + case '\n': + case '\r': + *lp++ = '\0'; + return; + case '\b': + case '\177': + if (lp > cp) { + printf(" \b"); + lp--; + } + continue; + case '#': + lp--; + if (lp < cp) + lp = cp; + continue; + case '@': + case 'u' & 037: + lp = cp; + printf("%c", '\n'); + continue; + default: + *lp++ = c; + } + } +} + +/* + * Convert a given name to the dev_t of the disk-like device + * it refers to. + */ +dev_t +getdiskbyname(char *name) { + char *cp; + dev_t dev; + + cp = name; + if (!bcmp(cp, "/dev/", 5)) + cp += 5; + + dev = NODEV; + EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev); + return (dev); +} + +/* + * Set rootdev to match (name), given that we expect it to + * refer to a disk-like device. + */ +static int +setrootbyname(char *name) +{ + dev_t diskdev; + + diskdev = getdiskbyname(name); + if (diskdev != NODEV) { + rootdev = diskdev; + return (0); + } + + return (1); +} + +/* Show the dev_t for a disk specified by name */ +#ifdef DDB +DB_SHOW_COMMAND(disk, db_getdiskbyname) +{ + dev_t dev; + + if (modif[0] == '\0') { + db_error("usage: show disk/devicename"); + return; + } + dev = getdiskbyname(modif); + if (dev != NODEV) + db_printf("dev_t = %p\n", dev); + else + db_printf("No disk device matched.\n"); +} +#endif diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c new file mode 100644 index 0000000..6bfe085 --- /dev/null +++ b/sys/kern/vfs_default.c @@ -0,0 +1,845 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/poll.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> + +static int vop_nolookup(struct vop_lookup_args *); +static int vop_nostrategy(struct vop_strategy_args *); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + */ + +vop_t **default_vnodeop_p; +static struct vnodeopv_entry_desc default_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_advlock_desc, (vop_t *) vop_einval }, + { &vop_bmap_desc, (vop_t *) vop_stdbmap }, + { &vop_close_desc, (vop_t *) vop_null }, + { &vop_createvobject_desc, (vop_t *) vop_stdcreatevobject }, + { &vop_destroyvobject_desc, (vop_t *) vop_stddestroyvobject }, + { &vop_fsync_desc, (vop_t *) vop_null }, + { &vop_getpages_desc, (vop_t *) vop_stdgetpages }, + { &vop_getvobject_desc, (vop_t *) vop_stdgetvobject }, + { &vop_inactive_desc, (vop_t *) vop_stdinactive }, + { &vop_ioctl_desc, (vop_t *) vop_enotty }, + { &vop_islocked_desc, (vop_t *) vop_noislocked }, + { &vop_lease_desc, (vop_t *) vop_null }, + { &vop_lock_desc, (vop_t *) vop_nolock }, + { &vop_lookup_desc, (vop_t *) vop_nolookup }, + { &vop_open_desc, (vop_t *) vop_null }, + { &vop_pathconf_desc, (vop_t *) vop_einval }, + { &vop_putpages_desc, (vop_t *) vop_stdputpages }, + { &vop_poll_desc, (vop_t *) vop_nopoll }, + { &vop_readlink_desc, (vop_t *) vop_einval }, + { &vop_revoke_desc, (vop_t *) vop_revoke }, + { &vop_strategy_desc, (vop_t *) vop_nostrategy }, + { &vop_unlock_desc, (vop_t *) vop_nounlock }, + { NULL, NULL } +}; + +static struct vnodeopv_desc default_vnodeop_opv_desc = + { &default_vnodeop_p, default_vnodeop_entries }; + +VNODEOP_SET(default_vnodeop_opv_desc); + +/* + * Series of placeholder functions for various error returns for + * VOPs. + */ + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +/* + * Used to make a defined VOP fall back to the default VOP. 
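/*
 * Editor's sketch (hypothetical filesystem, not part of this commit): a
 * filesystem normally registers its own vnodeopv_entry_desc table and
 * routes every operation it does not implement through vop_defaultop
 * (defined just below), which in turn dispatches into the
 * default_vnodeop_p table above.  The myfs_* names are placeholders.
 */
static int myfs_lookup(struct vop_lookup_args *ap);
static int myfs_strategy(struct vop_strategy_args *ap);

vop_t **myfs_vnodeop_p;
static struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_defaultop },
	{ &vop_lookup_desc,	(vop_t *) myfs_lookup },
	{ &vop_strategy_desc,	(vop_t *) myfs_strategy },
	{ NULL, NULL }
};
static struct vnodeopv_desc myfs_vnodeop_opv_desc =
	{ &myfs_vnodeop_p, myfs_vnodeop_entries };

VNODEOP_SET(myfs_vnodeop_opv_desc);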
+ */ +int +vop_defaultop(struct vop_generic_args *ap) +{ + + return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap)); +} + +/* + * Helper function to panic on some bad VOPs in some filesystems. + */ +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); +} + +/* + * vop_std<something> and vop_no<something> are default functions for use by + * filesystems that need the "default reasonable" implementation for a + * particular operation. + * + * The documentation for the operations they implement exists (if it exists) + * in the VOP_<SOMETHING>(9) manpage (all uppercase). + */ + +/* + * Default vop for filesystems that do not support name lookup + */ +static int +vop_nolookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * vop_nostrategy: + * + * Strategy routine for VFS devices that have none. + * + * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy + * routine. Typically this is done for a BIO_READ strategy call. + * Typically B_INVAL is assumed to already be clear prior to a write + * and should not be cleared manually unless you just made the buffer + * invalid. BIO_ERROR should be cleared either way. + */ + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vprint("", ap->a_vp); + vprint("", ap->a_bp->b_vp); + ap->a_bp->b_ioflags |= BIO_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + bufdone(ap->a_bp); + return (EOPNOTSUPP); +} + +/* + * vop_stdpathconf: + * + * Standard implementation of POSIX pathconf, to get information about limits + * for a filesystem. + * Override per filesystem for the case where the filesystem has smaller + * limits. + */ +int +vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + */ +int +vop_stdlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + +#ifndef DEBUG_LOCKS + return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, ap->a_td)); +#else + return (debuglockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, + ap->a_td, "vop_stdlock", vp->filename, vp->line)); +#endif +} + +/* See above. */ +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock, + ap->a_td)); +} + +/* See above. 
*/ +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + return (lockstatus(&ap->a_vp->v_lock, ap->a_td)); +} + +/* Mark the vnode inactive */ +int +vop_stdinactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + VOP_UNLOCK(ap->a_vp, 0, ap->a_td); + return (0); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + /* + * Return true for read/write. If the user asked for something + * special, return POLLNVAL, so that clients have a way of + * determining reliably whether or not the extended + * functionality is present without hard-coding knowledge + * of specific filesystem implementations. + */ + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + if (ap->a_events & ~POLLSTANDARD) + return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_sharedlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: +#ifdef DEBUG_VFS_LOCKS + /* + * Normally, we use shared locks here, but that confuses + * the locking assertions. + */ + vnflags = LK_EXCLUSIVE; + break; +#endif + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; +#ifndef DEBUG_LOCKS + return (lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td)); +#else + return (debuglockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td, + "vop_sharedlock", vp->filename, vp->line)); +#endif +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. 
+ * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_nolock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ +#ifdef notyet + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; + return(lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td)); +#else /* for now */ + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + mtx_unlock(&ap->a_vp->v_interlock); + return (0); +#endif +} + +/* + * Do the inverse of vop_nolock, handling the interlock in a compatible way. + */ +int +vop_nounlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + mtx_unlock(&ap->a_vp->v_interlock); + return (0); +} + +/* + * Return whether or not the node is in use. + */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + return (0); +} + +/* + * Return our mount point, as we will take charge of the writes. + */ +int +vop_stdgetwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + + *(ap->a_mpp) = ap->a_vp->v_mount; + return (0); +} + +/* Create the VM system backing object for this vnode */ +int +vop_stdcreatevobject(ap) + struct vop_createvobject_args /* { + struct vnode *vp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct ucred *cred = ap->a_cred; + struct thread *td = ap->a_td; + struct vattr vat; + vm_object_t object; + int error = 0; + + GIANT_REQUIRED; + + if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) + return (0); + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG || vp->v_type == VDIR) { + if ((error = VOP_GETATTR(vp, &vat, cred, td)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (devsw(vp->v_rdev) != NULL) { + /* + * This simply allocates the biggest object possible + * for a disk vnode. 
This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } else { + goto retn; + } + /* + * Dereference the reference we just created. This assumes + * that the object is associated with the vp. + */ + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, td); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + goto retry; + } + } + + KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object")); + vp->v_flag |= VOBJBUF; + +retn: + return (error); +} + +/* Destroy the VM system object associated with this vnode */ +int +vop_stddestroyvobject(ap) + struct vop_destroyvobject_args /* { + struct vnode *vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vm_object_t obj = vp->v_object; + + GIANT_REQUIRED; + + if (vp->v_object == NULL) + return (0); + + if (obj->ref_count == 0) { + /* + * vclean() may be called twice. The first time + * removes the primary reference to the object, + * the second time goes one further and is a + * special-case to terminate the object. + * + * don't double-terminate the object + */ + if ((obj->flags & OBJ_DEAD) == 0) + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + return (0); +} + +/* + * Return the underlying VM object. This routine may be called with or + * without the vnode interlock held. If called without, the returned + * object is not guarenteed to be valid. The syncer typically gets the + * object without holding the interlock in order to quickly test whether + * it might be dirty before going heavy-weight. vm_object's use zalloc + * and thus stable-storage, so this is safe. + */ +int +vop_stdgetvobject(ap) + struct vop_getvobject_args /* { + struct vnode *vp; + struct vm_object **objpp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vm_object **objpp = ap->a_objpp; + + if (objpp) + *objpp = vp->v_object; + return (vp->v_object ? 0 : EINVAL); +} + +/* XXX Needs good comment and VOP_BMAP(9) manpage */ +int +vop_stdbmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + return (0); +} + +/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ +int +vop_stdgetpages(ap) + struct vop_getpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_reqpage; + vm_ooffset_t a_offset; + } */ *ap; +{ + + return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, + ap->a_count, ap->a_reqpage); +} + +/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ +int +vop_stdputpages(ap) + struct vop_putpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; + vm_ooffset_t a_offset; + } */ *ap; +{ + + return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_sync, ap->a_rtvals); +} + + + +/* + * vfs default ops + * used to fill the vfs function table to get reasonable default return values. 
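/*
 * Editor's sketch (hypothetical, member names assumed rather than taken
 * from this file): a filesystem plugs the vfs_std* routines defined
 * below into any struct vfsops slot it does not care about, so callers
 * get a harmless 0 or EOPNOTSUPP instead of an unimplemented operation.
 */
static struct vfsops myfs_vfsops = {
	.vfs_mount =	vfs_stdmount,
	.vfs_unmount =	vfs_stdunmount,
	.vfs_root =	vfs_stdroot,
	.vfs_statfs =	vfs_stdstatfs,
	.vfs_sync =	vfs_stdsync,
	.vfs_vget =	vfs_stdvget,
	.vfs_init =	vfs_stdinit,
};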
+ */ +int +vfs_stdmount (mp, path, data, ndp, td) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct thread *td; +{ + return (0); +} + +int +vfs_stdunmount (mp, mntflags, td) + struct mount *mp; + int mntflags; + struct thread *td; +{ + return (0); +} + +int +vfs_stdroot (mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdstatfs (mp, sbp, td) + struct mount *mp; + struct statfs *sbp; + struct thread *td; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdvptofh (vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdstart (mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + return (0); +} + +int +vfs_stdquotactl (mp, cmds, uid, arg, td) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct thread *td; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdsync (mp, waitfor, cred, td) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct thread *td; +{ + return (0); +} + +int +vfs_stdvget (mp, ino, flags, vpp) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdfhtovp (mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdinit (vfsp) + struct vfsconf *vfsp; +{ + return (0); +} + +int +vfs_stduninit (vfsp) + struct vfsconf *vfsp; +{ + return(0); +} + +int +vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td) + struct mount *mp; + int cmd; + struct vnode *filename_vp; + int attrnamespace; + const char *attrname; + struct thread *td; +{ + return(EOPNOTSUPP); +} + +/* end of vfs default ops */ diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new file mode 100644 index 0000000..ec135bd --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,400 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mount.h> +#include <net/radix.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vnode.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void vfs_free_addrlist(struct netexport *nep); +static int vfs_free_netcred(struct radix_node *rn, void *w); +static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp); + +/* + * Network address lookup element + */ +struct netcred { + struct radix_node netc_rnodes[2]; + int netc_exflags; + struct ucred netc_anon; +}; + +/* + * Network export information + */ +struct netexport { + struct netcred ne_defexported; /* Default export */ + struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */ +}; + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + /* + * XXX: This routine converts from a `struct xucred' + * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This + * operation is questionable; for example, what should be done + * with fields like cr_uidinfo and cr_prison? Currently, this + * routine does not touch them (leaves them as NULL). 
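/*
 * Editor's sketch (illustrative, not from the original source): the
 * per-mount export list built below keys each struct netcred on a
 * network address plus optional mask held in a radix tree, so matching a
 * client boils down to "client & mask == net & mask".  The standalone
 * helper shows that comparison for a plain IPv4 address; the kernel
 * delegates the real lookup to the routing radix code (rnh_addaddr and
 * rnh_matchaddr).
 */
#include <stdbool.h>
#include <stdint.h>

static bool
export_matches(uint32_t client, uint32_t net, uint32_t mask)
{
	if (mask == 0)
		return (client == net);		/* host entry: exact match */
	return ((client & mask) == (net & mask));
}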
+ */ + if (argp->ex_anon.cr_version != XUCRED_VERSION) + return (EINVAL); + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + bzero(&np->netc_anon, sizeof(np->netc_anon)); + np->netc_anon.cr_uid = argp->ex_anon.cr_uid; + np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; + bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, + sizeof(np->netc_anon.cr_groups)); + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + + if (argp->ex_addrlen > MLEN) + return (EINVAL); + + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) (saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) (saddr, smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + bzero(&np->netc_anon, sizeof(np->netc_anon)); + np->netc_anon.cr_uid = argp->ex_anon.cr_uid; + np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; + bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, + sizeof(np->netc_anon.cr_groups)); + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* Helper for vfs_free_addrlist. */ +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free(rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh); + free(rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +/* + * High level function to manipulate export options on a mount point + * and the passed in netexport. 
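/*
 * Editor's sketch (hypothetical caller, names assumed): filesystems do
 * not build export lists themselves; their mount-update path simply
 * forwards the export_args embedded in their mount arguments to
 * vfs_export(), defined below.  ex_flags carries MNT_EXPORTED or
 * MNT_DELEXPORT to publish or withdraw the export, and ex_addr/ex_mask
 * are still user pointers at this point -- vfs_hang_addrlist() copyin()s
 * them.
 */
static int
myfs_update_export(struct mount *mp, struct export_args *uea)
{
	/* uea points at the export_args copied in with the fs mount args */
	return (vfs_export(mp, uea));
}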
+ * Struct export_args *argp is the variable used to twiddle options, + * the structure is described in sys/mount.h + */ +int +vfs_export(mp, argp) + struct mount *mp; + struct export_args *argp; +{ + struct netexport *nep; + int error; + + nep = mp->mnt_export; + if (argp->ex_flags & MNT_DELEXPORT) { + if (nep == NULL) + return (ENOENT); + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_export = NULL; + free(nep, M_MOUNT); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (nep == NULL) { + nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO); + mp->mnt_export = nep; + } + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +/* + * Used by the filesystems to determine if a given network address + * (passed in 'nam') is present in thier exports list, returns a pointer + * to struct netcred so that the filesystem can examine it for + * access rights (read/write/etc). + */ +struct netcred * +vfs_export_lookup(mp, nam) + register struct mount *mp; + struct sockaddr *nam; +{ + struct netexport *nep; + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + nep = mp->mnt_export; + if (nep == NULL) + return (NULL); + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. 
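/*
 * Editor's sketch (hypothetical consumer): a file-handle server asks the
 * filesystem whether a client address is covered by an export via
 * VFS_CHECKEXP(), which vfs_stdcheckexp() below implements on top of
 * vfs_export_lookup().  MNT_EXRDONLY is used only as an example flag;
 * the surrounding NFS plumbing is elided.
 */
static int
client_may_write(struct mount *mp, struct sockaddr *client)
{
	struct ucred *anon;
	int exflags, error;

	error = VFS_CHECKEXP(mp, client, &exflags, &anon);
	if (error != 0)
		return (0);		/* not exported to this client */
	return ((exflags & MNT_EXRDONLY) == 0);
}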
+ */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)(saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * XXX: This comment comes from the deprecated ufs_check_export() + * XXX: and may not entirely apply, but lacking something better: + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Verify that a host should have access to a filesystem. + */ + +int +vfs_stdcheckexp(mp, nam, extflagsp, credanonp) + struct mount *mp; + struct sockaddr *nam; + int *extflagsp; + struct ucred **credanonp; +{ + struct netcred *np; + + np = vfs_export_lookup(mp, nam); + if (np == NULL) + return (EACCES); + *extflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + return (0); +} + diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c new file mode 100644 index 0000000..1244e54 --- /dev/null +++ b/sys/kern/vfs_extattr.c @@ -0,0 +1,4862 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysent.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/dirent.h> +#include <sys/extattr.h> +#include <sys/jail.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static int change_dir(struct nameidata *ndp, struct thread *td); +static void checkdirs(struct vnode *olddp, struct vnode *newdp); +static int chroot_refuse_vdir_fds(struct filedesc *fdp); +static int getutimes(const struct timeval *, struct timespec *); +static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); +static int setfmode(struct thread *td, struct vnode *, int); +static int setfflags(struct thread *td, struct vnode *, int); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int vfs_nmount(struct thread *td, int, struct uio *); + +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +/* ARGSUSED */ +int +nmount(td, uap) + struct thread *td; + struct nmount_args /* { + syscallarg(struct iovec *) iovp; + syscallarg(unsigned int) iovcnt; + syscallarg(int) flags; + } */ *uap; +{ + struct uio auio; + struct iovec *iov, *needfree; + struct iovec aiov[UIO_SMALLIOV]; + unsigned int i; + int error; + u_int iovlen, iovcnt; + + iovcnt = SCARG(uap, iovcnt); + iovlen = iovcnt * sizeof (struct iovec); + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV)) + return (EINVAL); + + if (iovcnt > UIO_SMALLIOV) { + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_USERSPACE; + if ((error = copyin(uap->iovp, iov, iovlen))) + goto finish; + + for (i = 0; i < iovcnt; i++) { + if (iov->iov_len > MMAXOPTIONLEN) { + error = EINVAL; + goto finish; + } + iov++; + } + error = vfs_nmount(td, SCARG(uap, flags), &auio); +finish: + if (needfree != NULL) + free(needfree, M_TEMP); + return (error); +} + +/* + * Release all resources related to the + * mount options. 
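/*
 * Editor's sketch (userland caller, illustrative): nmount() takes its
 * options as name/value iovec pairs, which is why the code above insists
 * on an even iovcnt of at least four -- the "fstype" and "fspath"
 * options are mandatory (see vfs_nmount() below).  The "from" option and
 * the "ufs" type are placeholders, not something this file defines.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <stdint.h>
#include <string.h>

static void
set_pair(struct iovec *iov, int i, const char *s)
{
	iov[i].iov_base = (void *)(uintptr_t)s;
	iov[i].iov_len = strlen(s) + 1;		/* length includes the NUL */
}

int
mount_readonly(const char *dev, const char *dir)
{
	struct iovec iov[6];

	set_pair(iov, 0, "fstype");	set_pair(iov, 1, "ufs");
	set_pair(iov, 2, "fspath");	set_pair(iov, 3, dir);
	set_pair(iov, 4, "from");	set_pair(iov, 5, dev);
	return (nmount(iov, 6, MNT_RDONLY));
}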
+ */ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); + } + free(opts, M_MOUNT); +} + +int +kernel_mount(iovp, iovcnt, flags) + struct iovec *iovp; + unsigned int iovcnt; + int flags; +{ + struct uio auio; + int error; + + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) + return (EINVAL); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + return (error); +} + +int +kernel_vmount(int flags, ...) +{ + struct iovec *iovp; + struct uio auio; + va_list ap; + unsigned int iovcnt, iovlen, len; + const char *cp; + char *buf, *pos; + size_t n; + int error, i; + + len = 0; + va_start(ap, flags); + for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++) + len += strlen(cp) + 1; + va_end(ap); + + if (iovcnt < 4 || iovcnt & 1) + return (EINVAL); + + iovlen = iovcnt * sizeof (struct iovec); + MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK); + MALLOC(buf, char *, len, M_MOUNT, M_WAITOK); + pos = buf; + va_start(ap, flags); + for (i = 0; i < iovcnt; i++) { + cp = va_arg(ap, const char *); + copystr(cp, pos, len - (pos - buf), &n); + iovp[i].iov_base = pos; + iovp[i].iov_len = n; + pos += n; + } + va_end(ap); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + FREE(iovp, M_MOUNT); + FREE(buf, M_MOUNT); + return (error); +} + +/* + * vfs_nmount(): actually attempt a filesystem mount. + */ +static int +vfs_nmount(td, fsflags, fsoptions) + struct thread *td; + int fsflags; /* Flags common to all filesystems. */ + struct uio *fsoptions; /* Options local to the filesystem. */ +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + struct vfsoptlist *optlist; + char *fstype, *fspath; + int error, flag = 0, kern_flag = 0; + int fstypelen, fspathlen; + struct vattr va; + struct nameidata nd; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + goto bad; + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { + error = ENAMETOOLONG; + goto bad; + } + + if (usermount == 0) { + error = suser(td); + if (error) + goto bad; + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + goto bad; + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. 
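/*
 * Editor's sketch (standalone restatement, not taken verbatim from the
 * source): the checks around this point form a small policy ladder --
 * non-root mounts are allowed only when the vfs.usermount sysctl is set,
 * exporting always requires root, and any non-root mount is silently
 * degraded with MNT_NOSUID | MNT_NODEV.  The flag macros are assumed to
 * come from <sys/mount.h>.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <errno.h>

static int
mount_policy(int is_root, int usermount_on, int fsflags, int *resultflags)
{
	if (!is_root && !usermount_on)
		return (EPERM);		/* vfs.usermount is off */
	if (!is_root && (fsflags & MNT_EXPORTED))
		return (EPERM);		/* only root may export */
	*resultflags = fsflags;
	if (!is_root)
		*resultflags |= MNT_NOSUID | MNT_NODEV;
	return (0);
}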
+ */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + error = EINVAL; + goto bad; + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + error = EOPNOTSUPP; /* Needs translation */ + goto bad; + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + error = EBUSY; + goto bad; + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + goto bad; + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + goto bad; + } + if (vp->v_type != VDIR) { + vput(vp); + error = ENOTDIR; + goto bad; + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + goto bad; + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + goto bad; + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + error = ENODEV; + goto bad; + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. 
+ */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); + +update: + mp->mnt_optnew = optlist; + /* + * Check if the fs implements the new VFS_NMOUNT() + * function, since the new system call was used. + */ + if (mp->mnt_op->vfs_mount != NULL) { + printf("%s doesn't support the new mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + error = EOPNOTSUPP; + goto bad; + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_NMOUNT(mp, &nd, td); + if (!error) { + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + } + /* + * Prevent external consumers of mount + * options to read mnt_optnew. + */ + mp->mnt_optnew = NULL; + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) { + vrele(vp); + goto bad; + } + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + goto bad; + } + return (0); +bad: + vfs_freeopts(optlist); + return (error); +} + +/* + * Old Mount API. 
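+ * The traditional mount(2) interface passes filesystem-specific
+ * arguments through a single userland `data' pointer interpreted by
+ * VFS_MOUNT(), instead of the name/value option list used above.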
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(td, uap) + struct thread *td; + struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + char *fstype; + char *fspath; + int error; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK); + + /* + * vfs_mount() actually takes a kernel string for `type' and + * `path' now, so extract them. + */ + error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); + if (error) + goto finish; + error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); + if (error) + goto finish; + error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), + SCARG(uap, data)); +finish: + free(fstype, M_TEMP); + free(fspath, M_TEMP); + return (error); +} + +/* + * vfs_mount(): actually attempt a filesystem mount. + * + * This routine is designed to be a "generic" entry point for routines + * that wish to mount a filesystem. All parameters except `fsdata' are + * pointers into kernel space. `fsdata' is currently still a pointer + * into userspace. + */ +int +vfs_mount(td, fstype, fspath, fsflags, fsdata) + struct thread *td; + const char *fstype; + char *fspath; + int fsflags; + void *fsdata; +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, kern_flag = 0; + struct vattr va; + struct nameidata nd; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (usermount == 0) { + error = suser(td); + if (error) + return (error); + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + return (EBUSY); + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + return (error); + } + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + return (error); + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + return (error); + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return (error); + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); +update: + /* + * Check if the fs implements the old VFS_MOUNT() + * function, since the old system call was used. + */ + if (mp->mnt_op->vfs_mount == NULL) { + printf("%s doesn't support the old mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + return (EOPNOTSUPP); + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. 
+ */ + error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) + vrele(vp); + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new + * mount point. + */ +static void +checkdirs(olddp, newdp) + struct vnode *olddp, *newdp; +{ + struct filedesc *fdp; + struct proc *p; + int nrele; + + if (olddp->v_usecount == 1) + return; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + fdp = p->p_fd; + if (fdp == NULL) { + PROC_UNLOCK(p); + continue; + } + nrele = 0; + FILEDESC_LOCK(fdp); + if (fdp->fd_cdir == olddp) { + VREF(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + VREF(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + FILEDESC_UNLOCK(fdp); + PROC_UNLOCK(p); + while (nrele--) + vrele(olddp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(td, uap) + struct thread *td; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + + /* + * Don't allow unmounting the root filesystem. 
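+ * The vnode must also be the root of the mounted filesystem; both
+ * checks are made below.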
+ */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), td)); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + struct vnode *coveredvp, *fsrootvp; + int error; + int async_flag; + + mtx_lock(&mountlist_mtx); + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + mtx_unlock(&mountlist_mtx); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | + ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); + if (error) { + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + vn_start_write(NULL, &mp, V_WAIT); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + /* Move process cdir/rdir refs on fs root to underlying vnode. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(fsrootvp, mp->mnt_vnodecovered); + if (fsrootvp == rootvnode) { + vrele(rootvnode); + rootvnode = NULL; + } + vput(fsrootvp); + } + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) || + (flags & MNT_FORCE)) { + error = VFS_UNMOUNT(mp, flags, td); + } + vn_finished_write(mp); + if (error) { + /* Undo cdir/rdir and rootvnode changes made above. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(mp->mnt_vnodecovered, fsrootvp); + if (rootvnode == NULL) { + rootvnode = fsrootvp; + vref(rootvnode); + } + vput(fsrootvp); + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mtx_lock(&mountlist_mtx); + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, + &mountlist_mtx, td); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) + coveredvp->v_mountedhere = NULL; + mp->mnt_vfc->vfc_refcount--; + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); + lockdestroy(&mp->mnt_lock); + if (coveredvp != NULL) + vrele(coveredvp); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + if (mp->mnt_op->vfs_mount == NULL) + vfs_freeopts(mp->mnt_opt); + free(mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. 
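+ * Writable filesystems are flushed with MNT_NOWAIT so that a single
+ * slow or unresponsive filesystem does not hold up the others.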
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(td, uap) + struct thread *td; + struct sync_args *uap; +{ + struct mount *mp, *nmp; + int asyncflag; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((td != NULL) ? td->td_ucred : NOCRED), td); + mp->mnt_flag |= asyncflag; + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* XXX PRISON: could be per prison flag */ +static int prison_quotas; +#if 0 +SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); +#endif + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(td, uap) + struct thread *td; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + struct mount *mp; + int error; + struct nameidata nd; + + if (jailed(td->td_ucred) && !prison_quotas) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), td); + vn_finished_write(mp); + return (error); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(td, uap) + struct thread *td; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. 
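+ * This variant takes an open file descriptor instead of a path name.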
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(td, uap) + struct thread *td; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(td, uap) + struct thread *td; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout(sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
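+ * If the descriptor refers to a mount point, the loop below descends
+ * to the root of the filesystem mounted on it.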
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(td, uap) + struct thread *td; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct vnode *vp, *tdp, *vpold; + struct mount *mp; + struct file *fp; + int error; + + if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + fdrop(fp, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, td)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(td, uap) + struct thread *td; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; + fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(fdp) + struct filedesc *fdp; +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK(fdp); + for (fd = 0; fd < fdp->fd_nfiles ; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VDIR) { + FILEDESC_UNLOCK(fdp); + return (EPERM); + } + } + } + FILEDESC_UNLOCK(fdp); + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. + * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, ""); + +/* + * Change notion of root (``/'') directory. 
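+ * Whether a process holding open directory descriptors may chroot(2)
+ * is governed by the kern.chroot_allow_open_directories sysctl above.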
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(td, uap) + struct thread *td; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + FILEDESC_LOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); + error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; + fdp->fd_rdir = nd.ni_vp; + if (!fdp->fd_jdir) { + fdp->fd_jdir = nd.ni_vp; + VREF(fdp->fd_jdir); + } + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, td) + register struct nameidata *ndp; + struct thread *td; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, td); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(td, uap) + struct thread *td; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct vattr vat; + struct mount *mp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(td, &nfp, &indx); + if (error) + return (error); + fp = nfp; + FILEDESC_LOCK(fdp); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + td->td_dupfd = -indx - 1; /* XXX check for fdopen */ + /* + * Bump the ref count to prevent another process from closing + * the descriptor while we are blocked in vn_open() + */ + fhold(fp); + error = vn_open(&nd, &flags, cmode); + if (error) { + /* + * release our own reference + */ + fdrop(fp, td); + + /* + * handle special fdopen() case. bleh. dupfdopen() is + * responsible for dropping the old contents of ofiles[indx] + * if it succeeds. + */ + if ((error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { + td->td_retval[0] = indx; + return (0); + } + /* + * Clean up the descriptor, but only if another thread hadn't + * replaced or closed it. 
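+ * (The slot is cleared and our reference dropped only if ofiles[indx]
+ * still points at this file.)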
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + + if (error == ERESTART) + error = EINTR; + return (error); + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * There should be 2 references on the file, one from the descriptor + * table, and one for us. + * + * Handle the case where someone closed the file (via its file + * descriptor) while we were blocked. The end result should look + * like opening the file succeeded but it was immediately closed. + */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); + if (fp->f_count == 1) { + KASSERT(fdp->fd_ofiles[indx] != fp, + ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + vn_close(vp, flags & FMASK, fp->f_cred, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return 0; + } + + /* assert that vn_open created a backing object if one is needed */ + KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, + ("open: vmio vnode has no backing object after vn_open")); + + fp->f_data = vp; + fp->f_flag = flags & FMASK; + fp->f_ops = &vnops; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) + goto bad; + fp->f_flag |= FHASLOCK; + } + if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + VATTR_NULL(&vat); + vat.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETATTR(vp, &vat, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + if (error) + goto bad; + } + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(td, uap) + struct thread *td; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(td, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
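+ * Character and block device nodes require full superuser privilege;
+ * the remaining node types may also be created by the root of a prison.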
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(td, uap) + struct thread *td; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = suser(td); + break; + default: + error = suser_cred(td->td_ucred, PRISON_ROOT); + break; + } + if (error) + return (error); +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + vrele(vp); + error = EEXIST; + } else { + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(td, uap) + struct thread *td; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Make a hard file link. 
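+ * Hard links to directories are refused with EPERM, as POSIX requires.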
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(td, uap) + struct thread *td; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct nameidata nd; + int error; + + bwillwrite(); + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + } + vrele(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(td, uap) + struct thread *td; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) + goto out; +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == 0) + vput(nd.ni_vp); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + uma_zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. 
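+ * The name being undeleted must currently be a whiteout entry;
+ * anything else fails with EEXIST.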
+ */ +/* ARGSUSED */ +int +undelete(td, uap) + struct thread *td; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct mount *mp; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), td); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp) + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(td, uap) + struct thread *td; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. 
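+ * Only descriptors backed by a vnode can seek; anything else returns
+ * ESPIPE.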
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(td, uap) + struct thread *td; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = td->td_ucred; + struct file *fp; + struct vnode *vp; + struct vattr vattr; + off_t offset; + int error, noneg; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (ESPIPE); + } + vp = (struct vnode *)fp->f_data; + noneg = (vp->v_type != VCHR); + offset = SCARG(uap, offset); + switch (SCARG(uap, whence)) { + case L_INCR: + if (noneg && + (fp->f_offset < 0 || + (offset > 0 && fp->f_offset > OFF_MAX - offset))) + return (EOVERFLOW); + offset += fp->f_offset; + break; + case L_XTND: + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) + return (EOVERFLOW); + offset += vattr.va_size; + break; + case L_SET: + break; + default: + fdrop(fp, td); + return (EINVAL); + } + if (noneg && offset < 0) + return (EINVAL); + fp->f_offset = offset; + *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(td, uap) + struct thread *td; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(td, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions using passed credentials. + */ +static int +vn_access(vp, user_flags, cred, td) + struct vnode *vp; + int user_flags; + struct ucred *cred; + struct thread *td; +{ + int error, flags; + + /* Flags == 0 means only check for existence. */ + error = 0; + if (user_flags) { + flags = 0; + if (user_flags & R_OK) + flags |= VREAD; + if (user_flags & W_OK) + flags |= VWRITE; + if (user_flags & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, td); + } + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(td, uap) + struct thread *td; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct ucred *cred, *tmpcred; + register struct vnode *vp; + int error; + struct nameidata nd; + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared. This could also mess up socket + * buffer accounting which can run in an interrupt context. + * + * XXX - Depending on how "threads" are finally implemented, it + * may be better to explicitly pass the credential to namei() + * rather than to modify the potentially shared process structure. 
+ */ + cred = td->td_ucred; + tmpcred = crdup(cred); + tmpcred->cr_uid = cred->cr_ruid; + tmpcred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = tmpcred; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + goto out1; + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), tmpcred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out1: + td->td_ucred = cred; + crfree(tmpcred); + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int flags; +}; +#endif +int +eaccess(td, uap) + struct thread *td; + register struct eaccess_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct nameidata nd; + struct vnode *vp; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(td, uap) + struct thread *td; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(td, uap) + struct thread *td; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(td, uap) + struct thread *td; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + +#ifdef LOOKUP_SHARED + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, + UIO_USERSPACE, SCARG(uap, path), td); +#else + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); +#endif + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(td, uap) + struct thread *td; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Implementation of the NetBSD stat() function. + * XXX This should probably be collapsed with the FreeBSD version, + * as the differences are only due to vn_stat() clearing spares at + * the end of the structures. vn_stat could be split to avoid this, + * and thus collapse the following to close to zero code. + */ +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + bzero(nsb, sizeof *nsb); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_createtimespec = sb->st_createtimespec; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(td, uap) + struct thread *td; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +nlstat(td, uap) + struct thread *td; + register struct nlstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nstat nsb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(td, uap) + struct thread *td; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(td, uap) + struct thread *td; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, td->td_ucred); + } + vput(vp); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Common implementation code for chflags() and fchflags(). + */ +static int +setfflags(td, vp, flags) + struct thread *td; + struct vnode *vp; + int flags; +{ + int error; + struct mount *mp; + struct vattr vattr; + + /* + * Prevent non-root users from setting flags on devices. When + * a device is reused, users can retain ownership of the device + * if they are allowed to set flags and programs assume that + * chown can't fail when done as root. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Change flags of a file given a path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(td, uap) + struct thread *td; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +int +lchflags(td, uap) + struct thread *td; + register struct lchflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(td, uap) + struct thread *td; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +static int +setfmode(td, vp, mode) + struct thread *td; + struct vnode *vp; + int mode; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(td, uap) + struct thread *td; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(td, uap) + struct thread *td; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(td, uap) + struct thread *td; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +static int +setfown(td, vp, uid, gid) + struct thread *td; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(td, uap) + struct thread *td; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(td, uap) + struct thread *td; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(td, uap) + struct thread *td; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). 
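+ *
+ * A NULL tptr from the caller means "stamp both the access and the
+ * modification time with the current time"; getutimes() below
+ * synthesizes that case and setutimes() tags it with VA_UTIMES_NULL so
+ * the filesystem may apply its relaxed permission check (write access
+ * rather than ownership).  A minimal sketch of the userland side (the
+ * path is only an example; error handling is reduced to err(3)):
+ *
+ *	#include <sys/time.h>
+ *	#include <stddef.h>
+ *	#include <err.h>
+ *
+ *	if (utimes("/tmp/scratch", NULL) == -1)
+ *		err(1, "utimes");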
+ */ +static int +getutimes(usrtvp, tsp) + const struct timeval *usrtvp; + struct timespec *tsp; +{ + struct timeval tv[2]; + int error; + + if (usrtvp == NULL) { + microtime(&tv[0]); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + tsp[1] = tsp[0]; + } else { + if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) + return (error); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); + } + return 0; +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +setutimes(td, vp, ts, nullflag) + struct thread *td; + struct vnode *vp; + const struct timespec *ts; + int nullflag; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(td, uap) + struct thread *td; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(td, uap) + struct thread *td; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(td, uap) + struct thread *td; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct file *fp; + struct timeval *usrtvp; + int error; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); +} + +/* + * Truncate a file given its path name. 
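+ *
+ * A negative length is rejected with EINVAL before any lookup is done.
+ * A minimal sketch of the call (the path is only an example; error
+ * handling is reduced to err(3)):
+ *
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (truncate("/tmp/scratch", (off_t)0) == -1)
+ *		err(1, "truncate");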
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(td, uap) + struct thread *td; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + } + vput(vp); + vn_finished_write(mp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(td, uap) + struct thread *td; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); + return (EINVAL); + } + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(td, uap) + struct thread *td; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(td, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(td, uap) + struct thread *td; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(td, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
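+ *
+ * Besides calling VOP_FSYNC(), this flushes any dirty pages of the
+ * vnode's VM object and, when soft updates are compiled in and active
+ * on the mount, lets softdep_fsync() flush directory dependencies.  A
+ * minimal sketch of the caller's side (the file name is only an
+ * example; error handling is reduced to err(3)):
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	int fd = open("/tmp/journal", O_WRONLY | O_APPEND);
+ *
+ *	if (fd == -1 || write(fd, "x", 1) != 1 || fsync(fd) == -1)
+ *		err(1, "fsync");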
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(td, uap) + struct thread *td; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + vm_object_t obj; + int error; + + GIANT_REQUIRED; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, 0); + } + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); +#ifdef SOFTUPDATES + if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) + error = softdep_fsync(vp); +#endif + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(td, uap) + struct thread *td; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + bwillwrite(); + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), td); + if ((error = namei(&fromnd)) != 0) + return (error); + fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
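+	 * The check below flags that case with the private error value -1,
+	 * which the out1: path converts back to a successful return, so
+	 * e.g. rename("a", "a") succeeds without touching the filesystem.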
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(td, uap) + struct thread *td; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + + return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); +} + +int +vn_mkdir(path, mode, segflg, td) + char *path; + int mode; + enum uio_seg segflg; + struct thread *td; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. 
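+ *
+ * The path must name an empty directory that is neither "." nor the
+ * root of a mounted filesystem.  A minimal sketch pairing it with the
+ * mkdir() above (the path is only an example; error handling is reduced
+ * to err(3)); as vn_mkdir() shows, the requested mode is first masked
+ * with the process umask:
+ *
+ *	#include <sys/stat.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (mkdir("/tmp/scratchdir", 0755) == -1)
+ *		err(1, "mkdir");
+ *	if (rmdir("/tmp/scratchdir") == -1)
+ *		err(1, "rmdir");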
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(td, uap) + struct thread *td; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(td, uap) + struct thread *td; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + /* XXX arbitrary sanity limit on `count'. */ + if (SCARG(uap, count) > 64 * 1024) + return (EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. 
+ * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(td, uap) + struct thread *td; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + if (SCARG(uap, basep) != NULL) { + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + } + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(td, uap) + struct thread *td; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return 
getdirentries(td, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + * + * MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(td, uap) + struct thread *td; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + FILEDESC_LOCK(td->td_proc->p_fd); + fdp = td->td_proc->p_fd; + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(td, uap) + struct thread *td; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR) { + vput(vp); + return (EINVAL); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + goto out; + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. + */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + int error; + struct file *fp; + + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return (error); +} +/* + * Get (NFS) file handle + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +getfh(td, uap) + struct thread *td; + register struct getfh_args *uap; +{ + struct nameidata nd; + fhandle_t fh; + register struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout(&fh, uap->fhp, sizeof (fh)); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into + * an open descriptor. + * + * warning: do not remove the suser() call or this becomes one giant + * security hole. 
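+ *
+ * Both getfh() and fhopen() are therefore restricted to the super-user.
+ * A minimal sketch of the intended use, along the lines of what
+ * rpc.lockd does (the exported path is only an example; error handling
+ * is reduced to err(3); must run as root):
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/mount.h>
+ *	#include <fcntl.h>
+ *	#include <err.h>
+ *
+ *	fhandle_t fh;
+ *	int fd;
+ *
+ *	if (getfh("/export/somefile", &fh) == -1)
+ *		err(1, "getfh");
+ *	if ((fd = fhopen(&fh, O_RDWR)) == -1)
+ *		err(1, "fhopen");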
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +fhopen(td, uap) + struct thread *td; + struct fhopen_args /* { + syscallarg(const struct fhandle *) u_fhp; + syscallarg(int) flags; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct vattr vat; + struct vattr *vap = &vat; + struct flock lf; + struct file *fp; + register struct filedesc *fdp = p->p_fd; + int fmode, mode, error, type; + struct file *nfp; + int indx; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + fmode = FFLAGS(SCARG(uap, flags)); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); + if (error) + return(error); + /* find the mount point */ + mp = vfs_getvfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); + if (error) + return (error); + /* + * from now on we have to make sure not + * to forget about the vnode + * any error that causes an abort must vput(vp) + * just set error = err and 'goto bad;'. + */ + + /* + * from vn_open + */ + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, td->td_ucred, td); + if (error) + goto bad; + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, td); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, td->td_ucred, td); + vn_finished_write(mp); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, td->td_ucred, td); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { + if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) + goto bad; + } + if (fmode & FWRITE) + vp->v_writecount++; + + /* + * end of vn_open code + */ + + if ((error = falloc(td, &nfp, &indx)) != 0) { + if (fmode & FWRITE) + vp->v_writecount--; + goto bad; + } + fp = nfp; + + /* + * Hold an extra reference to avoid having fp ripped out + * from under us while we block in the lock op + */ + fhold(fp); + nfp->f_data = vp; + nfp->f_flag = fmode & FMASK; + nfp->f_ops = &vnops; + nfp->f_type = DTYPE_VNODE; + if (fmode & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (fmode & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, td); + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) { + /* + * The lock request failed. Normally close the + * descriptor but handle the case where someone might + * have dup()d or close()d it when we weren't looking. 
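+			 * Only the descriptor table's reference is dropped
+			 * if the slot still points at our file; the extra
+			 * reference taken with fhold() above is released
+			 * either way before the error is returned.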
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + /* + * release our private reference + */ + fdrop(fp, td); + return(error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) + vfs_object_create(vp, td, td->td_ucred); + + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); + +bad: + vput(vp); + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +fhstat(td, uap) + struct thread *td; + register struct fhstat_args /* { + syscallarg(struct fhandle *) u_fhp; + syscallarg(struct stat *) sb; + } */ *uap; +{ + struct stat sb; + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); + if (error) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +fhstatfs(td, uap) + struct thread *td; + struct fhstatfs_args /* { + syscallarg(struct fhandle) *u_fhp; + syscallarg(struct statfs) *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct statfs sb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Syscall to push extended attribute configuration information into the + * VFS. Accepts a path, which it converts to a mountpoint, as well as + * a command (int cmd), and attribute name and misc data. For now, the + * attribute name is left in userspace for consumption by the VFS_op. + * It will probably be changed to be copied into sysspace by the + * syscall in the future, once issues with various consumers of the + * attribute code have raised their hands. + * + * Currently this is used only by UFS Extended Attributes. + */ +int +extattrctl(td, uap) + struct thread *td; + struct extattrctl_args /* { + syscallarg(const char *) path; + syscallarg(int) cmd; + syscallarg(const char *) filename; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + /* + * uap->attrname is not always defined. 
We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + + /* + * uap->filename is not always defined. If it is, grab a vnode lock, + * which VFS_EXTATTRCTL() will later release. + */ + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + uap->filename, td); + if ((error = namei(&nd)) != 0) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, 0); + if (error) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + + if (uap->attrname != NULL) { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, attrname, td); + } else { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, NULL, td); + } + + vn_finished_write(mp_writable); + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, + * filename_vp, so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +done: + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_set_file(td, uap) + struct thread *td; + struct extattr_set_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int 
+extattr_set_fd(td, uap) + struct thread *td; + struct extattr_set_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + + error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the + * maximum read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; + +done: + VOP_UNLOCK(vp, 0, td); + return (error); +} + +int +extattr_get_file(td, uap) + struct thread *td; + struct extattr_get_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int +extattr_get_fd(td, uap) + struct thread *td; + struct extattr_get_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + + error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, 
uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, + td); + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_delete_file(td, uap) + struct thread *td; + struct extattr_delete_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); + + vrele(nd.ni_vp); + return(error); +} + +int +extattr_delete_fd(td, uap) + struct thread *td; + struct extattr_delete_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + + error = extattr_delete_vp((struct vnode *)fp->f_data, + uap->attrnamespace, attrname, td); + + fdrop(fp, td); + return (error); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 0000000..b221cd3 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * The highest defined VFS number. + */ +int maxvfsconf = VFS_GENERIC + 1; + +/* + * Single-linked list of configured VFSes. + * New entries are added/deleted by vfs_register()/vfs_unregister() + */ +struct vfsconf *vfsconf; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. + */ + +/* Table of known vnodeop vectors (list of VFS vnode vectors) */ +static const struct vnodeopv_desc **vnodeopv_descs; +static int vnodeopv_num; + +/* Table of known descs (list of vnode op handlers "vop_access_desc") */ +static struct vnodeop_desc **vfs_op_descs; +/* Reference counts for vfs_op_descs */ +static int *vfs_op_desc_refs; +/* Number of descriptions */ +static int num_op_descs; +/* Number of entries in each description */ +static int vfs_opv_numops = 64; + +/* Allow this number to be tuned at boot */ +TUNABLE_INT("vfs.opv_numops", &vfs_opv_numops); +SYSCTL_INT(_vfs, OID_AUTO, opv_numops, CTLFLAG_RD, &vfs_opv_numops, + 0, "Maximum number of operations in vop_t vector"); + +static int int_cmp(const void *a, const void *b); + +static int +int_cmp(const void *a, const void *b) +{ + return(*(const int *)a - *(const int *)b); +} + +/* + * Recalculate the operations vector/description (those parts of it that can + * be recalculated, that is.) + * Always allocate operations vector large enough to hold vfs_opv_numops + * entries. 
The vector is never freed or deallocated once it is initialized, + * so that vnodes might safely reference it through their v_op pointer without + * vector changing suddenly from under them. + */ +static void +vfs_opv_recalc(void) +{ + int i, j, k; + int *vfs_op_offsets; + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; + struct vnodeopv_entry_desc *opve_descp; + const struct vnodeopv_desc *opv; + + if (vfs_op_descs == NULL) + panic("vfs_opv_recalc called with null vfs_op_descs"); + + /* + * Allocate and initialize temporary array to store + * offsets. Sort it to put all uninitialized entries + * first and to make holes in existing offset sequence + * detectable. + */ + MALLOC(vfs_op_offsets, int *, + num_op_descs * sizeof(int), M_TEMP, M_WAITOK); + if (vfs_op_offsets == NULL) + panic("vfs_opv_recalc: no memory"); + for (i = 0; i < num_op_descs; i++) + vfs_op_offsets[i] = vfs_op_descs[i]->vdesc_offset; + qsort(vfs_op_offsets, num_op_descs, sizeof(int), int_cmp); + + /* + * Run through and make sure all known descs have an offset. + * Use vfs_op_offsets to locate holes in offset sequence and + * reuse them. + * vop_default_desc is hardwired at offset 1, and offset 0 + * is a panic sanity check. + */ + j = 1; k = 1; + for (i = 0; i < num_op_descs; i++) { + if (vfs_op_descs[i]->vdesc_offset != 0) + continue; + /* + * Look at two adjacent entries vfs_op_offsets[j - 1] and + * vfs_op_offsets[j] and see if we can fit a new offset + * number in between. If not, look at the next pair until + * hole is found or the end of the vfs_op_offsets vector is + * reached. j has been initialized to 1 above so that + * referencing (j-1)-th element is safe and the loop will + * never execute if num_op_descs is 1. For each new value s + * of i the j loop pick up from where previous iteration has + * left off. When the last hole has been consumed or if no + * hole has been found, we will start allocating new numbers + * starting from the biggest already available offset + 1. + */ + for (; j < num_op_descs; j++) { + if (vfs_op_offsets[j - 1] < k && vfs_op_offsets[j] > k) + break; + k = vfs_op_offsets[j] + 1; + } + vfs_op_descs[i]->vdesc_offset = k++; + } + FREE(vfs_op_offsets, M_TEMP); + + /* Panic if new vops will cause vector overflow */ + if (k > vfs_opv_numops) + panic("VFS: Ran out of vop_t vector entries. %d entries required, only %d available.\n", k, vfs_opv_numops); + + /* + * Allocate and fill in the vectors + */ + for (i = 0; i < vnodeopv_num; i++) { + opv = vnodeopv_descs[i]; + opv_desc_vector_p = opv->opv_desc_vector_p; + if (*opv_desc_vector_p == NULL) + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, + M_WAITOK | M_ZERO); + + /* Fill in, with slot 0 being to return EOPNOTSUPP */ + opv_desc_vector = *opv_desc_vector_p; + opv_desc_vector[0] = (vop_t *)vop_eopnotsupp; + for (j = 0; opv->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(opv->opv_desc_ops[j]); + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + + /* Replace unfilled routines with their default (slot 1). */ + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector[1] == NULL) + panic("vfs_opv_recalc: vector without a default."); + for (j = 0; j < vfs_opv_numops; j++) + if (opv_desc_vector[j] == NULL) + opv_desc_vector[j] = opv_desc_vector[1]; + } +} + +/* Add a set of vnode operations (a description) to the table above. 
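+ * A filesystem supplies one such table per operations vector; a minimal
+ * sketch of the shape consumed here (the "sample" names stand in for a
+ * real filesystem's vector pointer and handlers), normally registered
+ * through the VNODEOP_SET() macro:
+ *
+ *	static vop_t **sample_vnodeop_p;
+ *	static struct vnodeopv_entry_desc sample_vnodeop_entries[] = {
+ *		{ &vop_default_desc, (vop_t *) vop_defaultop },
+ *		{ &vop_lookup_desc,  (vop_t *) sample_lookup },
+ *		{ NULL, NULL }
+ *	};
+ *	static struct vnodeopv_desc sample_vnodeop_opv_desc =
+ *		{ &sample_vnodeop_p, sample_vnodeop_entries };
+ *	VNODEOP_SET(sample_vnodeop_opv_desc);
+ *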
*/ +void +vfs_add_vnodeops(const void *data) +{ + const struct vnodeopv_desc *opv; + const struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j; + + opv = (const struct vnodeopv_desc *)data; + MALLOC(newopv, const struct vnodeopv_desc **, + (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (vnodeopv_descs) { + bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + } + newopv[vnodeopv_num] = opv; + vnodeopv_descs = newopv; + vnodeopv_num++; + + /* See if we have turned up a new vnode op desc */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, increase reference count */ + vfs_op_desc_refs[j]++; + break; + } + } + if (j == num_op_descs) { + /* not found, new entry */ + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs + 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs + 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (vfs_op_descs) { + bcopy(vfs_op_descs, newop, + num_op_descs * sizeof(*newop)); + FREE(vfs_op_descs, M_VNODE); + } + if (vfs_op_desc_refs) { + bcopy(vfs_op_desc_refs, newref, + num_op_descs * sizeof(*newref)); + FREE(vfs_op_desc_refs, M_VNODE); + } + newop[num_op_descs] = desc; + newref[num_op_descs] = 1; + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs++; + } + } + vfs_opv_recalc(); +} + +/* Remove a vnode type from the vnode description table above. */ +void +vfs_rm_vnodeops(const void *data) +{ + const struct vnodeopv_desc *opv; + const struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j, k; + + opv = (const struct vnodeopv_desc *)data; + /* Lower ref counts on descs in the table and release if zero */ + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, decrease reference count */ + vfs_op_desc_refs[j]--; + break; + } + } + for (j = 0; j < num_op_descs; j++) { + if (vfs_op_desc_refs[j] > 0) + continue; + if (vfs_op_desc_refs[j] < 0) + panic("vfs_remove_vnodeops: negative refcnt"); + /* Entry is going away - replace it with defaultop */ + for (k = 0; k < vnodeopv_num; k++) { + opv_desc_vector = + *(vnodeopv_descs[k]->opv_desc_vector_p); + if (opv_desc_vector != NULL) + opv_desc_vector[desc->vdesc_offset] = + opv_desc_vector[1]; + } + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs - 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs - 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + for (k = j; k < (num_op_descs - 1); k++) { + vfs_op_descs[k] = vfs_op_descs[k + 1]; + vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1]; + } + bcopy(vfs_op_descs, newop, + (num_op_descs - 1) * sizeof(*newop)); + bcopy(vfs_op_desc_refs, newref, + (num_op_descs - 1) * sizeof(*newref)); + FREE(vfs_op_descs, M_VNODE); + FREE(vfs_op_desc_refs, M_VNODE); + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs--; + } + } + + for (i = 0; i < vnodeopv_num; i++) { + if (vnodeopv_descs[i] == opv) { + for (j = i; j < (vnodeopv_num - 1); j++) + vnodeopv_descs[j] = vnodeopv_descs[j + 1]; + break; + } + } + if (i == vnodeopv_num) + panic("vfs_remove_vnodeops: opv 
not found"); + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector != NULL) + FREE(opv_desc_vector, M_VNODE); + MALLOC(newopv, const struct vnodeopv_desc **, + (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + vnodeopv_descs = newopv; + vnodeopv_num--; + + vfs_opv_recalc(); +} + +/* + * Routines having to do with the management of the vnode table. + */ +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each filesystem type. + */ +/* ARGSUSED*/ +static void +vfsinit(void *dummy) +{ + + vattr_null(&va_null); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) + +/* Register a new filesystem type in the global table */ +int +vfs_register(struct vfsconf *vfc) +{ + struct sysctl_oid *oidp; + struct vfsconf *vfsp; + + vfsp = NULL; + if (vfsconf) + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) + if (strcmp(vfc->vfc_name, vfsp->vfc_name) == 0) + return EEXIST; + + vfc->vfc_typenum = maxvfsconf++; + if (vfsp) + vfsp->vfc_next = vfc; + else + vfsconf = vfc; + vfc->vfc_next = NULL; + + /* + * If this filesystem has a sysctl node under vfs + * (i.e. vfs.xxfs), then change the oid number of that node to + * match the filesystem's type number. This allows user code + * which uses the type number to read sysctl variables defined + * by the filesystem to continue working. Since the oids are + * in a sorted list, we need to make sure the order is + * preserved by re-registering the oid after modifying its + * number. + */ + SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link) + if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { + sysctl_unregister_oid(oidp); + oidp->oid_number = vfc->vfc_typenum; + sysctl_register_oid(oidp); + } + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + return 0; +} + + +/* Remove registration of a filesystem type */ +int +vfs_unregister(struct vfsconf *vfc) +{ + struct vfsconf *vfsp, *prev_vfsp; + int error, i, maxtypenum; + + i = vfc->vfc_typenum; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) + return EINVAL; + if (vfsp->vfc_refcount) + return EBUSY; + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error) + return (error); + } + if (prev_vfsp) + prev_vfsp->vfc_next = vfsp->vfc_next; + else + vfsconf = vfsp->vfc_next; + maxtypenum = VFS_GENERIC; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + return 0; +} + +/* + * Standard kernel module handling code for filesystem modules. + * Referenced from VFS_SET(). + */ +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: /* including MOD_SHUTDOWN */ + break; + } + return (error); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 0000000..8e4af42 --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,754 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/proc.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/uma.h> + +/* + * Allocation zone for namei + */ +uma_zone_t namei_zone; + +static void +nameiinit(void *dummy __unused) +{ + namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. 
+ * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(ndp) + register struct nameidata *ndp; +{ + register struct filedesc *fdp; /* pointer to file descriptor state */ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct uio auio; + int error, linklen; + struct componentname *cnp = &ndp->ni_cnd; + struct thread *td = cnp->cn_thread; + struct proc *p = td->td_proc; + + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; + KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + fdp = p->p_fd; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + + if (error) { + uma_zfree(namei_zone, cnp->cn_pnbuf); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_NAMEI)) { + KASSERT(cnp->cn_thread == curthread, + ("namei not using curthread")); + ktrnamei(cnp->cn_pnbuf); + } +#endif + + /* + * Get starting point for the translation. + */ + FILEDESC_LOCK(fdp); + ndp->ni_rootdir = fdp->fd_rdir; + ndp->ni_topdir = fdp->fd_jdir; + + dp = fdp->fd_cdir; + VREF(dp); + FILEDESC_UNLOCK(fdp); + for (;;) { + /* + * Check if root directory should replace current directory. + * Done at start of translation and after symbolic link. 
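+		 * For example, translating "//etc//passwd" consumes the leading
+		 * slashes here, releases the reference on the current directory
+		 * and restarts the walk at ni_rootdir; the same reset happens
+		 * after an absolute symbolic link has been expanded into the
+		 * buffer.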
+ */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + dp = ndp->ni_rootdir; + VREF(dp); + } + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error) { + uma_zfree(namei_zone, cnp->cn_pnbuf); + return (error); + } + /* + * Check for symbolic link + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) + uma_zfree(namei_zone, cnp->cn_pnbuf); + else + cnp->cn_flags |= HASBUF; + + if (vn_canvmio(ndp->ni_vp) == TRUE && + (cnp->cn_nameiop != DELETE) && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == + LOCKLEAF)) + vfs_object_create(ndp->ni_vp, td, + ndp->ni_cnd.cn_cred); + + return (0); + } + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + VOP_UNLOCK(ndp->ni_dvp, 0, td); + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } + if (ndp->ni_pathlen > 1) + cp = uma_zalloc(namei_zone, M_WAITOK); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = (struct thread *)0; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen == 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENOENT; + break; + } + if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + uma_zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + } + uma_zfree(namei_zone, cnp->cn_pnbuf); + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. 
and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(ndp) + register struct nameidata *ndp; +{ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int trailing_slash; + int error = 0; + int dpunlocked = 0; /* dp has already been unlocked */ + struct componentname *cnp = &ndp->ni_cnd; + struct thread *td = cnp->cn_thread; + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + ndp->ni_dvp = NULL; + cnp->cn_flags &= ~ISSYMLINK; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + +dirloop: + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + cnp->cn_consume = 0; + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + continue; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
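+	 * A degenerate name is what remains when the caller passed "/" by
+	 * itself (namei strips the leading slashes, leaving an empty
+	 * component), so the directory itself is the answer: dp is handed
+	 * back in ni_vp below and no VOP_LOOKUP() is issued.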
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0, td); + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + /* + * Handle "..": two special cases. + * 1. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 2. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other filesystem. + * 3. If the vnode is the top directory of + * the jail or chroot, don't let them out. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + if (dp == ndp->ni_rootdir || + dp == ndp->ni_topdir || + dp == rootvnode) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_flag & VROOT) == 0 || + (cnp->cn_flags & NOCROSSMOUNT)) + break; + if (dp->v_mount == NULL) { /* forced unmount */ + error = EBADF; + goto bad; + } + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + cnp->cn_flags &= ~PDIRUNLOCK; + ASSERT_VOP_LOCKED(dp, "lookup"); + if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_flag & VROOT) && (dp->v_mount != NULL) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + if (cnp->cn_flags & PDIRUNLOCK) + vrele(tdp); + else + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + goto unionlookup; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + return (0); + } +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + + ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } + + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted filesystem. 
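+	 * For example, with a filesystem mounted on /usr, looking up "usr"
+	 * in the root directory returns the covered vnode; the loop below
+	 * busies the mount (so it cannot be unmounted underneath us), asks
+	 * VFS_ROOT() for the root vnode of the mounted filesystem and
+	 * continues the walk there, repeating in case that root is itself
+	 * a mount point.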
+ */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0, 0, td)) + continue; + VOP_UNLOCK(dp, 0, td); + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) { + dpunlocked = 1; + goto bad2; + } + vrele(dp); + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_mount == NULL) { + /* We can't know whether the directory was mounted with + * NOSYMFOLLOW, so we can't follow safely. */ + error = EBADF; + goto bad2; + } + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + +nextname: + /* + * Not a symbolic link. If more pathname, + * continue at next component, else return. + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != ndp->ni_vp) + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) + vrele(ndp->ni_dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, td); + return (0); + +bad2: + if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT && + *ndp->ni_next == '\0') + VOP_UNLOCK(ndp->ni_dvp, 0, td); + vrele(ndp->ni_dvp); +bad: + if (dpunlocked) + vrele(dp); + else + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + struct thread *td = cnp->cn_thread; + struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + +/* dirloop: */ + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
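+	 * (As in lookup() above, a degenerate name here means the caller is
+	 * really asking for the directory itself; dvp is handed back in
+	 * *vpp without a VOP_LOOKUP() call.)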
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0, td); + *vpp = dp; + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + + if (vn_canvmio(dp) == TRUE && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) + vfs_object_create(dp, td, cnp->cn_cred); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, td); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, td); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..20d9b90 --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,396 @@ +/*- + * Copyright (c) 1999 Michael Smith + * All rights reserved. + * Copyright (c) 1999 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Locate and mount the root filesystem. + * + * The root filesystem is detailed in the kernel environment variable + * vfs.root.mountfrom, which is expected to be in the general format + * + * <vfsname>:[<path>] + * vfsname := the name of a VFS known to the kernel and capable + * of being mounted as root + * path := disk device name or other data used by the filesystem + * to locate its physical store + * + */ + +#include "opt_rootdevname.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/reboot.h> +#include <sys/diskslice.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/proc.h> + +#include "opt_ddb.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#include <paths.h> + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); + +#define ROOTNAME "root_device" + +/* + * The vnode of the system's root (/ in the filesystem, without chroot + * active.) + */ +struct vnode *rootvnode; + +/* + * The root specifiers we will try if RB_CDROM is specified. + */ +static char *cdrom_rootdevnames[] = { + "cd9660:cd0a", + "cd9660:acd0a", + "cd9660:wcd0a", + NULL +}; + +static int vfs_mountroot_try(char *mountfrom); +static int vfs_mountroot_ask(void); +static void gets(char *cp); + +/* legacy find-root code */ +char *rootdevnames[2] = {NULL, NULL}; +static int setrootbyname(char *name); +dev_t rootdev = NODEV; + +/* + * Find and mount the root filesystem + */ +void +vfs_mountroot(void *foo __unused) +{ + char *cp; + int i, error; + + /* + * The root filesystem information is compiled in, and we are + * booted with instructions to use it. + */ +#ifdef ROOTDEVNAME + if ((boothowto & RB_DFLTROOT) && + !vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + /* + * We are booted with instructions to prompt for the root filesystem, + * or to use the compiled-in default when it doesn't exist. + */ + if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) { + if (!vfs_mountroot_ask()) + return; + } + + /* + * We've been given the generic "use CDROM as root" flag. This is + * necessary because one media may be used in many different + * devices, so we need to search for them. + */ + if (boothowto & RB_CDROM) { + for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { + if (!vfs_mountroot_try(cdrom_rootdevnames[i])) + return; + } + } + + /* + * Try to use the value read by the loader from /etc/fstab, or + * supplied via some other means. This is the preferred + * mechanism. + */ + if ((cp = getenv("vfs.root.mountfrom")) != NULL) { + error = vfs_mountroot_try(cp); + freeenv(cp); + if (!error) + return; + } + + /* + * Try values that may have been computed by the machine-dependant + * legacy code. 
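+	 * (These are only fallbacks; the preferred path above is the kernel
+	 * environment, e.g. a value such as "ufs:da0s1a" in
+	 * vfs.root.mountfrom, normally derived by the loader from the root
+	 * entry in /etc/fstab.)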
+ */ + if (!vfs_mountroot_try(rootdevnames[0])) + return; + if (!vfs_mountroot_try(rootdevnames[1])) + return; + + /* + * If we have a compiled-in default, and haven't already tried it, try + * it now. + */ +#ifdef ROOTDEVNAME + if (!(boothowto & RB_DFLTROOT)) + if (!vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + + /* + * Everything so far has failed, prompt on the console if we haven't + * already tried that. + */ + if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask()) + return; + panic("Root mount failed, startup aborted."); +} + +/* + * Mount (mountfrom) as the root filesystem. + */ +static int +vfs_mountroot_try(char *mountfrom) +{ + struct mount *mp; + char *vfsname, *path; + int error; + char patt[32]; + int s; + + vfsname = NULL; + path = NULL; + mp = NULL; + error = EINVAL; + + if (mountfrom == NULL) + return(error); /* don't complain */ + + s = splcam(); /* Overkill, but annoying without it */ + printf("Mounting root from %s\n", mountfrom); + splx(s); + + /* parse vfs name and path */ + vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); + path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); + vfsname[0] = path[0] = 0; + sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); + if (sscanf(mountfrom, patt, vfsname, path) < 1) + goto done; + + /* allocate a root mount */ + error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME, + &mp); + if (error != 0) { + printf("Can't allocate root mount for filesystem '%s': %d\n", + vfsname, error); + goto done; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* do our best to set rootdev */ + if ((path[0] != 0) && setrootbyname(path)) + printf("setrootbyname failed\n"); + + /* If the root device is a type "memory disk", mount RW */ + if (rootdev != NODEV && devsw(rootdev) && + (devsw(rootdev)->d_flags & D_MEMDISK)) + mp->mnt_flag &= ~MNT_RDONLY; + + /* + * Set the mount path to be something useful, because the + * filesystem code isn't responsible now for initialising + * f_mntonname unless they want to override the default + * (which is `path'.) + */ + strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN); + + error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread); + +done: + if (vfsname != NULL) + free(vfsname, M_MOUNT); + if (path != NULL) + free(path, M_MOUNT); + if (error != 0) { + if (mp != NULL) { + vfs_unbusy(mp, curthread); + free(mp, M_MOUNT); + } + printf("Root mount failed: %d\n", error); + } else { + + /* register with list of mounted filesystems */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + + /* sanity check system clock against root filesystem timestamp */ + inittodr(mp->mnt_time); + vfs_unbusy(mp, curthread); + } + return(error); +} + +/* + * Spin prompting on the console for a suitable root filesystem + */ +static int +vfs_mountroot_ask(void) +{ + char name[128]; + int i; + dev_t dev; + + for(;;) { + printf("\nManual root filesystem specification:\n"); + printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n"); +#if defined(__i386__) || defined(__ia64__) + printf(" eg. ufs:da0s1a\n"); +#else + printf(" eg. ufs:da0a\n"); +#endif + printf(" ? 
List valid disk boot devices\n"); + printf(" <empty line> Abort manual input\n"); + printf("\nmountroot> "); + gets(name); + if (name[0] == 0) + return(1); + if (name[0] == '?') { + printf("Possibly valid devices for 'ufs' root:\n"); + for (i = 0; i < NUMCDEVSW; i++) { + dev = makedev(i, 0); + if (devsw(dev) != NULL) + printf(" \"%s\"", devsw(dev)->d_name); + } + printf("\n"); + continue; + } + if (!vfs_mountroot_try(name)) + return(0); + } +} + +/* + * Local helper function for vfs_mountroot_ask. + */ +static void +gets(char *cp) +{ + char *lp; + int c; + + lp = cp; + for (;;) { + printf("%c", c = cngetc() & 0177); + switch (c) { + case -1: + case '\n': + case '\r': + *lp++ = '\0'; + return; + case '\b': + case '\177': + if (lp > cp) { + printf(" \b"); + lp--; + } + continue; + case '#': + lp--; + if (lp < cp) + lp = cp; + continue; + case '@': + case 'u' & 037: + lp = cp; + printf("%c", '\n'); + continue; + default: + *lp++ = c; + } + } +} + +/* + * Convert a given name to the dev_t of the disk-like device + * it refers to. + */ +dev_t +getdiskbyname(char *name) { + char *cp; + dev_t dev; + + cp = name; + if (!bcmp(cp, "/dev/", 5)) + cp += 5; + + dev = NODEV; + EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev); + return (dev); +} + +/* + * Set rootdev to match (name), given that we expect it to + * refer to a disk-like device. + */ +static int +setrootbyname(char *name) +{ + dev_t diskdev; + + diskdev = getdiskbyname(name); + if (diskdev != NODEV) { + rootdev = diskdev; + return (0); + } + + return (1); +} + +/* Show the dev_t for a disk specified by name */ +#ifdef DDB +DB_SHOW_COMMAND(disk, db_getdiskbyname) +{ + dev_t dev; + + if (modif[0] == '\0') { + db_error("usage: show disk/devicename"); + return; + } + dev = getdiskbyname(modif); + if (dev != NODEV) + db_printf("dev_t = %p\n", dev); + else + db_printf("No disk device matched.\n"); +} +#endif diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c new file mode 100644 index 0000000..0575662 --- /dev/null +++ b/sys/kern/vfs_subr.c @@ -0,0 +1,3275 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $FreeBSD$ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/eventhandler.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void addalias(struct vnode *vp, dev_t nvp_rdev); +static void insmntque(struct vnode *vp, struct mount *mp); +static void vclean(struct vnode *vp, int flags, struct thread *td); +static void vlruvp(struct vnode *vp); + +/* + * Number of vnodes in existence. Increased whenever getnewvnode() + * allocates a new vnode, never decreased. + */ +static unsigned long numvnodes; + +SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +/* + * Conversion tables for conversion from vnode types to inode formats + * and back. + */ +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * List of vnodes that are ready for recycling. + */ +static TAILQ_HEAD(freelst, vnode) vnode_free_list; + +/* + * Minimum number of free vnodes. If there are fewer than this free vnodes, + * getnewvnode() will return a newly allocated vnode. + */ +static u_long wantfreevnodes = 25; +SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +/* Number of vnodes in the free list. */ +static u_long freevnodes; +SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +/* + * Various variables used for debugging the new implementation of + * reassignbuf(). + * XXX these are probably of (very) limited utility now. 
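+ * They are exported below under the vfs sysctl tree, so e.g.
+ * "sysctl vfs.reassignbufcalls" reads the call counter from userland.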
+ */ +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); +static int nameileafonly; +SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); + +#ifdef ENABLE_VFS_IOOPT +/* See NOTES for a description of this setting. */ +int vfs_ioopt; +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +/* List of mounted filesystems. */ +struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); + +/* For any iteration/modification of mountlist */ +struct mtx mountlist_mtx; + +/* For any iteration/modification of mnt_vnodelist */ +struct mtx mntvnode_mtx; + +/* + * Cache for the mount type id assigned to NFS. This is used for + * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. + */ +int nfs_mount_type = -1; + +/* To keep more than one thread at a time from running vfs_getnewfsid */ +static struct mtx mntid_mtx; + +/* For any iteration/modification of vnode_free_list */ +static struct mtx vnode_free_list_mtx; + +/* + * For any iteration/modification of dev->si_hlist (linked through + * v_specnext) + */ +static struct mtx spechash_mtx; + +/* Publicly exported FS */ +struct nfs_public nfs_pub; + +/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ +static uma_zone_t vnode_zone; +static uma_zone_t vnodepoll_zone; + +/* Set to 1 to print out reclaim of active vnodes */ +int prtactive; + +/* + * The workitem queue. + * + * It is useful to delay writes of file data and filesystem metadata + * for tens of seconds so that quickly created and deleted files need + * not waste disk bandwidth being created and removed. To realize this, + * we append vnodes to a "workitem" queue. When running with a soft + * updates implementation, most pending metadata dependencies should + * not wait for more than a few seconds. Thus, mounted on block devices + * are delayed only about a half the time that file data is delayed. + * Similarly, directory updates are more critical, so are only delayed + * about a third the time that file data is delayed. Thus, there are + * SYNCER_MAXDELAY queues that are processed round-robin at a rate of + * one each second (driven off the filesystem syncer process). The + * syncer_delayno variable indicates the next queue that is to be processed. 
+ * Items that need to be processed soon are placed in this queue: + * + * syncer_workitem_pending[syncer_delayno] + * + * A delay of fifteen seconds is done by placing the request fifteen + * entries later in the queue: + * + * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] + * + */ +static int syncer_delayno; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +static int syncdelay = 30; /* max time to delay syncing data */ +static int filedelay = 30; /* time to delay syncing files */ +SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); +static int dirdelay = 29; /* time to delay syncing directories */ +SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); +static int metadelay = 28; /* time to delay syncing metadata */ +SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); +static int rushjob; /* number of slots to run ASAP */ +static int stat_rush_requests; /* number of times I/O speeded up */ +SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); + +/* + * Number of vnodes we want to exist at any one time. This is mostly used + * to size hash tables in vnode-related code. It is normally not used in + * getnewvnode(), as wantfreevnodes is normally nonzero.) + * + * XXX desiredvnodes is historical cruft and should not exist. + */ +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, + &desiredvnodes, 0, "Maximum number of vnodes"); +static int minvnodes; +SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, + &minvnodes, 0, "Minimum number of vnodes"); +static int vnlru_nowhere; +SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, + "Number of times the vnlru process ran without success"); + +#ifdef DEBUG_VFS_LOCKS +/* Print lock violations */ +int vfs_badlock_print = 1; +/* Panic on violation */ +int vfs_badlock_panic = 1; +#endif + +void +v_addpollinfo(struct vnode *vp) +{ + vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK); + mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); +} + +/* + * Initialize the vnode management data structures. + */ +static void +vntblinit(void *dummy __unused) +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + minvnodes = desiredvnodes / 4; + mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF); + mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF); + mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); + mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF); + TAILQ_INIT(&vnode_free_list); + mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); + vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) + + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. 
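+ *
+ * The usual traversal of the mount list (a sketch only, mirroring the
+ * pattern vnlru_proc() uses below) is:
+ *
+ *	mtx_lock(&mountlist_mtx);
+ *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ *			nmp = TAILQ_NEXT(mp, mnt_list);
+ *			continue;	(the interlock is still held)
+ *		}
+ *		... work on mp; vfs_busy() dropped the interlock ...
+ *		mtx_lock(&mountlist_mtx);
+ *		nmp = TAILQ_NEXT(mp, mnt_list);
+ *		vfs_unbusy(mp, td);
+ *	}
+ *	mtx_unlock(&mountlist_mtx);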
+ */ +int +vfs_busy(mp, flags, interlkp, td) + struct mount *mp; + int flags; + struct mtx *interlkp; + struct thread *td; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + msleep(mp, interlkp, PVFS, "vfs_busy", 0); + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, td) + struct mount *mp; + struct thread *td; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct thread *td = curthread; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_iosize_max = DFLTPHYS; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. + */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot)(void); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + mtx_unlock(&mountlist_mtx); + return (mp); + } + } + mtx_unlock(&mountlist_mtx); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid. Try to make its val[0] unique, since this value + * will be used to create fake device numbers for stat(). 
Also try (but + * not so hard) make its val[0] unique mod 2^16, since some emulators only + * support 16-bit device numbers. We end up with unique val[0]'s for the + * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. + * + * Keep in mind that several mounts may be running in parallel. Starting + * the search one past where the previous search terminated is both a + * micro-optimization and a defense against returning the same fsid to + * different mounts. + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_int16_t mntid_base; + fsid_t tfsid; + int mtype; + + mtx_lock(&mntid_mtx); + mtype = mp->mnt_vfc->vfc_typenum; + tfsid.val[1] = mtype; + mtype = (mtype & 0xFF) << 24; + for (;;) { + tfsid.val[0] = makeudev(255, + mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); + mntid_base++; + if (vfs_getvfs(&tfsid) == NULL) + break; + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; + mtx_unlock(&mntid_mtx); +} + +/* + * Knob to control the precision of file timestamps: + * + * 0 = seconds only; nanoseconds zeroed. + * 1 = seconds and nanoseconds, accurate within 1/HZ. + * 2 = seconds and nanoseconds, truncated to microseconds. + * >=3 = seconds and nanoseconds, maximum precision. + */ +enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; + +static int timestamp_precision = TSP_SEC; +SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, + ×tamp_precision, 0, ""); + +/* + * Get a current timestamp. + */ +void +vfs_timestamp(tsp) + struct timespec *tsp; +{ + struct timeval tv; + + switch (timestamp_precision) { + case TSP_SEC: + tsp->tv_sec = time_second; + tsp->tv_nsec = 0; + break; + case TSP_HZ: + getnanotime(tsp); + break; + case TSP_USEC: + microtime(&tv); + TIMEVAL_TO_TIMESPEC(&tv, tsp); + break; + case TSP_NSEC: + default: + nanotime(tsp); + break; + } +} + +/* + * Build a linked list of mount options from a struct uio. + */ +int +vfs_buildopts(struct uio *auio, struct vfsoptlist **options) +{ + struct vfsoptlist *opts; + struct vfsopt *opt; + unsigned int i, iovcnt; + int error, namelen, optlen; + + iovcnt = auio->uio_iovcnt; + opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); + TAILQ_INIT(opts); + for (i = 0; i < iovcnt; i += 2) { + opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + namelen = auio->uio_iov[i].iov_len; + optlen = auio->uio_iov[i + 1].iov_len; + opt->name = malloc(namelen, M_MOUNT, M_WAITOK); + opt->value = malloc(optlen, M_MOUNT, M_WAITOK); + opt->len = optlen; + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); + bcopy(auio->uio_iov[i + 1].iov_base, opt->value, + optlen); + } else { + error = copyin(auio->uio_iov[i].iov_base, opt->name, + namelen); + if (!error) + error = copyin(auio->uio_iov[i + 1].iov_base, + opt->value, optlen); + if (error) + goto bad; + } + TAILQ_INSERT_TAIL(opts, opt, link); + } + *options = opts; + return (0); +bad: + vfs_freeopts(opts); + return (error); +} + +/* + * Get a mount option by its name. + * + * Return 0 if the option was found, ENOENT otherwise. + * If len is non-NULL it will be filled with the length + * of the option. If buf is non-NULL, it will be filled + * with the address of the option. 
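+ *
+ * A sketch of a consumer (the option names "from" and "ro" are only
+ * examples) working on a list built by vfs_buildopts():
+ *
+ *	void *value;
+ *	int len, ro;
+ *
+ *	if (vfs_getopt(opts, "from", &value, &len) == 0)
+ *		... value points at the option data, len is its size ...
+ *	if (vfs_copyopt(opts, "ro", &ro, sizeof(ro)) == 0)
+ *		... the option data was copied into ro (vfs_copyopt() below) ...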
+ */ +int +vfs_getopt(opts, name, buf, len) + struct vfsoptlist *opts; + const char *name; + void **buf; + int *len; +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + if (len != NULL) + *len = opt->len; + if (buf != NULL) + *buf = opt->value; + return (0); + } + } + return (ENOENT); +} + +/* + * Find and copy a mount option. + * + * The size of the buffer has to be specified + * in len, if it is not the same length as the + * mount option, EINVAL is returned. + * Returns ENOENT if the option is not found. + */ +int +vfs_copyopt(opts, name, dest, len) + struct vfsoptlist *opts; + const char *name; + void *dest; + int len; +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + if (len != opt->len) + return (EINVAL); + bcopy(opt->value, dest, opt->len); + return (0); + } + } + return (ENOENT); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * This routine is called when we have too many vnodes. It attempts + * to free <count> vnodes and will potentially free vnodes that still + * have VM backing store (VM backing store is typically the cause + * of a vnode blowout so we want to do this). Therefore, this operation + * is not considered cheap. + * + * A number of conditions may prevent a vnode from being reclaimed. + * the buffer cache may have references on the vnode, a directory + * vnode may still have references due to the namei cache representing + * underlying files, or the vnode may be in active use. It is not + * desireable to reuse such vnodes. These conditions may cause the + * number of vnodes to reach some minimum value regardless of what + * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. + */ +static int +vlrureclaim(struct mount *mp, int count) +{ + struct vnode *vp; + int done; + int trigger; + int usevnodes; + + /* + * Calculate the trigger point, don't allow user + * screwups to blow us up. This prevents us from + * recycling vnodes with lots of resident pages. We + * aren't trying to free memory, we are trying to + * free vnodes. 
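+	 * With illustrative numbers: 262144 physical pages and
+	 * desiredvnodes == 32768 give a trigger of 16, so this pass only
+	 * recycles vnodes caching fewer than 16 resident pages.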
+ */ + usevnodes = desiredvnodes; + if (usevnodes <= 0) + usevnodes = 1; + trigger = cnt.v_page_count * 2 / usevnodes; + + done = 0; + mtx_lock(&mntvnode_mtx); + while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + + if (vp->v_type != VNON && + vp->v_type != VBAD && + VMIGHTFREE(vp) && /* critical path opt */ + (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) && + mtx_trylock(&vp->v_interlock) + ) { + mtx_unlock(&mntvnode_mtx); + if (VMIGHTFREE(vp)) { + vgonel(vp, curthread); + done++; + } else { + mtx_unlock(&vp->v_interlock); + } + mtx_lock(&mntvnode_mtx); + } + --count; + } + mtx_unlock(&mntvnode_mtx); + return done; +} + +/* + * Attempt to recycle vnodes in a context that is always safe to block. + * Calling vlrurecycle() from the bowels of filesystem code has some + * interesting deadlock problems. + */ +static struct proc *vnlruproc; +static int vnlruproc_sig; + +static void +vnlru_proc(void) +{ + struct mount *mp, *nmp; + int s; + int done; + struct proc *p = vnlruproc; + struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ + + mtx_lock(&Giant); + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, + SHUTDOWN_PRI_FIRST); + + s = splbio(); + for (;;) { + kthread_suspend_check(p); + if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { + vnlruproc_sig = 0; + tsleep(vnlruproc, PVFS, "vlruwt", 0); + continue; + } + done = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + done += vlrureclaim(mp, 10); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (done == 0) { +#if 0 + /* These messages are temporary debugging aids */ + if (vnlru_nowhere < 5) + printf("vnlru process getting nowhere..\n"); + else if (vnlru_nowhere == 5) + printf("vnlru process messages stopped.\n"); +#endif + vnlru_nowhere++; + tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); + } + } + splx(s); +} + +static struct kproc_desc vnlru_kp = { + "vnlru", + vnlru_proc, + &vnlruproc +}; +SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) + + +/* + * Routines having to do with the management of the vnode table. + */ + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct thread *td = curthread; /* XXX */ + struct vnode *vp = NULL; + struct mount *vnmp; + vm_object_t object; + + s = splbio(); + /* + * Try to reuse vnodes if we hit the max. This situation only + * occurs in certain large-memory (2G+) situations. We cannot + * attempt to directly reclaim vnodes due to nasty recursion + * problems. + */ + if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) { + vnlruproc_sig = 1; /* avoid unnecessary wakeups */ + wakeup(vnlruproc); + } + + /* + * Attempt to reuse a vnode already on the free list, allocating + * a new vnode if we can't find one or if we have not reached a + * good minimum for good LRU performance. 
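+	 * The free list is only consulted once freevnodes has reached
+	 * wantfreevnodes and numvnodes has reached minvnodes; until then a
+	 * fresh vnode is simply allocated from vnode_zone below.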
+ */ + + mtx_lock(&vnode_free_list_mtx); + + if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { + int count; + + for (count = 0; count < freevnodes; count++) { + vp = TAILQ_FIRST(&vnode_free_list); + if (vp == NULL || vp->v_usecount) + panic("getnewvnode: free vnode isn't"); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + + /* Don't recycle if we can't get the interlock */ + if (!mtx_trylock(&vp->v_interlock)) { + vp = NULL; + continue; + } + + /* We should be able to immediately acquire this */ + if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) + continue; + /* + * Don't recycle if we still have cached pages. + */ + if (VOP_GETVOBJECT(vp, &object) == 0 && + (object->resident_page_count || + object->ref_count)) { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_freelist); + VOP_UNLOCK(vp, 0, td); + vp = NULL; + continue; + } + if (LIST_FIRST(&vp->v_cache_src)) { + /* + * note: nameileafonly sysctl is temporary, + * for debugging only, and will eventually be + * removed. + */ + if (nameileafonly > 0) { + /* + * Do not reuse namei-cached directory + * vnodes that have cached + * subdirectories. + */ + if (cache_leaf_test(vp) < 0) { + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + continue; + } + } else if (nameileafonly < 0 || + vmiodirenable == 0) { + /* + * Do not reuse namei-cached directory + * vnodes if nameileafonly is -1 or + * if VMIO backing for directories is + * turned off (otherwise we reuse them + * too quickly). + */ + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + continue; + } + } + /* + * Skip over it if its filesystem is being suspended. + */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) + break; + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + } + } + if (vp) { + vp->v_flag |= VDOOMED; + vp->v_flag &= ~VFREE; + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + cache_purge(vp); + if (vp->v_type != VBAD) { + VOP_UNLOCK(vp, 0, td); + vgone(vp); + } else { + VOP_UNLOCK(vp, 0, td); + } + vn_finished_write(vnmp); + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + if (vp->v_writecount != 0) + panic("Non-zero write count"); + } +#endif + if (vp->v_pollinfo) { + mtx_destroy(&vp->v_pollinfo->vpi_lock); + uma_zfree(vnodepoll_zone, vp->v_pollinfo); + } + vp->v_pollinfo = NULL; + vp->v_flag = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + } else { + mtx_unlock(&vnode_free_list_mtx); + vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); + bzero((char *) vp, sizeof *vp); + mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE); + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + + splx(s); + +#if 0 + vnodeallocs++; + if (vnodeallocs % vnoderecycleperiod == 0 && + freevnodes < vnoderecycleminfreevn && + vnoderecyclemintotalvn < numvnodes) { + /* Recycle vnodes. */ + cache_purgeleafdirs(vnoderecyclenumber); + } +#endif + + return (0); +} + +/* + * Move a vnode from one mount queue to another. 
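+ * Passing a NULL mp simply removes the vnode from its old mount point's
+ * list (if any) without placing it on a new one.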
+ */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + mtx_lock(&mntvnode_mtx); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. + */ + if ((vp->v_mount = mp) == NULL) { + mtx_unlock(&mntvnode_mtx); + return; + } + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup(&vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, td, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct thread *td; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + GIANT_REQUIRED; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep(&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + error = BUF_TIMELOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL, + "vinvalbuf", slpflag, slptimeo); + if (error == ENOLCK) + break; + splx(s); + return (error); + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while BUF_WRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + BUF_UNLOCK(bp); + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= B_ASYNC; + BUF_WRITE(bp); + } + } else { + bremfree(bp); + (void) BUF_WRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + /* + * Wait for I/O to complete. XXX needs cleaning up. The vnode can + * have write I/O in-progress but if there is a VM object then the + * VM object can also have read-I/O in-progress. + */ + do { + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + if (VOP_GETVOBJECT(vp, &object) == 0) { + while (object->paging_in_progress) + vm_object_pip_sleep(object, "vnvlbx"); + } + } while (vp->v_numoutput > 0); + + splx(s); + + /* + * Destroy the copy in the VM cache, too. 
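+	 * The pages are removed with the vnode interlock held; when V_SAVE
+	 * was requested only clean pages should be discarded, as dirty
+	 * pages were flushed by the VOP_FSYNC() above.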
+ */ + mtx_lock(&vp->v_interlock); + if (VOP_GETVOBJECT(vp, &object) == 0) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + mtx_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. + */ +int +vtruncbuf(vp, cred, td, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct thread *td; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && + (((nbp->b_xflags & BX_VNCLEAN) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && + (((nbp->b_xflags & BX_VNDIRTY) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + BUF_WRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + bp->b_dev = vn_todev(vp); + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= BX_VNCLEAN; + bp->b_xflags &= ~BX_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. 
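+	 * If this was the vnode's last dirty buffer, the vnode is also
+	 * taken off the syncer worklist, and the hold reference acquired
+	 * in bgetvp() is released with vdrop().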
+ */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { + if (bp->b_xflags & BX_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * Add an item to the syncer work queue. + */ +static void +vn_syncer_add_to_worklist(struct vnode *vp, int delay) +{ + int s, slot; + + s = splbio(); + + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + } + + if (delay > syncer_maxdelay - 2) + delay = syncer_maxdelay - 2; + slot = (syncer_delayno + delay) & syncer_mask; + + LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); + vp->v_flag |= VONWORKLST; + splx(s); +} + +struct proc *updateproc; +static void sched_sync(void); +static struct kproc_desc up_kp = { + "syncer", + sched_sync, + &updateproc +}; +SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) + +/* + * System filesystem synchronizer daemon. + */ +void +sched_sync(void) +{ + struct synclist *slp; + struct vnode *vp; + struct mount *mp; + long starttime; + int s; + struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */ + + mtx_lock(&Giant); + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc, + SHUTDOWN_PRI_LAST); + + for (;;) { + kthread_suspend_check(td->td_proc); + + starttime = time_second; + + /* + * Push files whose dirty time has expired. Be careful + * of interrupt race on slp queue. + */ + s = splbio(); + slp = &syncer_workitem_pending[syncer_delayno]; + syncer_delayno += 1; + if (syncer_delayno == syncer_maxdelay) + syncer_delayno = 0; + splx(s); + + while ((vp = LIST_FIRST(slp)) != NULL) { + if (VOP_ISLOCKED(vp, NULL) == 0 && + vn_start_write(vp, &mp, V_NOWAIT) == 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + } + s = splbio(); + if (LIST_FIRST(slp) == vp) { + /* + * Note: v_tag VT_VFS vps can remain on the + * worklist too with no dirty blocks, but + * since sync_fsync() moves it to a different + * slot we are safe. + */ + if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && + !vn_isdisk(vp, NULL)) + panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); + /* + * Put us back on the worklist. The worklist + * routine will remove us from our current + * position and then add us back in at a later + * position. + */ + vn_syncer_add_to_worklist(vp, syncdelay); + } + splx(s); + } + + /* + * Do soft update processing. + */ +#ifdef SOFTUPDATES + softdep_process_worklist(NULL); +#endif + + /* + * The variable rushjob allows the kernel to speed up the + * processing of the filesystem syncer process. A rushjob + * value of N tells the filesystem syncer to process the next + * N seconds worth of work on its queue ASAP. Currently rushjob + * is used by the soft update code to speed up the filesystem + * syncer process when the incore state is getting so far + * ahead of the disk that the kernel memory pool is being + * threatened with exhaustion. + */ + if (rushjob > 0) { + rushjob -= 1; + continue; + } + /* + * If it has taken us less than a second to process the + * current work, then wait. Otherwise start right over + * again. 
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Request the syncer daemon to speed up its work. + * We never push it to speed up more than half of its + * normal turn time, otherwise it could take over the cpu. + * XXXKSE only one update? + */ +int +speedup_syncer() +{ + + mtx_lock_spin(&sched_lock); + if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */ + setrunnable(FIRST_THREAD_IN_PROC(updateproc)); + mtx_unlock_spin(&sched_lock); + if (rushjob < syncdelay / 2) { + rushjob += 1; + stat_rush_requests += 1; + return (1); + } + return(0); +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + bp->b_dev = vn_todev(vp); +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + + /* XXX REMOVE ME */ + if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + ++reassignbufcalls; + + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { + if (bp->b_xflags & BX_VNDIRTY) + listheadp = &bp->b_vp->v_dirtyblkhd; + else + listheadp = &bp->b_vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); + if (bp->b_vp != newvp) { + vdrop(bp->b_vp); + bp->b_vp = NULL; /* for clarification */ + } + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
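+	 * The dirty list is kept roughly sorted by logical block number,
+	 * with meta-data buffers (negative b_lblkno) gathered at the tail;
+	 * reassignbufmethod selects between the cheap sequential-insert
+	 * heuristic and a full scan of the queue.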
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = dirdelay; + break; + case VCHR: + if (newvp->v_rdev->si_mountpoint != NULL) { + delay = metadelay; + break; + } + /* fall through */ + default: + delay = filedelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= BX_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise append to end (but before metadata) + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & BX_VNDIRTY)) { + /* + * Found the best place to insert the buffer + */ + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; + } else { + /* + * Missed, append to end, but before meta-data. + * We know that the head buffer in the list is + * not meta-data due to prior conditionals. + * + * Indirect effects: NFS second stage write + * tends to wind up here, giving maximum + * distance between the unstable write and the + * commit rpc. + */ + tbp = TAILQ_LAST(listheadp, buflists); + while (tbp && tbp->b_lblkno < 0) + tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } + } else { + bp->b_xflags |= BX_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + if (bp->b_vp != newvp) { + bp->b_vp = newvp; + vhold(bp->b_vp); + } + splx(s); +} + +/* + * Create a vnode for a device. + * Used for mounting the root filesystem. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + if (dev == NODEV) { + *vpp = NULLVP; + return (ENXIO); + } + if (vfinddev(dev, VCHR, vpp)) + return (0); + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VCHR; + addalias(vp, dev); + *vpp = vp; + return (0); +} + +/* + * Add vnode to the alias list hung off the dev_t. + * + * The reason for this gunk is that multiple vnodes can reference + * the same physical device, so checking vp->v_usecount to see + * how many users there are is inadequate; the v_usecount for + * the vnodes need to be accumulated. vcount() does that. + */ +struct vnode * +addaliasu(nvp, nvp_rdev) + struct vnode *nvp; + udev_t nvp_rdev; +{ + struct vnode *ovp; + vop_t **ops; + dev_t dev; + + if (nvp->v_type == VBLK) + return (nvp); + if (nvp->v_type != VCHR) + panic("addaliasu on non-special vnode"); + dev = udev2dev(nvp_rdev, 0); + /* + * Check to see if we have a bdevvp vnode with no associated + * filesystem. 
If so, we want to associate the filesystem of + * the new newly instigated vnode with the bdevvp vnode and + * discard the newly created vnode rather than leaving the + * bdevvp vnode lying around with no associated filesystem. + */ + if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { + addalias(nvp, dev); + return (nvp); + } + /* + * Discard unneeded vnode, but save its node specific data. + * Note that if there is a lock, it is carried over in the + * node specific data to the replacement vnode. + */ + vref(ovp); + ovp->v_data = nvp->v_data; + ovp->v_tag = nvp->v_tag; + nvp->v_data = NULL; + lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, + nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); + if (nvp->v_vnlock) + ovp->v_vnlock = &ovp->v_lock; + ops = ovp->v_op; + ovp->v_op = nvp->v_op; + if (VOP_ISLOCKED(nvp, curthread)) { + VOP_UNLOCK(nvp, 0, curthread); + vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread); + } + nvp->v_op = ops; + insmntque(ovp, nvp->v_mount); + vrele(nvp); + vgone(nvp); + return (ovp); +} + +/* This is a local helper function that do the same as addaliasu, but for a + * dev_t instead of an udev_t. */ +static void +addalias(nvp, dev) + struct vnode *nvp; + dev_t dev; +{ + + KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); + nvp->v_rdev = dev; + mtx_lock(&spechash_mtx); + SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); + mtx_unlock(&spechash_mtx); +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. The vnode lock bit is set if the + * vnode is being eliminated in vgone. The process is awakened + * when the transition is completed, and an error returned to + * indicate that the vnode is no longer usable (possibly having + * been changed to a new filesystem type). + */ +int +vget(vp, flags, td) + register struct vnode *vp; + int flags; + struct thread *td; +{ + int error; + + /* + * If the vnode is in the process of being cleaned out for + * another use, we wait for the cleaning to finish and then + * return failure. Cleaning is determined by checking that + * the VXLOCK flag is set. + */ + if ((flags & LK_INTERLOCK) == 0) + mtx_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + if (vp->v_vxproc == curthread) { +#if 0 + /* this can now occur in normal operation */ + log(LOG_INFO, "VXLOCK interlock avoided\n"); +#endif + } else { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, "vget", 0); + return (ENOENT); + } + } + + vp->v_usecount++; + + if (VSHOULDBUSY(vp)) + vbusy(vp); + if (flags & LK_TYPE_MASK) { + if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { + /* + * must expand vrele here because we do not want + * to call VOP_INACTIVE if the reference count + * drops back to zero since it was never really + * active. We must remove it from the free list + * before sleeping so that multiple processes do + * not try to recycle it. + */ + mtx_lock(&vp->v_interlock); + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + mtx_unlock(&vp->v_interlock); + } + return (error); + } + mtx_unlock(&vp->v_interlock); + return (0); +} + +/* + * Increase the reference count of a vnode. + */ +void +vref(struct vnode *vp) +{ + mtx_lock(&vp->v_interlock); + vp->v_usecount++; + mtx_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. 
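+ *
+ * vrele() must lock the vnode itself before it can call VOP_INACTIVE();
+ * vput() further below is the variant for callers that already hold the
+ * vnode lock.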
+ */ +void +vrele(vp) + struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + KASSERT(vp != NULL, ("vrele: null vp")); + + mtx_lock(&vp->v_interlock); + + /* Skip this v_writecount check if we're going to panic below. */ + KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, + ("vrele: missed vn_close")); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + mtx_unlock(&vp->v_interlock); + + return; + } + + if (vp->v_usecount == 1) { + vp->v_usecount--; + /* + * We must call VOP_INACTIVE with the node locked. + * If we are doing a vput, the node is already locked, + * but, in the case of vrele, we must explicitly lock + * the vnode before calling VOP_INACTIVE. + */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) + VOP_INACTIVE(vp, td); + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + mtx_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +/* + * Release an already locked vnode. This give the same effects as + * unlock+vrele(), but takes less time and avoids releasing and + * re-aquiring the lock (as vrele() aquires the lock internally.) + */ +void +vput(vp) + struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + GIANT_REQUIRED; + + KASSERT(vp != NULL, ("vput: null vp")); + mtx_lock(&vp->v_interlock); + /* Skip this v_writecount check if we're going to panic below. */ + KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, + ("vput: missed vn_close")); + + if (vp->v_usecount > 1) { + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, td); + return; + } + + if (vp->v_usecount == 1) { + vp->v_usecount--; + /* + * We must call VOP_INACTIVE with the node locked. + * If we are doing a vput, the node is already locked, + * so we just need to release the vnode mutex. + */ + mtx_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, td); + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * Note that there is one less who cares about this vnode. vdrop() is the + * opposite of vhold(). + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If FORCECLOSE is not specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If FORCECLOSE is specified, detach any active vnodes + * that are found. + * + * If WRITECLOSE is set, only flush out regular file vnodes open for + * writing. + * + * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. + * + * `rootrefs' specifies the base reference count for the root vnode + * of this filesystem. The root vnode is considered busy if its + * v_usecount exceeds this value. On a successful return, vflush() + * will call vrele() on the root vnode exactly rootrefs times. + * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must + * be zero. 
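+ *
+ * The scan restarts from the head of mnt_nvnodelist whenever a vnode
+ * turns out to have been recycled onto another mount point while the
+ * list was unlocked.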
+ */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, rootrefs, flags) + struct mount *mp; + int rootrefs; + int flags; +{ + struct thread *td = curthread; /* XXX */ + struct vnode *vp, *nvp, *rootvp = NULL; + struct vattr vattr; + int busy = 0, error; + + if (rootrefs > 0) { + KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, + ("vflush: bad args")); + /* + * Get the filesystem root vnode. We can vput() it + * immediately, since with rootrefs > 0, it won't go away. + */ + if ((error = VFS_ROOT(mp, &rootvp)) != 0) + return (error); + vput(rootvp); + } + mtx_lock(&mntvnode_mtx); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + + mtx_unlock(&mntvnode_mtx); + mtx_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + continue; + } + /* + * If WRITECLOSE is set, flush out unlinked but still open + * files (even if open only for reading) and regular file + * vnodes open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_type == VNON || + (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && + vattr.va_nlink > 0)) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + vgonel(vp, td); + mtx_lock(&mntvnode_mtx); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. + */ + if (flags & FORCECLOSE) { + if (vp->v_type != VCHR) { + vgonel(vp, td); + } else { + vclean(vp, 0, td); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + mtx_lock(&mntvnode_mtx); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + busy++; + } + mtx_unlock(&mntvnode_mtx); + if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { + /* + * If just the root vnode is busy, and if its refcount + * is equal to `rootrefs', then go ahead and kill it. + */ + mtx_lock(&rootvp->v_interlock); + KASSERT(busy > 0, ("vflush: not busy")); + KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); + if (busy == 1 && rootvp->v_usecount == rootrefs) { + vgonel(rootvp, td); + busy = 0; + } else + mtx_unlock(&rootvp->v_interlock); + } + if (busy) + return (EBUSY); + for (; rootrefs > 0; rootrefs--) + vrele(rootvp); + return (0); +} + +/* + * This moves a now (likely recyclable) vnode to the end of the + * mountlist. XXX However, it is temporarily disabled until we + * can clean up ffs_sync() and friends, which have loop restart + * conditions which this code causes to operate O(N^2). 
+ */ +static void +vlruvp(struct vnode *vp) +{ +#if 0 + struct mount *mp; + + if ((mp = vp->v_mount) != NULL) { + mtx_lock(&mntvnode_mtx); + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); + } +#endif +} + +/* + * Disassociate the underlying filesystem from a vnode. + */ +static void +vclean(vp, flags, td) + struct vnode *vp; + int flags; + struct thread *td; +{ + int active; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + vp->v_vxproc = curthread; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); + + /* + * Clean out any buffers associated with the vnode. + * If the flush fails, just toss the buffers. + */ + if (flags & DOCLOSE) { + if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) + (void) vn_write_suspend_wait(vp, NULL, V_WAIT); + if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0) + vinvalbuf(vp, 0, NOCRED, td, 0, 0); + } + + VOP_DESTROYVOBJECT(vp); + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); + VOP_INACTIVE(vp, td); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, td); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, td)) + panic("vclean: cannot reclaim"); + + if (active) { + /* + * Inline copy of vrele() since VOP_INACTIVE + * has already been called. + */ + mtx_lock(&vp->v_interlock); + if (--vp->v_usecount <= 0) { +#ifdef DIAGNOSTIC + if (vp->v_usecount < 0 || vp->v_writecount != 0) { + vprint("vclean: bad ref count", vp); + panic("vclean: ref cnt"); + } +#endif + vfree(vp); + } + mtx_unlock(&vp->v_interlock); + } + + cache_purge(vp); + vp->v_vnlock = NULL; + lockdestroy(&vp->v_lock); + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + if (vp->v_pollinfo != NULL) + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. + */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + dev_t dev; + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. 
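+	 * Otherwise every vnode aliased to the same dev_t is torn down in
+	 * turn, always taking the first entry on the spechash list until
+	 * the list is empty.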
+ */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, + "vop_revokeall", 0); + return (0); + } + dev = vp->v_rdev; + for (;;) { + mtx_lock(&spechash_mtx); + vq = SLIST_FIRST(&dev->si_hlist); + mtx_unlock(&spechash_mtx); + if (!vq) + break; + vgone(vq); + } + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, td) + struct vnode *vp; + struct mtx *inter_lkp; + struct thread *td; +{ + + mtx_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + mtx_unlock(inter_lkp); + } + vgonel(vp, td); + return (1); + } + mtx_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + mtx_lock(&vp->v_interlock); + vgonel(vp, td); +} + +/* + * vgone, with the vp interlock held. + */ +void +vgonel(vp, td) + struct vnode *vp; + struct thread *td; +{ + int s; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, td); + mtx_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. + */ + if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { + mtx_lock(&spechash_mtx); + SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); + freedev(vp->v_rdev); + mtx_unlock(&spechash_mtx); + vp->v_rdev = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the + * VDOOMED flag and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + if (vp->v_flag & VFREE) + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + mtx_unlock(&vnode_free_list_mtx); + splx(s); + } + + vp->v_type = VBAD; + mtx_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + struct vnode *vp; + + mtx_lock(&spechash_mtx); + SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { + if (type == vp->v_type) { + *vpp = vp; + mtx_unlock(&spechash_mtx); + return (1); + } + } + mtx_unlock(&spechash_mtx); + return (0); +} + +/* + * Calculate the total number of references to a special device. 
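+ * The count is the sum of v_usecount over every vnode on the device's
+ * alias list, since a single dev_t may be referenced through several
+ * vnodes.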
+ */ +int +vcount(vp) + struct vnode *vp; +{ + struct vnode *vq; + int count; + + count = 0; + mtx_lock(&spechash_mtx); + SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) + count += vq->v_usecount; + mtx_unlock(&spechash_mtx); + return (count); +} + +/* + * Same as above, but using the dev_t as argument + */ +int +count_dev(dev) + dev_t dev; +{ + struct vnode *vp; + + vp = SLIST_FIRST(&dev->si_hlist); + if (vp == NULL) + return (0); + return(vcount(vp)); +} + +/* + * Print out a description of a vnode. + */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct thread *td = curthread; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + mtx_lock(&mntvnode_mtx); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (VOP_ISLOCKED(vp, NULL)) + vprint((char *)0, vp); + } + mtx_unlock(&mntvnode_mtx); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + + /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, td)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if COMPILING_LINT +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode(SYSCTL_HANDLER_ARGS) +{ + struct thread *td = curthread; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + mtx_lock(&mntvnode_mtx); +again: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) + goto again; + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + mtx_lock(&mntvnode_mtx); + } + mtx_unlock(&mntvnode_mtx); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + + return (0); +} + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. + */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + + if (vp->v_rdev->si_mountpoint != NULL) + return (EBUSY); + return (0); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. 
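+ * Each filesystem is unmounted with MNT_FORCE; if even a forced unmount
+ * fails, the mount is dropped from the mountlist anyway so that the
+ * loop is guaranteed to terminate.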
+ */ +void +vfs_unmountall() +{ + struct mount *mp; + struct thread *td; + int error; + + if (curthread != NULL) + td = curthread; + else + td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + while(!TAILQ_EMPTY(&mountlist)) { + mp = TAILQ_LAST(&mountlist, mntlist); + error = dounmount(mp, MNT_FORCE, td); + if (error) { + TAILQ_REMOVE(&mountlist, mp, mnt_list); + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } else { + /* The unmount has removed mp from the mountlist */ + } + } +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) +{ + struct vnode *vp, *nvp; + struct vm_object *obj; + int tries; + + GIANT_REQUIRED; + + tries = 5; + mtx_lock(&mntvnode_mtx); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { + if (vp->v_mount != mp) { + if (--tries > 0) + goto loop; + break; + } + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (vp->v_flag & VNOSYNC) /* unlinked, skip it */ + continue; + + if ((vp->v_flag & VOBJDIRTY) && + (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { + mtx_unlock(&mntvnode_mtx); + if (!vget(vp, + LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) { + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, + flags == MNT_WAIT ? + OBJPC_SYNC : OBJPC_NOSYNC); + } + vput(vp); + } + mtx_lock(&mntvnode_mtx); + if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { + if (--tries > 0) + goto loop; + break; + } + } + } + mtx_unlock(&mntvnode_mtx); +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, td, cred) + struct vnode *vp; + struct thread *td; + struct ucred *cred; +{ + GIANT_REQUIRED; + return (VOP_CREATEVOBJECT(vp, cred, td)); +} + +/* + * Mark a vnode as free, putting it up for recycling. + */ +void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + mtx_unlock(&vnode_free_list_mtx); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +/* + * Opposite of vfree() - mark a vnode as in use. + */ +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. Because poll uses the historic select-style interface + * internally, this routine serves as both the ``check for any + * pending events'' and the ``record my interest in future events'' + * functions. (These are done together, while the lock is held, + * to avoid race conditions.) 
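+ *
+ * Any previously recorded events that intersect the request are returned
+ * and cleared immediately; otherwise the interest is noted in vpi_events
+ * and the calling thread is registered with selrecord().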
+ */ +int +vn_pollrecord(vp, td, events) + struct vnode *vp; + struct thread *td; + short events; +{ + + if (vp->v_pollinfo == NULL) + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_revents & events) { + /* + * This leaves events we are not interested + * in available for the other process which + * which presumably had requested them + * (otherwise they would never have been + * recorded). + */ + events &= vp->v_pollinfo->vpi_revents; + vp->v_pollinfo->vpi_revents &= ~events; + + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return events; + } + vp->v_pollinfo->vpi_events |= events; + selrecord(td, &vp->v_pollinfo->vpi_selinfo); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return 0; +} + +/* + * Note the occurrence of an event. If the VN_POLLEVENT macro is used, + * it is possible for us to miss an event due to race conditions, but + * that condition is expected to be rare, so for the moment it is the + * preferred interface. + */ +void +vn_pollevent(vp, events) + struct vnode *vp; + short events; +{ + + if (vp->v_pollinfo == NULL) + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_events & events) { + /* + * We clear vpi_events so that we don't + * call selwakeup() twice if two events are + * posted before the polling process(es) is + * awakened. This also ensures that we take at + * most one selwakeup() if the polling process + * is no longer interested. However, it does + * mean that only one event can be noticed at + * a time. (Perhaps we should only clear those + * event bits which we note?) XXX + */ + vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */ + vp->v_pollinfo->vpi_revents |= events; + selwakeup(&vp->v_pollinfo->vpi_selinfo); + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); +} + +/* + * Wake up anyone polling on vp because it is being revoked. + * This depends on dead_poll() returning POLLHUP for correct + * behavior. + */ +void +vn_pollgone(vp) + struct vnode *vp; +{ + + mtx_lock(&vp->v_pollinfo->vpi_lock); + VN_KNOTE(vp, NOTE_REVOKE); + if (vp->v_pollinfo->vpi_events) { + vp->v_pollinfo->vpi_events = 0; + selwakeup(&vp->v_pollinfo->vpi_selinfo); + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); +} + + + +/* + * Routine to create and manage a filesystem syncer vnode. 
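+ *
+ * Each mount point gets at most one syncer vnode (mp->mnt_syncer);
+ * sync_fsync() implements the MNT_LAZY sync that runs when the syncer
+ * daemon reaches this vnode on its worklist.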
+ */ +#define sync_close ((int (*)(struct vop_close_args *))nullop) +static int sync_fsync(struct vop_fsync_args *); +static int sync_inactive(struct vop_inactive_args *); +static int sync_reclaim(struct vop_reclaim_args *); +#define sync_lock ((int (*)(struct vop_lock_args *))vop_nolock) +#define sync_unlock ((int (*)(struct vop_unlock_args *))vop_nounlock) +static int sync_print(struct vop_print_args *); +#define sync_islocked ((int(*)(struct vop_islocked_args *))vop_noislocked) + +static vop_t **sync_vnodeop_p; +static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_close_desc, (vop_t *) sync_close }, /* close */ + { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ + { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ + { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ + { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ + { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ + { &vop_print_desc, (vop_t *) sync_print }, /* print */ + { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ + { NULL, NULL } +}; +static struct vnodeopv_desc sync_vnodeop_opv_desc = + { &sync_vnodeop_p, sync_vnodeop_entries }; + +VNODEOP_SET(sync_vnodeop_opv_desc); + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +int +vfs_allocate_syncvnode(mp) + struct mount *mp; +{ + struct vnode *vp; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { + mp->mnt_syncer = NULL; + return (error); + } + vp->v_type = VNON; + /* + * Place the vnode onto the syncer worklist. We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); + mp->mnt_syncer = vp; + return (0); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct thread *a_td; + } */ *ap; +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + struct thread *td = ap->a_td; + int asyncflag; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + vn_syncer_add_to_worklist(syncvp, syncdelay); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + mtx_lock(&mountlist_mtx); + if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { + mtx_unlock(&mountlist_mtx); + return (0); + } + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp, td); + return (0); + } + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vn_finished_write(mp); + vfs_unbusy(mp, td); + return (0); +} + +/* + * The syncer vnode is no referenced. 
+ */ +static int +sync_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. + * + * Modifications to the worklist must be protected at splbio(). + */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int s; + + s = splbio(); + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + splx(s); + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} + +/* + * extract the dev_t from a VCHR + */ +dev_t +vn_todev(vp) + struct vnode *vp; +{ + if (vp->v_type != VCHR) + return (NODEV); + return (vp->v_rdev); +} + +/* + * Check if vnode represents a disk device + */ +int +vn_isdisk(vp, errp) + struct vnode *vp; + int *errp; +{ + struct cdevsw *cdevsw; + + if (vp->v_type != VCHR) { + if (errp != NULL) + *errp = ENOTBLK; + return (0); + } + if (vp->v_rdev == NULL) { + if (errp != NULL) + *errp = ENXIO; + return (0); + } + cdevsw = devsw(vp->v_rdev); + if (cdevsw == NULL) { + if (errp != NULL) + *errp = ENXIO; + return (0); + } + if (!(cdevsw->d_flags & D_DISK)) { + if (errp != NULL) + *errp = ENOTBLK; + return (0); + } + if (errp != NULL) + *errp = 0; + return (1); +} + +/* + * Free data allocated by namei(); see namei(9) for details. + */ +void +NDFREE(ndp, flags) + struct nameidata *ndp; + const uint flags; +{ + if (!(flags & NDF_NO_FREE_PNBUF) && + (ndp->ni_cnd.cn_flags & HASBUF)) { + uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + ndp->ni_cnd.cn_flags &= ~HASBUF; + } + if (!(flags & NDF_NO_DVP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKPARENT) && + ndp->ni_dvp != ndp->ni_vp) + VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); + if (!(flags & NDF_NO_DVP_RELE) && + (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + } + if (!(flags & NDF_NO_VP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) + VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); + if (!(flags & NDF_NO_VP_RELE) && + ndp->ni_vp) { + vrele(ndp->ni_vp); + ndp->ni_vp = NULL; + } + if (!(flags & NDF_NO_STARTDIR_RELE) && + (ndp->ni_cnd.cn_flags & SAVESTART)) { + vrele(ndp->ni_startdir); + ndp->ni_startdir = NULL; + } +} + +/* + * Common filesystem object access control check routine. Accepts a + * vnode's type, "mode", uid and gid, requested access mode, credentials, + * and optional call-by-reference privused argument allowing vaccess() + * to indicate to the caller whether privilege was used to satisfy the + * request. Returns 0 on success, or an errno on failure. + */ +int +vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) + enum vtype type; + mode_t file_mode; + uid_t file_uid; + gid_t file_gid; + mode_t acc_mode; + struct ucred *cred; + int *privused; +{ + mode_t dac_granted; +#ifdef CAPABILITIES + mode_t cap_granted; +#endif + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. + */ + + if (privused != NULL) + *privused = 0; + + dac_granted = 0; + + /* Check the owner. 
*/ + if (cred->cr_uid == file_uid) { + dac_granted |= VADMIN; + if (file_mode & S_IXUSR) + dac_granted |= VEXEC; + if (file_mode & S_IRUSR) + dac_granted |= VREAD; + if (file_mode & S_IWUSR) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + goto privcheck; + } + + /* Otherwise, check the groups (first match) */ + if (groupmember(file_gid, cred)) { + if (file_mode & S_IXGRP) + dac_granted |= VEXEC; + if (file_mode & S_IRGRP) + dac_granted |= VREAD; + if (file_mode & S_IWGRP) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + goto privcheck; + } + + /* Otherwise, check everyone else. */ + if (file_mode & S_IXOTH) + dac_granted |= VEXEC; + if (file_mode & S_IROTH) + dac_granted |= VREAD; + if (file_mode & S_IWOTH) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + +privcheck: + if (!suser_cred(cred, PRISON_ROOT)) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } + +#ifdef CAPABILITIES + /* + * Build a capability mask to determine if the set of capabilities + * satisfies the requirements when combined with the granted mask + * from above. + * For each capability, if the capability is required, bitwise + * or the request type onto the cap_granted mask. + */ + cap_granted = 0; + + if (type == VDIR) { + /* + * For directories, use CAP_DAC_READ_SEARCH to satisfy + * VEXEC requests, instead of CAP_DAC_EXECUTE. + */ + if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && + !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && + !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && + !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) + cap_granted |= VADMIN; + + if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } +#endif + + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 0000000..1244e54 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,4862 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysent.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/dirent.h> +#include <sys/extattr.h> +#include <sys/jail.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static int change_dir(struct nameidata *ndp, struct thread *td); +static void checkdirs(struct vnode *olddp, struct vnode *newdp); +static int chroot_refuse_vdir_fds(struct filedesc *fdp); +static int getutimes(const struct timeval *, struct timespec *); +static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); +static int setfmode(struct thread *td, struct vnode *, int); +static int setfflags(struct thread *td, struct vnode *, int); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int vfs_nmount(struct thread *td, int, struct uio *); + +static int usermount = 0; /* if 1, non-root can mount fs. 
*/ + +int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +/* ARGSUSED */ +int +nmount(td, uap) + struct thread *td; + struct nmount_args /* { + syscallarg(struct iovec *) iovp; + syscallarg(unsigned int) iovcnt; + syscallarg(int) flags; + } */ *uap; +{ + struct uio auio; + struct iovec *iov, *needfree; + struct iovec aiov[UIO_SMALLIOV]; + unsigned int i; + int error; + u_int iovlen, iovcnt; + + iovcnt = SCARG(uap, iovcnt); + iovlen = iovcnt * sizeof (struct iovec); + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV)) + return (EINVAL); + + if (iovcnt > UIO_SMALLIOV) { + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_USERSPACE; + if ((error = copyin(uap->iovp, iov, iovlen))) + goto finish; + + for (i = 0; i < iovcnt; i++) { + if (iov->iov_len > MMAXOPTIONLEN) { + error = EINVAL; + goto finish; + } + iov++; + } + error = vfs_nmount(td, SCARG(uap, flags), &auio); +finish: + if (needfree != NULL) + free(needfree, M_TEMP); + return (error); +} + +/* + * Release all resources related to the + * mount options. + */ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); + } + free(opts, M_MOUNT); +} + +int +kernel_mount(iovp, iovcnt, flags) + struct iovec *iovp; + unsigned int iovcnt; + int flags; +{ + struct uio auio; + int error; + + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) + return (EINVAL); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + return (error); +} + +int +kernel_vmount(int flags, ...) +{ + struct iovec *iovp; + struct uio auio; + va_list ap; + unsigned int iovcnt, iovlen, len; + const char *cp; + char *buf, *pos; + size_t n; + int error, i; + + len = 0; + va_start(ap, flags); + for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++) + len += strlen(cp) + 1; + va_end(ap); + + if (iovcnt < 4 || iovcnt & 1) + return (EINVAL); + + iovlen = iovcnt * sizeof (struct iovec); + MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK); + MALLOC(buf, char *, len, M_MOUNT, M_WAITOK); + pos = buf; + va_start(ap, flags); + for (i = 0; i < iovcnt; i++) { + cp = va_arg(ap, const char *); + copystr(cp, pos, len - (pos - buf), &n); + iovp[i].iov_base = pos; + iovp[i].iov_len = n; + pos += n; + } + va_end(ap); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + FREE(iovp, M_MOUNT); + FREE(buf, M_MOUNT); + return (error); +} + +/* + * vfs_nmount(): actually attempt a filesystem mount. + */ +static int +vfs_nmount(td, fsflags, fsoptions) + struct thread *td; + int fsflags; /* Flags common to all filesystems. */ + struct uio *fsoptions; /* Options local to the filesystem. 
*/ +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + struct vfsoptlist *optlist; + char *fstype, *fspath; + int error, flag = 0, kern_flag = 0; + int fstypelen, fspathlen; + struct vattr va; + struct nameidata nd; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + goto bad; + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { + error = ENAMETOOLONG; + goto bad; + } + + if (usermount == 0) { + error = suser(td); + if (error) + goto bad; + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + goto bad; + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + error = EINVAL; + goto bad; + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + error = EOPNOTSUPP; /* Needs translation */ + goto bad; + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + error = EBUSY; + goto bad; + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + goto bad; + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + goto bad; + } + if (vp->v_type != VDIR) { + vput(vp); + error = ENOTDIR; + goto bad; + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). 
*/ + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + goto bad; + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + goto bad; + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + error = ENODEV; + goto bad; + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); + +update: + mp->mnt_optnew = optlist; + /* + * Check if the fs implements the new VFS_NMOUNT() + * function, since the new system call was used. + */ + if (mp->mnt_op->vfs_mount != NULL) { + printf("%s doesn't support the new mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + error = EOPNOTSUPP; + goto bad; + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_NMOUNT(mp, &nd, td); + if (!error) { + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + } + /* + * Prevent external consumers of mount + * options to read mnt_optnew. + */ + mp->mnt_optnew = NULL; + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) { + vrele(vp); + goto bad; + } + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + goto bad; + } + return (0); +bad: + vfs_freeopts(optlist); + return (error); +} + +/* + * Old Mount API. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(td, uap) + struct thread *td; + struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + char *fstype; + char *fspath; + int error; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK); + + /* + * vfs_mount() actually takes a kernel string for `type' and + * `path' now, so extract them. + */ + error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); + if (error) + goto finish; + error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); + if (error) + goto finish; + error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), + SCARG(uap, data)); +finish: + free(fstype, M_TEMP); + free(fspath, M_TEMP); + return (error); +} + +/* + * vfs_mount(): actually attempt a filesystem mount. + * + * This routine is designed to be a "generic" entry point for routines + * that wish to mount a filesystem. All parameters except `fsdata' are + * pointers into kernel space. `fsdata' is currently still a pointer + * into userspace. + */ +int +vfs_mount(td, fstype, fspath, fsflags, fsdata) + struct thread *td; + const char *fstype; + char *fspath; + int fsflags; + void *fsdata; +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, kern_flag = 0; + struct vattr va; + struct nameidata nd; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (usermount == 0) { + error = suser(td); + if (error) + return (error); + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. 
+ */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + return (EBUSY); + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + return (error); + } + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + return (error); + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + return (error); + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return (error); + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); +update: + /* + * Check if the fs implements the old VFS_MOUNT() + * function, since the old system call was used. + */ + if (mp->mnt_op->vfs_mount == NULL) { + printf("%s doesn't support the old mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + return (EOPNOTSUPP); + } + + /* + * Set the mount level flags. 
+ */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) + vrele(vp); + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new + * mount point. + */ +static void +checkdirs(olddp, newdp) + struct vnode *olddp, *newdp; +{ + struct filedesc *fdp; + struct proc *p; + int nrele; + + if (olddp->v_usecount == 1) + return; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + fdp = p->p_fd; + if (fdp == NULL) { + PROC_UNLOCK(p); + continue; + } + nrele = 0; + FILEDESC_LOCK(fdp); + if (fdp->fd_cdir == olddp) { + VREF(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + VREF(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + FILEDESC_UNLOCK(fdp); + PROC_UNLOCK(p); + while (nrele--) + vrele(olddp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(td, uap) + struct thread *td; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + + /* + * Don't allow unmounting the root filesystem. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), td)); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + struct vnode *coveredvp, *fsrootvp; + int error; + int async_flag; + + mtx_lock(&mountlist_mtx); + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + mtx_unlock(&mountlist_mtx); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | + ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); + if (error) { + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + vn_start_write(NULL, &mp, V_WAIT); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + /* Move process cdir/rdir refs on fs root to underlying vnode. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(fsrootvp, mp->mnt_vnodecovered); + if (fsrootvp == rootvnode) { + vrele(rootvnode); + rootvnode = NULL; + } + vput(fsrootvp); + } + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) || + (flags & MNT_FORCE)) { + error = VFS_UNMOUNT(mp, flags, td); + } + vn_finished_write(mp); + if (error) { + /* Undo cdir/rdir and rootvnode changes made above. 
*/ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(mp->mnt_vnodecovered, fsrootvp); + if (rootvnode == NULL) { + rootvnode = fsrootvp; + vref(rootvnode); + } + vput(fsrootvp); + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mtx_lock(&mountlist_mtx); + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, + &mountlist_mtx, td); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) + coveredvp->v_mountedhere = NULL; + mp->mnt_vfc->vfc_refcount--; + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); + lockdestroy(&mp->mnt_lock); + if (coveredvp != NULL) + vrele(coveredvp); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + if (mp->mnt_op->vfs_mount == NULL) + vfs_freeopts(mp->mnt_opt); + free(mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(td, uap) + struct thread *td; + struct sync_args *uap; +{ + struct mount *mp, *nmp; + int asyncflag; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((td != NULL) ? td->td_ucred : NOCRED), td); + mp->mnt_flag |= asyncflag; + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* XXX PRISON: could be per prison flag */ +static int prison_quotas; +#if 0 +SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); +#endif + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(td, uap) + struct thread *td; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + struct mount *mp; + int error; + struct nameidata nd; + + if (jailed(td->td_ucred) && !prison_quotas) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), td); + vn_finished_write(mp); + return (error); +} + +/* + * Get filesystem statistics. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(td, uap) + struct thread *td; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(td, uap) + struct thread *td; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(td, uap) + struct thread *td; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout(sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(td, uap) + struct thread *td; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct vnode *vp, *tdp, *vpold; + struct mount *mp; + struct file *fp; + int error; + + if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + fdrop(fp, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, td)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(td, uap) + struct thread *td; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; + fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(fdp) + struct filedesc *fdp; +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK(fdp); + for (fd = 0; fd < fdp->fd_nfiles ; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VDIR) { + FILEDESC_UNLOCK(fdp); + return (EPERM); + } + } + } + FILEDESC_UNLOCK(fdp); + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. + * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, ""); + +/* + * Change notion of root (``/'') directory. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(td, uap) + struct thread *td; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + FILEDESC_LOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); + error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; + fdp->fd_rdir = nd.ni_vp; + if (!fdp->fd_jdir) { + fdp->fd_jdir = nd.ni_vp; + VREF(fdp->fd_jdir); + } + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, td) + register struct nameidata *ndp; + struct thread *td; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, td); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(td, uap) + struct thread *td; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct vattr vat; + struct mount *mp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(td, &nfp, &indx); + if (error) + return (error); + fp = nfp; + FILEDESC_LOCK(fdp); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + td->td_dupfd = -indx - 1; /* XXX check for fdopen */ + /* + * Bump the ref count to prevent another process from closing + * the descriptor while we are blocked in vn_open() + */ + fhold(fp); + error = vn_open(&nd, &flags, cmode); + if (error) { + /* + * release our own reference + */ + fdrop(fp, td); + + /* + * handle special fdopen() case. bleh. dupfdopen() is + * responsible for dropping the old contents of ofiles[indx] + * if it succeeds. + */ + if ((error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { + td->td_retval[0] = indx; + return (0); + } + /* + * Clean up the descriptor, but only if another thread hadn't + * replaced or closed it. 
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + + if (error == ERESTART) + error = EINTR; + return (error); + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * There should be 2 references on the file, one from the descriptor + * table, and one for us. + * + * Handle the case where someone closed the file (via its file + * descriptor) while we were blocked. The end result should look + * like opening the file succeeded but it was immediately closed. + */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); + if (fp->f_count == 1) { + KASSERT(fdp->fd_ofiles[indx] != fp, + ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + vn_close(vp, flags & FMASK, fp->f_cred, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return 0; + } + + /* assert that vn_open created a backing object if one is needed */ + KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, + ("open: vmio vnode has no backing object after vn_open")); + + fp->f_data = vp; + fp->f_flag = flags & FMASK; + fp->f_ops = &vnops; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) + goto bad; + fp->f_flag |= FHASLOCK; + } + if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + VATTR_NULL(&vat); + vat.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETATTR(vp, &vat, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + if (error) + goto bad; + } + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(td, uap) + struct thread *td; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(td, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(td, uap) + struct thread *td; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = suser(td); + break; + default: + error = suser_cred(td->td_ucred, PRISON_ROOT); + break; + } + if (error) + return (error); +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + vrele(vp); + error = EEXIST; + } else { + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(td, uap) + struct thread *td; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(td, uap) + struct thread *td; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct nameidata nd; + int error; + + bwillwrite(); + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + } + vrele(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(td, uap) + struct thread *td; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) + goto out; +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == 0) + vput(nd.ni_vp); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + uma_zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. 
+ */ +/* ARGSUSED */ +int +undelete(td, uap) + struct thread *td; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct mount *mp; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), td); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp) + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(td, uap) + struct thread *td; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(td, uap) + struct thread *td; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = td->td_ucred; + struct file *fp; + struct vnode *vp; + struct vattr vattr; + off_t offset; + int error, noneg; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (ESPIPE); + } + vp = (struct vnode *)fp->f_data; + noneg = (vp->v_type != VCHR); + offset = SCARG(uap, offset); + switch (SCARG(uap, whence)) { + case L_INCR: + if (noneg && + (fp->f_offset < 0 || + (offset > 0 && fp->f_offset > OFF_MAX - offset))) + return (EOVERFLOW); + offset += fp->f_offset; + break; + case L_XTND: + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) + return (EOVERFLOW); + offset += vattr.va_size; + break; + case L_SET: + break; + default: + fdrop(fp, td); + return (EINVAL); + } + if (noneg && offset < 0) + return (EINVAL); + fp->f_offset = offset; + *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(td, uap) + struct thread *td; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(td, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions using passed credentials. + */ +static int +vn_access(vp, user_flags, cred, td) + struct vnode *vp; + int user_flags; + struct ucred *cred; + struct thread *td; +{ + int error, flags; + + /* Flags == 0 means only check for existence. */ + error = 0; + if (user_flags) { + flags = 0; + if (user_flags & R_OK) + flags |= VREAD; + if (user_flags & W_OK) + flags |= VWRITE; + if (user_flags & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, td); + } + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(td, uap) + struct thread *td; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct ucred *cred, *tmpcred; + register struct vnode *vp; + int error; + struct nameidata nd; + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared. This could also mess up socket + * buffer accounting which can run in an interrupt context. + * + * XXX - Depending on how "threads" are finally implemented, it + * may be better to explicitly pass the credential to namei() + * rather than to modify the potentially shared process structure. 
+ */ + cred = td->td_ucred; + tmpcred = crdup(cred); + tmpcred->cr_uid = cred->cr_ruid; + tmpcred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = tmpcred; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + goto out1; + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), tmpcred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out1: + td->td_ucred = cred; + crfree(tmpcred); + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int flags; +}; +#endif +int +eaccess(td, uap) + struct thread *td; + register struct eaccess_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct nameidata nd; + struct vnode *vp; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(td, uap) + struct thread *td; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(td, uap) + struct thread *td; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(td, uap) + struct thread *td; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + +#ifdef LOOKUP_SHARED + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, + UIO_USERSPACE, SCARG(uap, path), td); +#else + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); +#endif + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(td, uap) + struct thread *td; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Implementation of the NetBSD stat() function. + * XXX This should probably be collapsed with the FreeBSD version, + * as the differences are only due to vn_stat() clearing spares at + * the end of the structures. vn_stat could be split to avoid this, + * and thus collapse the following to close to zero code. + */ +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + bzero(nsb, sizeof *nsb); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_createtimespec = sb->st_createtimespec; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(td, uap) + struct thread *td; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+ vput(vp);
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(td, uap) + struct thread *td; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +int +lchflags(td, uap) + struct thread *td; + register struct lchflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(td, uap) + struct thread *td; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +static int +setfmode(td, vp, mode) + struct thread *td; + struct vnode *vp; + int mode; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(td, uap) + struct thread *td; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(td, uap) + struct thread *td; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(td, uap) + struct thread *td; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +static int +setfown(td, vp, uid, gid) + struct thread *td; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(td, uap) + struct thread *td; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(td, uap) + struct thread *td; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(td, uap) + struct thread *td; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). 
+ */ +static int +getutimes(usrtvp, tsp) + const struct timeval *usrtvp; + struct timespec *tsp; +{ + struct timeval tv[2]; + int error; + + if (usrtvp == NULL) { + microtime(&tv[0]); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + tsp[1] = tsp[0]; + } else { + if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) + return (error); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); + } + return 0; +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +setutimes(td, vp, ts, nullflag) + struct thread *td; + struct vnode *vp; + const struct timespec *ts; + int nullflag; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(td, uap) + struct thread *td; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(td, uap) + struct thread *td; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(td, uap) + struct thread *td; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct file *fp; + struct timeval *usrtvp; + int error; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); +} + +/* + * Truncate a file given its path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(td, uap) + struct thread *td; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + } + vput(vp); + vn_finished_write(mp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(td, uap) + struct thread *td; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); + return (EINVAL); + } + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(td, uap) + struct thread *td; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(td, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(td, uap) + struct thread *td; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(td, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(td, uap) + struct thread *td; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + vm_object_t obj; + int error; + + GIANT_REQUIRED; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, 0); + } + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); +#ifdef SOFTUPDATES + if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) + error = softdep_fsync(vp); +#endif + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(td, uap) + struct thread *td; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + bwillwrite(); + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), td); + if ((error = namei(&fromnd)) != 0) + return (error); + fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(td, uap) + struct thread *td; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + + return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); +} + +int +vn_mkdir(path, mode, segflg, td) + char *path; + int mode; + enum uio_seg segflg; + struct thread *td; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(td, uap) + struct thread *td; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(td, uap) + struct thread *td; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + /* XXX arbitrary sanity limit on `count'. */ + if (SCARG(uap, count) > 64 * 1024) + return (EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. 
+ * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(td, uap) + struct thread *td; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + if (SCARG(uap, basep) != NULL) { + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + } + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(td, uap) + struct thread *td; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return 
getdirentries(td, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + * + * MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(td, uap) + struct thread *td; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + FILEDESC_LOCK(td->td_proc->p_fd); + fdp = td->td_proc->p_fd; + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(td, uap) + struct thread *td; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR) { + vput(vp); + return (EINVAL); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + goto out; + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. + */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + int error; + struct file *fp; + + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return (error); +} +/* + * Get (NFS) file handle + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +getfh(td, uap) + struct thread *td; + register struct getfh_args *uap; +{ + struct nameidata nd; + fhandle_t fh; + register struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout(&fh, uap->fhp, sizeof (fh)); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into + * an open descriptor. + * + * warning: do not remove the suser() call or this becomes one giant + * security hole. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +fhopen(td, uap) + struct thread *td; + struct fhopen_args /* { + syscallarg(const struct fhandle *) u_fhp; + syscallarg(int) flags; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct vattr vat; + struct vattr *vap = &vat; + struct flock lf; + struct file *fp; + register struct filedesc *fdp = p->p_fd; + int fmode, mode, error, type; + struct file *nfp; + int indx; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + fmode = FFLAGS(SCARG(uap, flags)); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); + if (error) + return(error); + /* find the mount point */ + mp = vfs_getvfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); + if (error) + return (error); + /* + * from now on we have to make sure not + * to forget about the vnode + * any error that causes an abort must vput(vp) + * just set error = err and 'goto bad;'. + */ + + /* + * from vn_open + */ + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, td->td_ucred, td); + if (error) + goto bad; + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, td); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, td->td_ucred, td); + vn_finished_write(mp); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, td->td_ucred, td); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { + if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) + goto bad; + } + if (fmode & FWRITE) + vp->v_writecount++; + + /* + * end of vn_open code + */ + + if ((error = falloc(td, &nfp, &indx)) != 0) { + if (fmode & FWRITE) + vp->v_writecount--; + goto bad; + } + fp = nfp; + + /* + * Hold an extra reference to avoid having fp ripped out + * from under us while we block in the lock op + */ + fhold(fp); + nfp->f_data = vp; + nfp->f_flag = fmode & FMASK; + nfp->f_ops = &vnops; + nfp->f_type = DTYPE_VNODE; + if (fmode & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (fmode & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, td); + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) { + /* + * The lock request failed. Normally close the + * descriptor but handle the case where someone might + * have dup()d or close()d it when we weren't looking. 
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + /* + * release our private reference + */ + fdrop(fp, td); + return(error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) + vfs_object_create(vp, td, td->td_ucred); + + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); + +bad: + vput(vp); + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +fhstat(td, uap) + struct thread *td; + register struct fhstat_args /* { + syscallarg(struct fhandle *) u_fhp; + syscallarg(struct stat *) sb; + } */ *uap; +{ + struct stat sb; + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); + if (error) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +fhstatfs(td, uap) + struct thread *td; + struct fhstatfs_args /* { + syscallarg(struct fhandle) *u_fhp; + syscallarg(struct statfs) *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct statfs sb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Syscall to push extended attribute configuration information into the + * VFS. Accepts a path, which it converts to a mountpoint, as well as + * a command (int cmd), and attribute name and misc data. For now, the + * attribute name is left in userspace for consumption by the VFS_op. + * It will probably be changed to be copied into sysspace by the + * syscall in the future, once issues with various consumers of the + * attribute code have raised their hands. + * + * Currently this is used only by UFS Extended Attributes. + */ +int +extattrctl(td, uap) + struct thread *td; + struct extattrctl_args /* { + syscallarg(const char *) path; + syscallarg(int) cmd; + syscallarg(const char *) filename; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + /* + * uap->attrname is not always defined. 
We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + + /* + * uap->filename is not always defined. If it is, grab a vnode lock, + * which VFS_EXTATTRCTL() will later release. + */ + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + uap->filename, td); + if ((error = namei(&nd)) != 0) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, 0); + if (error) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + + if (uap->attrname != NULL) { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, attrname, td); + } else { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, NULL, td); + } + + vn_finished_write(mp_writable); + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, + * filename_vp, so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +done: + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_set_file(td, uap) + struct thread *td; + struct extattr_set_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int 
+extattr_set_fd(td, uap) + struct thread *td; + struct extattr_set_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + + error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the + * maximum read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; + +done: + VOP_UNLOCK(vp, 0, td); + return (error); +} + +int +extattr_get_file(td, uap) + struct thread *td; + struct extattr_get_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int +extattr_get_fd(td, uap) + struct thread *td; + struct extattr_get_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + + error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, 
uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, + td); + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_delete_file(td, uap) + struct thread *td; + struct extattr_delete_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); + + vrele(nd.ni_vp); + return(error); +} + +int +extattr_delete_fd(td, uap) + struct thread *td; + struct extattr_delete_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + + error = extattr_delete_vp((struct vnode *)fp->f_data, + uap->attrnamespace, attrname, td); + + fdrop(fp, td); + return (error); +} diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c new file mode 100644 index 0000000..77568c2 --- /dev/null +++ b/sys/kern/vfs_vnops.c @@ -0,0 +1,1056 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/filio.h> +#include <sys/sx.h> +#include <sys/ttycom.h> +#include <sys/conf.h> +#include <sys/syslog.h> + +#include <machine/limits.h> + +static int vn_closefile(struct file *fp, struct thread *td); +static int vn_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int vn_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int vn_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int vn_kqfilter(struct file *fp, struct knote *kn); +static int vn_statfile(struct file *fp, struct stat *sb, struct thread *td); +static int vn_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); + +struct fileops vnops = { + vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter, + vn_statfile, vn_closefile +}; + +int +vn_open(ndp, flagp, cmode) + register struct nameidata *ndp; + int *flagp, cmode; +{ + struct thread *td = ndp->ni_cnd.cn_thread; + + return (vn_open_cred(ndp, flagp, cmode, td->td_ucred)); +} + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. + * + * Note that this does NOT free nameidata for the successful case, + * due to the NDINIT being done elsewhere. 
+ */ +int +vn_open_cred(ndp, flagp, cmode, cred) + register struct nameidata *ndp; + int *flagp, cmode; + struct ucred *cred; +{ + struct vnode *vp; + struct mount *mp; + struct thread *td = ndp->ni_cnd.cn_thread; + struct vattr vat; + struct vattr *vap = &vat; + int mode, fmode, error; +#ifdef LOOKUP_SHARED + int exclusive; /* The current intended lock state */ + + exclusive = 0; +#endif + +restart: + fmode = *flagp; + if (fmode & O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + bwillwrite(); + if ((error = namei(ndp)) != 0) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(ndp->ni_dvp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + vn_finished_write(mp); + if (error) { + NDFREE(ndp, NDF_ONLY_PNBUF); + return (error); + } + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); + ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; +#ifdef LOOKUP_SHARED + exclusive = 1; +#endif + } else { + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; +#ifdef LOOKUP_SHARED + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | + LOCKSHARED | LOCKLEAF; +#else + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; +#endif + if ((error = namei(ndp)) != 0) + return (error); + vp = ndp->ni_vp; + } + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + if ((fmode & O_CREAT) == 0) { + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, cred, td); + if (error) + goto bad; + } + } + if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { +#ifdef LOOKUP_SHARED + int flock; + + if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0) + VOP_LOCK(vp, LK_UPGRADE, td); + /* + * In cases where the object is marked as dead object_create + * will unlock and relock exclusive. It is safe to call in + * here with a shared lock because we only examine fields that + * the shared lock guarantees will be stable. In the UPGRADE + * case it is not likely that anyone has used this vnode yet + * so there will be no contention. The logic after this call + * restores the requested locking state. 
+ */ +#endif + if ((error = vfs_object_create(vp, td, cred)) != 0) { + VOP_UNLOCK(vp, 0, td); + VOP_CLOSE(vp, fmode, cred, td); + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(vp); + *flagp = fmode; + return (error); + } +#ifdef LOOKUP_SHARED + flock = VOP_ISLOCKED(vp, td); + if (!exclusive && flock == LK_EXCLUSIVE) + VOP_LOCK(vp, LK_DOWNGRADE, td); +#endif + } + + if (fmode & FWRITE) + vp->v_writecount++; + *flagp = fmode; + return (0); +bad: + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(vp); + *flagp = fmode; + return (error); +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + */ +int +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if (vp->v_flag & VTEXT) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +int +vn_close(vp, flags, cred, td) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct thread *td; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, td); + /* + * XXX - In certain instances VOP_CLOSE has to do the vrele + * itself. If the vrele has been done, it will return EAGAIN + * to indicate that the vrele should not be done again. When + * this happens, we just return success. The correct thing to + * do would be to have all VOP_CLOSE instances do the vrele. + */ + if (error == EAGAIN) + return (0); + vrele(vp); + return (error); +} + +/* + * Sequential heuristic - detect sequential operation + */ +static __inline +int +sequential_heuristic(struct uio *uio, struct file *fp) +{ + + if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || + uio->uio_offset == fp->f_nextoff) { + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; + if (fp->f_seqcount >= 127) + fp->f_seqcount = 127; + return(fp->f_seqcount << 16); + } + + /* + * Not sequential, quick draw-down of seqcount + */ + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + return(0); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
+ */ +int +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct thread *td; +{ + struct uio auio; + struct iovec aiov; + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + mp = NULL; + if (rw == UIO_WRITE) { + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) + != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } else { + vn_lock(vp, LK_SHARED | LK_RETRY, td); + } + + } + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_WRITE) + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. The I/O + * request is split up into smaller chunks and we try to avoid saturating + * the buffer cache while potentially holding a vnode locked, so we + * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() + * to give other processes a chance to lock the vnode (either other processes + * core'ing the same binary, or unrelated processes scanning the directory). + */ +int +vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct thread *td; +{ + int error = 0; + + do { + int chunk = (len > MAXBSIZE) ? MAXBSIZE : len; + + if (rw != UIO_READ && vp->v_type == VREG) + bwillwrite(); + error = vn_rdwr(rw, vp, base, chunk, offset, segflg, + ioflg, cred, aresid, td); + len -= chunk; /* aresid calc already includes length */ + if (error) + break; + offset += chunk; + base += chunk; + uio_yield(); + } while (len); + if (aresid) + *aresid += len; + return (error); +} + +/* + * File table vnode read routine. + */ +static int +vn_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct vnode *vp; + int error, ioflag; + + mtx_lock(&Giant); + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + vp = (struct vnode *)fp->f_data; + ioflag = 0; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + VOP_LEASE(vp, td, cred, LEASE_READ); + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + ioflag |= sequential_heuristic(uio, fp); + + error = VOP_READ(vp, uio, ioflag, cred); + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0, td); + mtx_unlock(&Giant); + return (error); +} + +/* + * File table vnode write routine. 
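vn_rdwr_inchunks() above trades one long-held vnode lock for many short ones: each pass transfers at most MAXBSIZE bytes, calls bwillwrite() so the buffer cache is not saturated, and yields so competing lockers can make progress. The same chunking idea, sketched in userland with pwrite(2) standing in for vn_rdwr() and an arbitrary 64 KB bound standing in for MAXBSIZE (/tmp/chunk_demo is just an example path):

	#include <sys/types.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define CHUNK	65536	/* stand-in for MAXBSIZE; any bound works */

	/* Write len bytes in bounded chunks, as vn_rdwr_inchunks() does in-kernel. */
	static int
	write_inchunks(int fd, const char *base, size_t len, off_t offset)
	{
		while (len != 0) {
			size_t chunk = len > CHUNK ? CHUNK : len;
			ssize_t n = pwrite(fd, base, chunk, offset);

			if (n == -1)
				return (-1);
			base += n;
			offset += n;
			len -= (size_t)n;
			/* The kernel calls uio_yield() here so other lockers can run. */
		}
		return (0);
	}

	int
	main(void)
	{
		static char buf[1 << 20];	/* 1 MB of zeroes */
		int fd = open("/tmp/chunk_demo", O_CREAT | O_TRUNC | O_WRONLY, 0644);

		if (fd == -1 || write_inchunks(fd, buf, sizeof(buf), 0) == -1) {
			perror("write_inchunks");
			return (1);
		}
		close(fd);
		return (0);
	}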
+ */ +static int +vn_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct vnode *vp; + struct mount *mp; + int error, ioflag; + + mtx_lock(&Giant); + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VREG) + bwillwrite(); + ioflag = IO_UNIT; + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + mp = NULL; + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + mtx_unlock(&Giant); + return (error); + } + VOP_LEASE(vp, td, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); + error = VOP_WRITE(vp, uio, ioflag, cred); + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + mtx_unlock(&Giant); + return (error); +} + +/* + * File table vnode stat routine. + */ +static int +vn_statfile(fp, sb, td) + struct file *fp; + struct stat *sb; + struct thread *td; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + int error; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = vn_stat(vp, sb, td); + VOP_UNLOCK(vp, 0, td); + + return (error); +} + +/* + * Stat a vnode; implementation for the stat syscall + */ +int +vn_stat(vp, sb, td) + struct vnode *vp; + register struct stat *sb; + struct thread *td; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, td->td_ucred, td); + if (error) + return (error); + + /* + * Zero the spare stat fields + */ + bzero(sb, sizeof *sb); + + /* + * Copy from vattr table + */ + if (vap->va_fsid != VNOVAL) + sb->st_dev = vap->va_fsid; + else + sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + /* This is a cosmetic change, symlinks do not have a mode. */ + if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) + sb->st_mode &= ~ACCESSPERMS; /* 0000 */ + else + sb->st_mode |= ACCESSPERMS; /* 0777 */ + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + if (vap->va_size > OFF_MAX) + return (EOVERFLOW); + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec = vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_createtimespec = vap->va_createtime; + + /* + * According to www.opengroup.org, the meaning of st_blksize is + * "a filesystem-specific preferred I/O block size for this + * object. In some filesystem types, this may vary from file + * to file" + * Default to PAGE_SIZE after much discussion. 
+ */ + + if (vap->va_type == VREG) { + sb->st_blksize = vap->va_blocksize; + } else if (vn_isdisk(vp, NULL)) { + sb->st_blksize = vp->v_rdev->si_bsize_best; + if (sb->st_blksize < vp->v_rdev->si_bsize_phys) + sb->st_blksize = vp->v_rdev->si_bsize_phys; + if (sb->st_blksize < BLKDEV_IOSIZE) + sb->st_blksize = BLKDEV_IOSIZE; + } else { + sb->st_blksize = PAGE_SIZE; + } + + sb->st_flags = vap->va_flags; + if (suser(td)) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else + sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(fp, com, data, td) + struct file *fp; + u_long com; + caddr_t data; + struct thread *td; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vnode *vpold; + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... */ + + default: +#if 0 + return (ENOTTY); +#endif + case VFIFO: + case VCHR: + case VBLK: + if (com == FIODTYPE) { + if (vp->v_type != VCHR && vp->v_type != VBLK) + return (ENOTTY); + *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK; + return (0); + } + error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_ucred, td); + if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + sx_slock(&proctree_lock); + if (td->td_proc->p_session->s_ttyvp == vp) { + sx_sunlock(&proctree_lock); + return (0); + } + + vpold = td->td_proc->p_session->s_ttyvp; + VREF(vp); + SESS_LOCK(td->td_proc->p_session); + td->td_proc->p_session->s_ttyvp = vp; + SESS_UNLOCK(td->td_proc->p_session); + + sx_sunlock(&proctree_lock); + + /* Get rid of reference to old control tty */ + if (vpold) + vrele(vpold); + } + return (error); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + + return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td)); +} + +/* + * Check that the vnode is still valid, and if so + * acquire requested lock. + */ +int +#ifndef DEBUG_LOCKS +vn_lock(vp, flags, td) +#else +debug_vn_lock(vp, flags, td, filename, line) +#endif + struct vnode *vp; + int flags; + struct thread *td; +#ifdef DEBUG_LOCKS + const char *filename; + int line; +#endif +{ + int error; + + do { + if ((flags & LK_INTERLOCK) == 0) + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, + "vn_lock", 0); + error = ENOENT; + } else { +#if 0 + /* this can now occur in normal operation */ + if (vp->v_vxproc != NULL) + log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n"); +#endif +#ifdef DEBUG_LOCKS + vp->filename = filename; + vp->line = line; +#endif + error = VOP_LOCK(vp, + flags | LK_NOPAUSE | LK_INTERLOCK, td); + if (error == 0) + return (error); + } + flags &= ~LK_INTERLOCK; + } while (flags & LK_RETRY); + return (error); +} + +/* + * File table vnode close routine. 
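Two of the paths above are directly observable from userland: vn_stat() chooses the st_blksize reported by stat(2) (the vattr block size for regular files, the device's best block size for disks, PAGE_SIZE otherwise), and the FIONREAD case in vn_ioctl() reports va_size - f_offset for regular files and directories. A small illustration; any readable regular file will do, /etc/services is only an example.

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		const char *path = "/etc/services";	/* any regular file */
		struct stat sb;
		int fd, nread;

		fd = open(path, O_RDONLY);
		if (fd == -1 || fstat(fd, &sb) == -1)
			return (1);
		printf("st_blksize %ld st_blocks %lld\n",
		    (long)sb.st_blksize, (long long)sb.st_blocks);

		/* FIONREAD on a regular file: bytes between f_offset and EOF. */
		if (ioctl(fd, FIONREAD, &nread) == 0)
			printf("readable now: %d bytes\n", nread);
		close(fd);
		return (0);
	}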
+ */ +static int +vn_closefile(fp, td) + struct file *fp; + struct thread *td; +{ + + fp->f_ops = &badfileops; + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, td)); +} + +/* + * Preparing to start a filesystem write operation. If the operation is + * permitted, then we bump the count of operations in progress and + * proceed. If a suspend request is in progress, we wait until the + * suspension is over, and then proceed. + */ +int +vn_start_write(vp, mpp, flags) + struct vnode *vp; + struct mount **mpp; + int flags; +{ + struct mount *mp; + int error; + + /* + * If a vnode is provided, get and return the mount point that + * to which it will write. + */ + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + if ((mp = *mpp) == NULL) + return (0); + /* + * Check on status of suspension. + */ + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0); + if (error) + return (error); + } + if (flags & V_XSLEEP) + return (0); + mp->mnt_writeopcount++; + return (0); +} + +/* + * Secondary suspension. Used by operations such as vop_inactive + * routines that are needed by the higher level functions. These + * are allowed to proceed until all the higher level functions have + * completed (indicated by mnt_writeopcount dropping to zero). At that + * time, these operations are halted until the suspension is over. + */ +int +vn_write_suspend_wait(vp, mp, flags) + struct vnode *vp; + struct mount *mp; + int flags; +{ + int error; + + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + /* + * If we are not suspended or have not yet reached suspended + * mode, then let the operation proceed. + */ + if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) + return (0); + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + /* + * Wait for the suspension to finish. + */ + return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0)); +} + +/* + * Filesystem write operation has completed. If we are suspending and this + * operation is the last one, notify the suspender that the suspension is + * now in effect. + */ +void +vn_finished_write(mp) + struct mount *mp; +{ + + if (mp == NULL) + return; + mp->mnt_writeopcount--; + if (mp->mnt_writeopcount < 0) + panic("vn_finished_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_writeopcount <= 0) + wakeup(&mp->mnt_writeopcount); +} + +/* + * Request a filesystem to suspend write operations. + */ +void +vfs_write_suspend(mp) + struct mount *mp; +{ + struct thread *td = curthread; + + if (mp->mnt_kern_flag & MNTK_SUSPEND) + return; + mp->mnt_kern_flag |= MNTK_SUSPEND; + if (mp->mnt_writeopcount > 0) + (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0); + VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td); + mp->mnt_kern_flag |= MNTK_SUSPENDED; +} + +/* + * Request a filesystem to resume write operations. + */ +void +vfs_write_resume(mp) + struct mount *mp; +{ + + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) + return; + mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED); + wakeup(&mp->mnt_writeopcount); + wakeup(&mp->mnt_flag); +} + +/* + * Implement kqueues for files by translating it to vnode operation. 
+ */ +static int +vn_kqfilter(struct file *fp, struct knote *kn) +{ + + return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn)); +} + +/* + * Simplified in-kernel wrapper calls for extended attribute access. + * Both calls pass in a NULL credential, authorizing as "kernel" access. + * Set IO_NODELOCKED in ioflg if the vnode is already locked. + */ +int +vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int *buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + int error; + + iov.iov_len = *buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = *buflen; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* authorize attribute retrieval as kernel */ + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, + td); + + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0, td); + + if (error == 0) { + *buflen = *buflen - auio.uio_resid; + } + + return (error); +} + +/* + * XXX failure mode if partially written? + */ +int +vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + struct mount *mp; + int error; + + iov.iov_len = buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = buflen; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } + + /* authorize attribute setting as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + + return (error); +} + +int +vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, struct thread *td) +{ + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } + + /* authorize attribute removal as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + + return (error); +} diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src new file mode 100644 index 0000000..cdeb5e5 --- /dev/null +++ b/sys/kern/vnode_if.src @@ -0,0 +1,556 @@ +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. 
All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $FreeBSD$ +# + +# +# Above each of the vop descriptors is a specification of the locking +# protocol used by each vop call. The first column is the name of +# the variable, the remaining three columns are in, out and error +# respectively. The "in" column defines the lock state on input, +# the "out" column defines the state on succesful return, and the +# "error" column defines the locking state on error exit. +# +# The locking value can take the following values: +# L: locked; not converted to type of lock. +# A: any lock type. +# S: locked with shared lock. +# E: locked with exclusive lock for this process. +# O: locked with exclusive lock for other process. +# U: unlocked. +# -: not applicable. vnode does not yet (or no longer) exists. +# =: the same on input and output, may be either L or U. +# X: locked if not nil. +# + +# +#% islocked vp = = = +# +vop_islocked { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% lookup dvp L ? ? +#% lookup vpp - L - +# +# XXX - the lookup locking protocol defies simple description and depends +# on the flags and operation fields in the (cnp) structure. Note +# especially that *vpp may equal dvp and both may be locked. +# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% cachedlookup dvp L ? ? +#% cachedlookup vpp - L - +# +# This must be an exact copy of lookup. See kern/vfs_cache.c for details. 
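Each block in this file is a vop descriptor: the vnode_if generator turns it into an argument structure, a vnodeop_desc, and an inline VOP_*() wrapper in the generated vnode_if.h, and the wrapper packs its arguments and dispatches through the vnode's operations vector. Reconstructed from memory rather than copied from a generated header, the output for vop_access looks roughly like this:

	struct vop_access_args {
		struct vop_generic_args a_gen;
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	};
	extern struct vnodeop_desc vop_access_desc;

	static __inline int
	VOP_ACCESS(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
	{
		struct vop_access_args a;

		a.a_gen.a_desc = VDESC(vop_access);
		a.a_vp = vp;
		a.a_mode = mode;
		a.a_cred = cred;
		a.a_td = td;
		/* Dispatch through vp->v_op using the descriptor's offset. */
		return (VCALL(vp, VOFFSET(vop_access), &a));
	}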
+# +vop_cachedlookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% create dvp L L L +#% create vpp - L - +# +vop_create { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% whiteout dvp L L L +# +vop_whiteout { + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int flags; +}; + +# +#% mknod dvp L L L +#% mknod vpp - L - +# +vop_mknod { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% open vp L L L +# +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% close vp U U U +# +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% access vp L L L +# +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% getattr vp = = = +# +# XXX: This should be A A A +# +vop_getattr { + IN struct vnode *vp; + OUT struct vattr *vap; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setattr vp L L L +# +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% read vp L L L +# +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% write vp L L L +# +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% lease vp = = = +# +vop_lease { + IN struct vnode *vp; + IN struct thread *td; + IN struct ucred *cred; + IN int flag; +}; + +# +#% ioctl vp U U U +# +vop_ioctl { + IN struct vnode *vp; + IN u_long command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% poll vp U U U +# +vop_poll { + IN struct vnode *vp; + IN int events; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% kqfilter vp U U U +# +vop_kqfilter { + IN struct vnode *vp; + IN struct knote *kn; +}; + +# +#% revoke vp U U U +# +vop_revoke { + IN struct vnode *vp; + IN int flags; +}; + +# +#% fsync vp L L L +# +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct thread *td; +}; + +# +#% remove dvp L L L +#% remove vp L L L +# +vop_remove { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% link tdvp L L L +#% link vp U U U +# +vop_link { + IN struct vnode *tdvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +# +#% mkdir dvp L L L +#% mkdir vpp - L - +# +vop_mkdir { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% rmdir dvp L L L +#% rmdir vp L L L +# +vop_rmdir { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% symlink dvp L L L +#% symlink vpp - L - +# +vop_symlink { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +# +#% readdir vp L L L +# +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; + INOUT int 
*eofflag; + OUT int *ncookies; + INOUT u_long **cookies; +}; + +# +#% readlink vp L L L +# +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +# +#% inactive vp L U U +# +vop_inactive { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% reclaim vp U U U +# +vop_reclaim { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% lock vp ? ? ? +# +vop_lock { + IN struct vnode *vp; + IN int flags; + IN struct thread *td; +}; + +# +#% unlock vp L U L +# +vop_unlock { + IN struct vnode *vp; + IN int flags; + IN struct thread *td; +}; + +# +#% bmap vp L L L +#% bmap vpp - U - +# +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t *bnp; + OUT int *runp; + OUT int *runb; +}; + +# +#% strategy vp L L L +# +vop_strategy { + IN struct vnode *vp; + IN struct buf *bp; +}; + +# +#% getwritemount vp = = = +# +vop_getwritemount { + IN struct vnode *vp; + OUT struct mount **mpp; +}; + +# +#% print vp = = = +# +vop_print { + IN struct vnode *vp; +}; + +# +#% pathconf vp L L L +# +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT register_t *retval; +}; + +# +#% advlock vp U U U +# +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +# +#% reallocblks vp L L L +# +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +# +#% getpages vp L L L +# +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +# +#% putpages vp L L L +# +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + +# +#% freeblks vp - - - +# +# This call is used by the filesystem to release blocks back to +# device-driver. This is useful if the driver has a lengthy +# erase handling or similar. +# + +vop_freeblks { + IN struct vnode *vp; + IN daddr_t addr; + IN daddr_t length; +}; + +# +#% getacl vp L L L +# +vop_getacl { + IN struct vnode *vp; + IN acl_type_t type; + OUT struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setacl vp L L L +# +vop_setacl { + IN struct vnode *vp; + IN acl_type_t type; + IN struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% aclcheck vp = = = +# +vop_aclcheck { + IN struct vnode *vp; + IN acl_type_t type; + IN struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% getextattr vp L L L +# +vop_getextattr { + IN struct vnode *vp; + IN int attrnamespace; + IN const char *name; + INOUT struct uio *uio; + OUT size_t *size; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setextattr vp L L L +# +vop_setextattr { + IN struct vnode *vp; + IN int attrnamespace; + IN const char *name; + INOUT struct uio *uio; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% createvobject vp L L L +# +vop_createvobject { + IN struct vnode *vp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% destroyvobject vp L L L +# +vop_destroyvobject { + IN struct vnode *vp; +}; + +# +#% getvobject vp L L L +# +vop_getvobject { + IN struct vnode *vp; + OUT struct vm_object **objpp; +}; |
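The descriptors above are consumed by filesystems, which register a handler for each operation they implement and fall back to vop_defaultop for the rest. A sketch of the usual registration table for a hypothetical filesystem "foo"; the foo_* handlers are placeholders for a real filesystem's vop implementations, and only the shape of the table is the point.

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/vnode.h>

	/* Hypothetical handlers; a real filesystem supplies its own. */
	static int foo_lookup(struct vop_lookup_args *ap);
	static int foo_read(struct vop_read_args *ap);
	static int foo_write(struct vop_write_args *ap);
	static int foo_reclaim(struct vop_reclaim_args *ap);

	vop_t **foo_vnodeop_p;
	static struct vnodeopv_entry_desc foo_vnodeop_entries[] = {
		{ &vop_default_desc,	(vop_t *) vop_defaultop },	/* fallback */
		{ &vop_lookup_desc,	(vop_t *) foo_lookup },
		{ &vop_read_desc,	(vop_t *) foo_read },
		{ &vop_write_desc,	(vop_t *) foo_write },
		{ &vop_reclaim_desc,	(vop_t *) foo_reclaim },
		{ NULL, NULL }
	};
	static struct vnodeopv_desc foo_vnodeop_opv_desc =
		{ &foo_vnodeop_p, foo_vnodeop_entries };
	VNODEOP_SET(foo_vnodeop_opv_desc);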